intel/elk: Fork Gfx8- compiler by copying existing code

Based on code from commit c3ceec6cd8.

Acked-by: Ian Romanick <ian.d.romanick@intel.com>
Acked-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27563>
Author: Caio Oliveira, 2024-01-19 11:32:57 -08:00 (committed by Marge Bot)
parent a9214460ee
commit d44462c08d
777 changed files with 151345 additions and 0 deletions


@@ -0,0 +1,122 @@
/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#ifndef BRW_ASM_H
#define BRW_ASM_H
#include <inttypes.h>
#include <stdbool.h>
#include <assert.h>
#include "compiler/brw_reg.h"
#include "compiler/brw_reg_type.h"
#include "compiler/brw_eu_defines.h"
#include "compiler/brw_inst.h"
#include "compiler/brw_eu.h"
#include "dev/intel_device_info.h"
#include "util/list.h"
/* glibc < 2.27 defines OVERFLOW in /usr/include/math.h. */
#undef OVERFLOW
int yyparse(void);
int yylex(void);
char *lex_text(void);
extern struct brw_codegen *p;
extern int errors;
extern char *input_filename;
extern struct list_head instr_labels;
extern struct list_head target_labels;
struct condition {
unsigned cond_modifier:4;
unsigned flag_reg_nr:1;
unsigned flag_subreg_nr:1;
};
struct predicate {
unsigned pred_control:4;
unsigned pred_inv:1;
unsigned flag_reg_nr:1;
unsigned flag_subreg_nr:1;
};
enum instoption_type {
INSTOPTION_FLAG,
INSTOPTION_DEP_INFO,
};
struct instoption {
enum instoption_type type;
union {
unsigned uint_value;
struct tgl_swsb depinfo_value;
};
};
struct options {
unsigned access_mode:1;
unsigned compression_control:2;
unsigned thread_control:2;
unsigned no_dd_check:1; /* Dependency control */
unsigned no_dd_clear:1; /* Dependency control */
unsigned mask_control:1;
unsigned debug_control:1;
unsigned acc_wr_control:1;
unsigned end_of_thread:1;
unsigned compaction:1;
unsigned qtr_ctrl:2;
unsigned nib_ctrl:1;
unsigned is_compr:1;
struct tgl_swsb depinfo;
};
struct msgdesc {
unsigned ex_bso:1;
unsigned src1_len:5;
};
enum instr_label_type {
INSTR_LABEL_JIP,
INSTR_LABEL_UIP,
};
struct instr_label {
struct list_head link;
char *name;
int offset;
enum instr_label_type type;
};
struct target_label {
struct list_head link;
char *name;
int offset;
};
#endif /* BRW_ASM_H */


@@ -0,0 +1,385 @@
/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <stdio.h>
#include <stdlib.h> /* exit(), free(), EXIT_* */
#include <string.h> /* memcpy(), strcmp(), strdup() */
#include <getopt.h>
#include "brw_asm.h"
#include "intel/compiler/brw_disasm_info.h"
enum opt_output_type {
OPT_OUTPUT_HEX,
OPT_OUTPUT_C_LITERAL,
OPT_OUTPUT_BIN,
};
extern FILE *yyin;
struct brw_codegen *p;
static enum opt_output_type output_type = OPT_OUTPUT_BIN;
char *input_filename = NULL;
int errors;
struct list_head instr_labels;
struct list_head target_labels;
static void
print_help(const char *progname, FILE *file)
{
fprintf(file,
"Usage: %s [OPTION] inputfile\n"
"Assemble i965 instructions from input file.\n\n"
" -h, --help display this help and exit\n"
" -t, --type=OUTPUT_TYPE OUTPUT_TYPE can be 'bin' (default if omitted),\n"
" 'c_literal', or 'hex'\n"
" -o, --output specify output file\n"
" --compact print compacted instructions\n"
" -g, --gen=platform assemble instructions for given \n"
" platform (3 letter platform name)\n"
"Example:\n"
" i965_asm -g kbl input.asm -t hex -o output\n",
progname);
}
static uint32_t
get_dword(const brw_inst *inst, int idx)
{
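/* memcpy is the portable way to extract a dword from the raw encoded
* instruction without type-punning the underlying bytes. */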
uint32_t dword;
memcpy(&dword, (char *)inst + 4 * idx, sizeof(dword));
return dword;
}
static void
print_instruction(FILE *output, bool compact, const brw_inst *instruction)
{
const unsigned byte_limit = compact ? 8 : 16;
switch (output_type) {
case OPT_OUTPUT_HEX: {
fprintf(output, "%02x", ((unsigned char *)instruction)[0]);
for (unsigned i = 1; i < byte_limit; i++) {
fprintf(output, " %02x", ((unsigned char *)instruction)[i]);
}
break;
}
case OPT_OUTPUT_C_LITERAL: {
fprintf(output, "\t0x%08x,", get_dword(instruction, 0));
for (unsigned i = 1; i < byte_limit / 4; i++)
fprintf(output, " 0x%08x,", get_dword(instruction, i));
break;
}
case OPT_OUTPUT_BIN:
fwrite(instruction, 1, byte_limit, output);
break;
}
if (output_type != OPT_OUTPUT_BIN) {
fprintf(output, "\n");
}
}
static struct intel_device_info *
i965_disasm_init(uint16_t pci_id)
{
struct intel_device_info *devinfo;
devinfo = malloc(sizeof *devinfo);
if (devinfo == NULL)
return NULL;
if (!intel_get_device_info_from_pci_id(pci_id, devinfo)) {
fprintf(stderr, "can't find device information: pci_id=0x%x\n",
pci_id);
free(devinfo);
return NULL;
}
return devinfo;
}
static bool
i965_postprocess_labels(void)
{
if (p->devinfo->ver < 6) {
return true;
}
void *store = p->store;
struct target_label *tlabel;
struct instr_label *ilabel, *s;
const unsigned to_bytes_scale = brw_jump_scale(p->devinfo);
LIST_FOR_EACH_ENTRY(tlabel, &target_labels, link) {
LIST_FOR_EACH_ENTRY_SAFE(ilabel, s, &instr_labels, link) {
if (!strcmp(tlabel->name, ilabel->name)) {
brw_inst *inst = store + ilabel->offset;
int relative_offset = (tlabel->offset - ilabel->offset) / sizeof(brw_inst);
relative_offset *= to_bytes_scale;
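/* Note: brw_jump_scale() converts instruction counts into the units the
* jump fields expect: 1 (whole instructions) on Gfx4, 2 (64-bit chunks)
* on Gfx5-7, and 16 (bytes) on Gfx8+. */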
unsigned opcode = brw_inst_opcode(p->isa, inst);
if (ilabel->type == INSTR_LABEL_JIP) {
switch (opcode) {
case BRW_OPCODE_IF:
case BRW_OPCODE_ELSE:
case BRW_OPCODE_ENDIF:
case BRW_OPCODE_WHILE:
if (p->devinfo->ver >= 7) {
brw_inst_set_jip(p->devinfo, inst, relative_offset);
} else if (p->devinfo->ver == 6) {
brw_inst_set_gfx6_jump_count(p->devinfo, inst, relative_offset);
}
break;
case BRW_OPCODE_BREAK:
case BRW_OPCODE_HALT:
case BRW_OPCODE_CONTINUE:
brw_inst_set_jip(p->devinfo, inst, relative_offset);
break;
default:
fprintf(stderr, "Unknown opcode %d with JIP label\n", opcode);
return false;
}
} else {
switch (opcode) {
case BRW_OPCODE_IF:
case BRW_OPCODE_ELSE:
if (p->devinfo->ver >= 7) {
brw_inst_set_uip(p->devinfo, inst, relative_offset);
}
/* Gfx6 IF/ELSE have no UIP field; nothing to set. */
break;
case BRW_OPCODE_WHILE:
case BRW_OPCODE_ENDIF:
fprintf(stderr, "WHILE/ENDIF cannot have UIP offset\n");
return false;
case BRW_OPCODE_BREAK:
case BRW_OPCODE_CONTINUE:
case BRW_OPCODE_HALT:
brw_inst_set_uip(p->devinfo, inst, relative_offset);
break;
default:
fprintf(stderr, "Unknown opcode %d with UIP label\n", opcode);
return false;
}
}
list_del(&ilabel->link);
}
}
}
LIST_FOR_EACH_ENTRY(ilabel, &instr_labels, link) {
fprintf(stderr, "Unknown label '%s'\n", ilabel->name);
}
return list_is_empty(&instr_labels);
}
int main(int argc, char **argv)
{
char *output_file = NULL;
int c; /* getopt_long() returns an int; char breaks the -1 check where char is unsigned */
FILE *output = stdout;
int help = false, compact = false; /* int, not bool: getopt_long() stores an int through the flag pointers */
void *store;
uint64_t pci_id = 0;
int offset = 0, err;
int start_offset = 0;
struct disasm_info *disasm_info;
struct intel_device_info *devinfo = NULL;
int result = EXIT_FAILURE;
list_inithead(&instr_labels);
list_inithead(&target_labels);
const struct option i965_asm_opts[] = {
{ "help", no_argument, (int *) &help, true },
{ "type", required_argument, NULL, 't' },
{ "gen", required_argument, NULL, 'g' },
{ "output", required_argument, NULL, 'o' },
{ "compact", no_argument, (int *) &compact, true },
{ NULL, 0, NULL, 0 }
};
while ((c = getopt_long(argc, argv, ":t:g:o:h", i965_asm_opts, NULL)) != -1) {
switch (c) {
case 'g': {
const int id = intel_device_name_to_pci_device_id(optarg);
if (id < 0) {
fprintf(stderr, "can't parse gen: '%s', expected 3 letter "
"platform name\n", optarg);
goto end;
} else {
pci_id = id;
}
break;
}
case 'h':
help = true;
print_help(argv[0], stderr);
goto end;
case 't': {
if (strcmp(optarg, "hex") == 0) {
output_type = OPT_OUTPUT_HEX;
} else if (strcmp(optarg, "c_literal") == 0) {
output_type = OPT_OUTPUT_C_LITERAL;
} else if (strcmp(optarg, "bin") == 0) {
output_type = OPT_OUTPUT_BIN;
} else {
fprintf(stderr, "invalid value for --type: %s\n", optarg);
goto end;
}
break;
}
case 'o':
output_file = strdup(optarg);
break;
case 0:
break;
case ':':
fprintf(stderr, "%s: option `-%c' requires an argument\n",
argv[0], optopt);
goto end;
case '?':
default:
fprintf(stderr, "%s: option `-%c' is invalid: ignored\n",
argv[0], optopt);
goto end;
}
}
if (help || !pci_id) {
print_help(argv[0], stderr);
goto end;
}
if (!argv[optind]) {
fprintf(stderr, "Please specify input file\n");
goto end;
}
input_filename = strdup(argv[optind]);
yyin = fopen(input_filename, "r");
if (!yyin) {
fprintf(stderr, "Unable to read input file : %s\n",
input_filename);
goto end;
}
if (output_file) {
output = fopen(output_file, "w");
if (!output) {
fprintf(stderr, "Couldn't open output file\n");
goto end;
}
}
devinfo = i965_disasm_init(pci_id);
if (!devinfo) {
fprintf(stderr, "Unable to allocate memory for "
"intel_device_info struct instance.\n");
goto end;
}
struct brw_isa_info isa;
brw_init_isa_info(&isa, devinfo);
p = rzalloc(NULL, struct brw_codegen);
brw_init_codegen(&isa, p, p);
p->automatic_exec_sizes = false;
err = yyparse();
if (err || errors)
goto end;
if (!i965_postprocess_labels())
goto end;
store = p->store;
disasm_info = disasm_initialize(p->isa, NULL);
if (!disasm_info) {
fprintf(stderr, "Unable to initialize disasm_info struct instance\n");
goto end;
}
if (output_type == OPT_OUTPUT_C_LITERAL)
fprintf(output, "{\n");
brw_validate_instructions(p->isa, p->store, 0,
p->next_insn_offset, disasm_info);
const int nr_insn = (p->next_insn_offset - start_offset) / 16;
if (compact)
brw_compact_instructions(p, start_offset, disasm_info);
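/* Walk the store instruction by instruction: after compaction a compacted
* instruction occupies 8 bytes and a full-size one 16, so the offset
* advances accordingly. */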
for (int i = 0; i < nr_insn; i++) {
const brw_inst *insn = store + offset;
bool compacted = false;
if (compact && brw_inst_cmpt_control(p->devinfo, insn)) {
offset += 8;
compacted = true;
} else {
offset += 16;
}
print_instruction(output, compacted, insn);
}
ralloc_free(disasm_info);
if (output_type == OPT_OUTPUT_C_LITERAL)
fprintf(output, "}");
result = EXIT_SUCCESS;
end:
free(input_filename);
free(output_file);
if (yyin)
fclose(yyin);
if (output)
fclose(output);
if (p)
ralloc_free(p);
if (devinfo)
free(devinfo);
exit(result);
}


@@ -0,0 +1,833 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#include "brw_cfg.h"
#include "util/u_dynarray.h"
#include "brw_shader.h"
/** @file brw_cfg.cpp
*
* Walks the generated shader instructions and creates a set of basic
* blocks with successor/predecessor edges connecting them.
*/
using namespace brw;
static bblock_t *
pop_stack(exec_list *list)
{
bblock_link *link = (bblock_link *)list->get_tail();
bblock_t *block = link->block;
link->link.remove();
return block;
}
static exec_node *
link(void *mem_ctx, bblock_t *block, enum bblock_link_kind kind)
{
bblock_link *l = new(mem_ctx) bblock_link(block, kind);
return &l->link;
}
static void
push_stack(exec_list *list, void *mem_ctx, bblock_t *block)
{
/* The kind of the link is immaterial, but we need to provide one since
* this is (ab)using the edge data structure in order to implement a stack.
*/
list->push_tail(link(mem_ctx, block, bblock_link_logical));
}
bblock_t::bblock_t(cfg_t *cfg) :
cfg(cfg), start_ip(0), end_ip(0), end_ip_delta(0), num(0)
{
instructions.make_empty();
parents.make_empty();
children.make_empty();
}
void
bblock_t::add_successor(void *mem_ctx, bblock_t *successor,
enum bblock_link_kind kind)
{
successor->parents.push_tail(::link(mem_ctx, this, kind));
children.push_tail(::link(mem_ctx, successor, kind));
}
bool
bblock_t::is_predecessor_of(const bblock_t *block,
enum bblock_link_kind kind) const
{
foreach_list_typed_safe (bblock_link, parent, link, &block->parents) {
if (parent->block == this && parent->kind <= kind) {
return true;
}
}
return false;
}
bool
bblock_t::is_successor_of(const bblock_t *block,
enum bblock_link_kind kind) const
{
foreach_list_typed_safe (bblock_link, child, link, &block->children) {
if (child->block == this && child->kind <= kind) {
return true;
}
}
return false;
}
static bool
ends_block(const backend_instruction *inst)
{
enum opcode op = inst->opcode;
return op == BRW_OPCODE_IF ||
op == BRW_OPCODE_ELSE ||
op == BRW_OPCODE_CONTINUE ||
op == BRW_OPCODE_BREAK ||
op == BRW_OPCODE_DO ||
op == BRW_OPCODE_WHILE;
}
static bool
starts_block(const backend_instruction *inst)
{
enum opcode op = inst->opcode;
return op == BRW_OPCODE_DO ||
op == BRW_OPCODE_ENDIF;
}
bool
bblock_t::can_combine_with(const bblock_t *that) const
{
if ((const bblock_t *)this->link.next != that)
return false;
if (ends_block(this->end()) ||
starts_block(that->start()))
return false;
return true;
}
void
bblock_t::combine_with(bblock_t *that)
{
assert(this->can_combine_with(that));
foreach_list_typed (bblock_link, link, link, &that->parents) {
assert(link->block == this);
}
this->end_ip = that->end_ip;
this->instructions.append_list(&that->instructions);
this->cfg->remove_block(that);
}
void
bblock_t::dump(FILE *file) const
{
const backend_shader *s = this->cfg->s;
int ip = this->start_ip;
foreach_inst_in_block(backend_instruction, inst, this) {
fprintf(file, "%5d: ", ip);
s->dump_instruction(inst, file);
ip++;
}
}
void
bblock_t::unlink_list(exec_list *list)
{
assert(list == &parents || list == &children);
const bool remove_parent = list == &children;
foreach_list_typed_safe(bblock_link, link, link, list) {
/* Also break the links from the other block back to this block. */
exec_list *sub_list = remove_parent ? &link->block->parents : &link->block->children;
foreach_list_typed_safe(bblock_link, sub_link, link, sub_list) {
if (sub_link->block == this) {
sub_link->link.remove();
ralloc_free(sub_link);
}
}
link->link.remove();
ralloc_free(link);
}
}
cfg_t::cfg_t(const backend_shader *s, exec_list *instructions) :
s(s)
{
mem_ctx = ralloc_context(NULL);
block_list.make_empty();
blocks = NULL;
num_blocks = 0;
bblock_t *cur = NULL;
int ip = 0;
bblock_t *entry = new_block();
bblock_t *cur_if = NULL; /**< BB ending with IF. */
bblock_t *cur_else = NULL; /**< BB ending with ELSE. */
bblock_t *cur_do = NULL; /**< BB starting with DO. */
bblock_t *cur_while = NULL; /**< BB immediately following WHILE. */
exec_list if_stack, else_stack, do_stack, while_stack;
bblock_t *next;
set_next_block(&cur, entry, ip);
foreach_in_list_safe(backend_instruction, inst, instructions) {
/* set_next_block wants the post-incremented ip */
ip++;
inst->exec_node::remove();
switch (inst->opcode) {
case BRW_OPCODE_IF:
cur->instructions.push_tail(inst);
/* Push our information onto a stack so we can recover from
* nested ifs.
*/
push_stack(&if_stack, mem_ctx, cur_if);
push_stack(&else_stack, mem_ctx, cur_else);
cur_if = cur;
cur_else = NULL;
/* Set up our immediately following block, full of "then"
* instructions.
*/
next = new_block();
cur_if->add_successor(mem_ctx, next, bblock_link_logical);
set_next_block(&cur, next, ip);
break;
case BRW_OPCODE_ELSE:
cur->instructions.push_tail(inst);
cur_else = cur;
next = new_block();
assert(cur_if != NULL);
cur_if->add_successor(mem_ctx, next, bblock_link_logical);
cur_else->add_successor(mem_ctx, next, bblock_link_physical);
set_next_block(&cur, next, ip);
break;
case BRW_OPCODE_ENDIF: {
bblock_t *cur_endif;
if (cur->instructions.is_empty()) {
/* New block was just created; use it. */
cur_endif = cur;
} else {
cur_endif = new_block();
cur->add_successor(mem_ctx, cur_endif, bblock_link_logical);
set_next_block(&cur, cur_endif, ip - 1);
}
cur->instructions.push_tail(inst);
if (cur_else) {
cur_else->add_successor(mem_ctx, cur_endif, bblock_link_logical);
} else {
assert(cur_if != NULL);
cur_if->add_successor(mem_ctx, cur_endif, bblock_link_logical);
}
assert(cur_if->end()->opcode == BRW_OPCODE_IF);
assert(!cur_else || cur_else->end()->opcode == BRW_OPCODE_ELSE);
/* Pop the stack so we're in the previous if/else/endif */
cur_if = pop_stack(&if_stack);
cur_else = pop_stack(&else_stack);
break;
}
case BRW_OPCODE_DO:
/* Push our information onto a stack so we can recover from
* nested loops.
*/
push_stack(&do_stack, mem_ctx, cur_do);
push_stack(&while_stack, mem_ctx, cur_while);
/* Set up the block just after the while. Don't know when exactly
* it will start, yet.
*/
cur_while = new_block();
if (cur->instructions.is_empty()) {
/* New block was just created; use it. */
cur_do = cur;
} else {
cur_do = new_block();
cur->add_successor(mem_ctx, cur_do, bblock_link_logical);
set_next_block(&cur, cur_do, ip - 1);
}
cur->instructions.push_tail(inst);
/* Represent divergent execution of the loop as a pair of alternative
* edges coming out of the DO instruction: For any physical iteration
* of the loop a given logical thread can either start off enabled
* (which is represented as the "next" successor), or disabled (if it
* has reached a non-uniform exit of the loop during a previous
* iteration, which is represented as the "cur_while" successor).
*
* The disabled edge will be taken by the logical thread anytime we
* arrive at the DO instruction through a back-edge coming from a
* conditional exit of the loop where divergent control flow started.
*
* This guarantees that there is a control-flow path from any
* divergence point of the loop into the convergence point
* (immediately past the WHILE instruction) such that it overlaps the
* whole IP region of divergent control flow (potentially the whole
* loop) *and* doesn't imply the execution of any instructions part
* of the loop (since the corresponding execution mask bit will be
* disabled for a diverging thread).
*
* This way we make sure that any variables that are live throughout
* the region of divergence for an inactive logical thread are also
* considered to interfere with any other variables assigned by
* active logical threads within the same physical region of the
* program, since otherwise we would risk cross-channel data
* corruption.
*/
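/* Illustrative sketch of the edges created just below:
*
*    [block ending in DO] --logical--> [first block of the loop body]
*                     \
*                      `--physical--> [block following the WHILE]
*/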
next = new_block();
cur->add_successor(mem_ctx, next, bblock_link_logical);
cur->add_successor(mem_ctx, cur_while, bblock_link_physical);
set_next_block(&cur, next, ip);
break;
case BRW_OPCODE_CONTINUE:
cur->instructions.push_tail(inst);
/* A conditional CONTINUE may start a region of divergent control
* flow until the start of the next loop iteration (*not* until the
* end of the loop which is why the successor is not the top-level
* divergence point at cur_do). The live interval of any variable
* extending through a CONTINUE edge is guaranteed to overlap the
* whole region of divergent execution, because any variable live-out
* at the CONTINUE instruction will also be live-in at the top of the
* loop, and therefore also live-out at the bottom-most point of the
* loop which is reachable from the top (since a control flow path
* exists from a definition of the variable through this CONTINUE
* instruction, the top of the loop, the (reachable) bottom of the
* loop, the top of the loop again, into a use of the variable).
*/
assert(cur_do != NULL);
cur->add_successor(mem_ctx, cur_do->next(), bblock_link_logical);
next = new_block();
if (inst->predicate)
cur->add_successor(mem_ctx, next, bblock_link_logical);
else
cur->add_successor(mem_ctx, next, bblock_link_physical);
set_next_block(&cur, next, ip);
break;
case BRW_OPCODE_BREAK:
cur->instructions.push_tail(inst);
/* A conditional BREAK instruction may start a region of divergent
* control flow until the end of the loop if the condition is
* non-uniform, in which case the loop will execute additional
* iterations with the present channel disabled. We model this as a
* control flow path from the divergence point to the convergence
* point that overlaps the whole IP range of the loop and skips over
* the execution of any other instructions part of the loop.
*
* See the DO case for additional explanation.
*/
assert(cur_do != NULL);
cur->add_successor(mem_ctx, cur_do, bblock_link_physical);
cur->add_successor(mem_ctx, cur_while, bblock_link_logical);
next = new_block();
if (inst->predicate)
cur->add_successor(mem_ctx, next, bblock_link_logical);
else
cur->add_successor(mem_ctx, next, bblock_link_physical);
set_next_block(&cur, next, ip);
break;
case BRW_OPCODE_WHILE:
cur->instructions.push_tail(inst);
assert(cur_do != NULL && cur_while != NULL);
/* A conditional WHILE instruction may start a region of divergent
* control flow until the end of the loop, just like the BREAK
* instruction. See the BREAK case for more details. OTOH an
* unconditional WHILE instruction is non-divergent (just like an
* unconditional CONTINUE), and will necessarily lead to the
* execution of an additional iteration of the loop for all enabled
* channels, so we may skip over the divergence point at the top of
* the loop to keep the CFG as unambiguous as possible.
*/
if (inst->predicate) {
cur->add_successor(mem_ctx, cur_do, bblock_link_logical);
} else {
cur->add_successor(mem_ctx, cur_do->next(), bblock_link_logical);
}
set_next_block(&cur, cur_while, ip);
/* Pop the stack so we're in the previous loop */
cur_do = pop_stack(&do_stack);
cur_while = pop_stack(&while_stack);
break;
default:
cur->instructions.push_tail(inst);
break;
}
}
cur->end_ip = ip - 1;
make_block_array();
}
cfg_t::~cfg_t()
{
ralloc_free(mem_ctx);
}
void
cfg_t::remove_block(bblock_t *block)
{
foreach_list_typed_safe (bblock_link, predecessor, link, &block->parents) {
/* cfg_t::validate checks that predecessor and successor lists are well
* formed, so it is known that the loop here would find exactly one
* block. Set old_link_kind to silence "variable used but not set"
* warnings.
*/
bblock_link_kind old_link_kind = bblock_link_logical;
/* Remove block from all of its predecessors' successor lists. */
foreach_list_typed_safe (bblock_link, successor, link,
&predecessor->block->children) {
if (block == successor->block) {
old_link_kind = successor->kind;
successor->link.remove();
ralloc_free(successor);
break;
}
}
/* Add removed-block's successors to its predecessors' successor lists. */
foreach_list_typed (bblock_link, successor, link, &block->children) {
bool need_to_link = true;
bblock_link_kind new_link_kind = MAX2(old_link_kind, successor->kind);
foreach_list_typed_safe (bblock_link, child, link, &predecessor->block->children) {
/* There is already a link between the two blocks. If the links
* are the same kind or the link is logical, do nothing. If the
* existing link is physical and the proposed new link is logical,
* promote the existing link to logical.
*
* This is accomplished by taking the minimum of the existing link
* kind and the proposed link kind.
*/
if (child->block == successor->block) {
child->kind = MIN2(child->kind, new_link_kind);
need_to_link = false;
break;
}
}
if (need_to_link) {
predecessor->block->children.push_tail(link(mem_ctx,
successor->block,
new_link_kind));
}
}
}
foreach_list_typed_safe (bblock_link, successor, link, &block->children) {
/* cfg_t::validate checks that predecessor and successor lists are well
* formed, so it is known that the loop here would find exactly one
* block. Set old_link_kind to silence "variable used but not set"
* warnings.
*/
bblock_link_kind old_link_kind = bblock_link_logical;
/* Remove block from all of its children's parent lists. */
foreach_list_typed_safe (bblock_link, predecessor, link,
&successor->block->parents) {
if (block == predecessor->block) {
old_link_kind = predecessor->kind;
predecessor->link.remove();
ralloc_free(predecessor);
}
}
/* Add removed-block's predecessors to its successors' predecessor lists. */
foreach_list_typed (bblock_link, predecessor, link, &block->parents) {
bool need_to_link = true;
bblock_link_kind new_link_kind = MAX2(old_link_kind, predecessor->kind);
foreach_list_typed_safe (bblock_link, parent, link, &successor->block->parents) {
/* There is already a link between the two blocks. If the links
* are the same kind or the link is logical, do nothing. If the
* existing link is physical and the proposed new link is logical,
* promote the existing link to logical.
*
* This is accomplished by taking the minimum of the existing link
* kind and the proposed link kind.
*/
if (parent->block == predecessor->block) {
parent->kind = MIN2(parent->kind, new_link_kind);
need_to_link = false;
break;
}
}
if (need_to_link) {
successor->block->parents.push_tail(link(mem_ctx,
predecessor->block,
new_link_kind));
}
}
}
block->link.remove();
for (int b = block->num; b < this->num_blocks - 1; b++) {
this->blocks[b] = this->blocks[b + 1];
this->blocks[b]->num = b;
}
this->blocks[this->num_blocks - 1]->num = this->num_blocks - 2;
this->num_blocks--;
}
bblock_t *
cfg_t::new_block()
{
bblock_t *block = new(mem_ctx) bblock_t(this);
return block;
}
void
cfg_t::set_next_block(bblock_t **cur, bblock_t *block, int ip)
{
if (*cur) {
(*cur)->end_ip = ip - 1;
}
block->start_ip = ip;
block->num = num_blocks++;
block_list.push_tail(&block->link);
*cur = block;
}
void
cfg_t::make_block_array()
{
blocks = ralloc_array(mem_ctx, bblock_t *, num_blocks);
int i = 0;
foreach_block (block, this) {
blocks[i++] = block;
}
assert(i == num_blocks);
}
namespace {
struct link_desc {
char kind;
int num;
};
int
compare_link_desc(const void *a, const void *b)
{
const link_desc *la = (const link_desc *)a;
const link_desc *lb = (const link_desc *)b;
return la->num < lb->num ? -1 :
la->num > lb->num ? +1 :
la->kind < lb->kind ? -1 :
la->kind > lb->kind ? +1 :
0;
}
void
sort_links(util_dynarray *scratch, exec_list *list)
{
util_dynarray_clear(scratch);
foreach_list_typed(bblock_link, link, link, list) {
link_desc l;
l.kind = link->kind == bblock_link_logical ? '-' : '~';
l.num = link->block->num;
util_dynarray_append(scratch, link_desc, l);
}
qsort(scratch->data, util_dynarray_num_elements(scratch, link_desc),
sizeof(link_desc), compare_link_desc);
}
} /* namespace */
void
cfg_t::dump(FILE *file)
{
const idom_tree *idom = (s ? &s->idom_analysis.require() : NULL);
/* Temporary storage to sort the lists of blocks. This normalizes the
* output, making it possible to use it for certain tests.
*/
util_dynarray scratch;
util_dynarray_init(&scratch, NULL);
foreach_block (block, this) {
if (idom && idom->parent(block))
fprintf(file, "START B%d IDOM(B%d)", block->num,
idom->parent(block)->num);
else
fprintf(file, "START B%d IDOM(none)", block->num);
sort_links(&scratch, &block->parents);
util_dynarray_foreach(&scratch, link_desc, l)
fprintf(file, " <%cB%d", l->kind, l->num);
fprintf(file, "\n");
if (s != NULL)
block->dump(file);
fprintf(file, "END B%d", block->num);
sort_links(&scratch, &block->children);
util_dynarray_foreach(&scratch, link_desc, l)
fprintf(file, " %c>B%d", l->kind, l->num);
fprintf(file, "\n");
}
util_dynarray_fini(&scratch);
}
/* Calculates the immediate dominator of each block, according to "A Simple,
* Fast Dominance Algorithm" by Keith D. Cooper, Timothy J. Harvey, and Ken
* Kennedy.
*
* The authors claim that for control flow graphs of sizes normally encountered
* (less than 1000 nodes) that this algorithm is significantly faster than
* others like Lengauer-Tarjan.
*/
idom_tree::idom_tree(const backend_shader *s) :
num_parents(s->cfg->num_blocks),
parents(new bblock_t *[num_parents]())
{
bool changed;
parents[0] = s->cfg->blocks[0];
do {
changed = false;
foreach_block(block, s->cfg) {
if (block->num == 0)
continue;
bblock_t *new_idom = NULL;
foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
if (parent(parent_link->block)) {
new_idom = (new_idom ? intersect(new_idom, parent_link->block) :
parent_link->block);
}
}
if (parent(block) != new_idom) {
parents[block->num] = new_idom;
changed = true;
}
}
} while (changed);
}
idom_tree::~idom_tree()
{
delete[] parents;
}
bblock_t *
idom_tree::intersect(bblock_t *b1, bblock_t *b2) const
{
/* Note, the comparisons here are the opposite of what the paper says
* because we index blocks from beginning -> end (i.e. reverse post-order)
* instead of post-order like they assume.
*/
while (b1->num != b2->num) {
while (b1->num > b2->num)
b1 = parent(b1);
while (b2->num > b1->num)
b2 = parent(b2);
}
assert(b1);
return b1;
}
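/* Worked example (hypothetical block numbering): with parent(B7) == B4,
* parent(B4) == B2 and parent(B5) == B2, intersect(B7, B5) walks
* B7 -> B4 -> B2 on one side and B5 -> B2 on the other, returning B2,
* the closest common dominator.
*/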
void
idom_tree::dump() const
{
printf("digraph DominanceTree {\n");
for (unsigned i = 0; i < num_parents; i++)
printf("\t%d -> %d\n", parents[i]->num, i);
printf("}\n");
}
void
cfg_t::dump_cfg()
{
printf("digraph CFG {\n");
for (int b = 0; b < num_blocks; b++) {
bblock_t *block = this->blocks[b];
foreach_list_typed_safe (bblock_link, child, link, &block->children) {
printf("\t%d -> %d\n", b, child->block->num);
}
}
printf("}\n");
}
#define cfgv_assert(assertion) \
do { \
if (!(assertion)) { \
fprintf(stderr, "ASSERT: CFG validation in %s failed!\n", stage_abbrev); \
fprintf(stderr, "%s:%d: '%s' failed\n", __FILE__, __LINE__, #assertion); \
abort(); \
} \
} while (0)
#ifndef NDEBUG
void
cfg_t::validate(const char *stage_abbrev)
{
foreach_block(block, this) {
foreach_list_typed(bblock_link, successor, link, &block->children) {
/* Each successor of a block must have one predecessor link back to
* the block.
*/
bool successor_links_back_to_predecessor = false;
bblock_t *succ_block = successor->block;
foreach_list_typed(bblock_link, predecessor, link, &succ_block->parents) {
if (predecessor->block == block) {
cfgv_assert(!successor_links_back_to_predecessor);
cfgv_assert(successor->kind == predecessor->kind);
successor_links_back_to_predecessor = true;
}
}
cfgv_assert(successor_links_back_to_predecessor);
/* Each successor block must appear only once in the list of
* successors.
*/
foreach_list_typed_from(bblock_link, later_successor, link,
&block->children, successor->link.next) {
cfgv_assert(successor->block != later_successor->block);
}
}
foreach_list_typed(bblock_link, predecessor, link, &block->parents) {
/* Each predecessor of a block must have one successor link back to
* the block.
*/
bool predecessor_links_back_to_successor = false;
bblock_t *pred_block = predecessor->block;
foreach_list_typed(bblock_link, successor, link, &pred_block->children) {
if (successor->block == block) {
cfgv_assert(!predecessor_links_back_to_successor);
cfgv_assert(successor->kind == predecessor->kind);
predecessor_links_back_to_successor = true;
}
}
cfgv_assert(predecessor_links_back_to_successor);
/* Each predecessor block must appear only once in the list of
* predecessors.
*/
foreach_list_typed_from(bblock_link, later_predecessor, link,
&block->parents, predecessor->link.next) {
cfgv_assert(predecessor->block != later_predecessor->block);
}
}
backend_instruction *first_inst = block->start();
if (first_inst->opcode == BRW_OPCODE_DO) {
/* DO instructions both begin and end a block, so the DO instruction
* must be the only instruction in the block.
*/
cfgv_assert(exec_list_is_singular(&block->instructions));
/* A block starting with DO should have exactly two successors. One
* is a physical link to the block starting after the WHILE
* instruction. The other is a logical link to the block starting the
* body of the loop.
*/
bblock_t *physical_block = nullptr;
bblock_t *logical_block = nullptr;
foreach_list_typed(bblock_link, child, link, &block->children) {
if (child->kind == bblock_link_physical) {
cfgv_assert(physical_block == nullptr);
physical_block = child->block;
} else {
cfgv_assert(logical_block == nullptr);
logical_block = child->block;
}
}
cfgv_assert(logical_block != nullptr);
cfgv_assert(physical_block != nullptr);
}
}
}
#endif


@@ -0,0 +1,532 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#ifndef BRW_CFG_H
#define BRW_CFG_H
#include "brw_ir.h"
#ifdef __cplusplus
#include "brw_ir_analysis.h"
#endif
struct bblock_t;
/**
* CFG edge types.
*
* A logical edge represents a potential control flow path of the original
* scalar program, while a physical edge represents a control flow path that
* may not have existed in the original program but was introduced during
* vectorization in order to implement divergent control flow of different
* shader invocations within the same SIMD thread.
*
* All logical edges in the CFG are considered to be physical edges but not
* the other way around -- I.e. the logical CFG is a subset of the physical
* one.
*/
enum bblock_link_kind {
bblock_link_logical = 0,
bblock_link_physical
};
struct bblock_link {
#ifdef __cplusplus
DECLARE_RALLOC_CXX_OPERATORS(bblock_link)
bblock_link(bblock_t *block, enum bblock_link_kind kind)
: block(block), kind(kind)
{
}
#endif
struct exec_node link;
struct bblock_t *block;
/* Type of this CFG edge. Because bblock_link_logical also implies
* bblock_link_physical, the proper way to test for membership of edge 'l'
* in CFG kind 'k' is 'l.kind <= k'.
*/
enum bblock_link_kind kind;
};
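/* A minimal helper sketch (hypothetical, not part of the original
* interface) of the membership test described above: a logical edge also
* belongs to the physical CFG, so testing edge 'l' against CFG kind 'k'
* reduces to a comparison.
*/
static inline bool
bblock_link_is_in_cfg_kind(const struct bblock_link *l,
enum bblock_link_kind k)
{
return l->kind <= k;
}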
struct backend_shader;
struct cfg_t;
struct bblock_t {
#ifdef __cplusplus
DECLARE_RALLOC_CXX_OPERATORS(bblock_t)
explicit bblock_t(cfg_t *cfg);
void add_successor(void *mem_ctx, bblock_t *successor,
enum bblock_link_kind kind);
bool is_predecessor_of(const bblock_t *block,
enum bblock_link_kind kind) const;
bool is_successor_of(const bblock_t *block,
enum bblock_link_kind kind) const;
bool can_combine_with(const bblock_t *that) const;
void combine_with(bblock_t *that);
void dump(FILE *file = stderr) const;
backend_instruction *start();
const backend_instruction *start() const;
backend_instruction *end();
const backend_instruction *end() const;
bblock_t *next();
const bblock_t *next() const;
bblock_t *prev();
const bblock_t *prev() const;
bool starts_with_control_flow() const;
bool ends_with_control_flow() const;
backend_instruction *first_non_control_flow_inst();
backend_instruction *last_non_control_flow_inst();
private:
/**
* \sa unlink_parents, unlink_children
*/
void unlink_list(exec_list *);
public:
void unlink_parents()
{
unlink_list(&parents);
}
void unlink_children()
{
unlink_list(&children);
}
#endif
struct exec_node link;
struct cfg_t *cfg;
int start_ip;
int end_ip;
/**
* Change in end_ip since the last time IPs of later blocks were updated.
*/
int end_ip_delta;
struct exec_list instructions;
struct exec_list parents;
struct exec_list children;
int num;
};
static inline struct backend_instruction *
bblock_start(struct bblock_t *block)
{
return (struct backend_instruction *)exec_list_get_head(&block->instructions);
}
static inline const struct backend_instruction *
bblock_start_const(const struct bblock_t *block)
{
return (const struct backend_instruction *)exec_list_get_head_const(&block->instructions);
}
static inline struct backend_instruction *
bblock_end(struct bblock_t *block)
{
return (struct backend_instruction *)exec_list_get_tail(&block->instructions);
}
static inline const struct backend_instruction *
bblock_end_const(const struct bblock_t *block)
{
return (const struct backend_instruction *)exec_list_get_tail_const(&block->instructions);
}
static inline struct bblock_t *
bblock_next(struct bblock_t *block)
{
if (exec_node_is_tail_sentinel(block->link.next))
return NULL;
return (struct bblock_t *)block->link.next;
}
static inline const struct bblock_t *
bblock_next_const(const struct bblock_t *block)
{
if (exec_node_is_tail_sentinel(block->link.next))
return NULL;
return (const struct bblock_t *)block->link.next;
}
static inline struct bblock_t *
bblock_prev(struct bblock_t *block)
{
if (exec_node_is_head_sentinel(block->link.prev))
return NULL;
return (struct bblock_t *)block->link.prev;
}
static inline const struct bblock_t *
bblock_prev_const(const struct bblock_t *block)
{
if (exec_node_is_head_sentinel(block->link.prev))
return NULL;
return (const struct bblock_t *)block->link.prev;
}
static inline bool
bblock_starts_with_control_flow(const struct bblock_t *block)
{
enum opcode op = bblock_start_const(block)->opcode;
return op == BRW_OPCODE_DO || op == BRW_OPCODE_ENDIF;
}
static inline bool
bblock_ends_with_control_flow(const struct bblock_t *block)
{
enum opcode op = bblock_end_const(block)->opcode;
return op == BRW_OPCODE_IF ||
op == BRW_OPCODE_ELSE ||
op == BRW_OPCODE_WHILE ||
op == BRW_OPCODE_BREAK ||
op == BRW_OPCODE_CONTINUE;
}
static inline struct backend_instruction *
bblock_first_non_control_flow_inst(struct bblock_t *block)
{
struct backend_instruction *inst = bblock_start(block);
if (bblock_starts_with_control_flow(block))
#ifdef __cplusplus
inst = (struct backend_instruction *)inst->next;
#else
inst = (struct backend_instruction *)inst->link.next;
#endif
return inst;
}
static inline struct backend_instruction *
bblock_last_non_control_flow_inst(struct bblock_t *block)
{
struct backend_instruction *inst = bblock_end(block);
if (bblock_ends_with_control_flow(block))
#ifdef __cplusplus
inst = (struct backend_instruction *)inst->prev;
#else
inst = (struct backend_instruction *)inst->link.prev;
#endif
return inst;
}
#ifdef __cplusplus
inline backend_instruction *
bblock_t::start()
{
return bblock_start(this);
}
inline const backend_instruction *
bblock_t::start() const
{
return bblock_start_const(this);
}
inline backend_instruction *
bblock_t::end()
{
return bblock_end(this);
}
inline const backend_instruction *
bblock_t::end() const
{
return bblock_end_const(this);
}
inline bblock_t *
bblock_t::next()
{
return bblock_next(this);
}
inline const bblock_t *
bblock_t::next() const
{
return bblock_next_const(this);
}
inline bblock_t *
bblock_t::prev()
{
return bblock_prev(this);
}
inline const bblock_t *
bblock_t::prev() const
{
return bblock_prev_const(this);
}
inline bool
bblock_t::starts_with_control_flow() const
{
return bblock_starts_with_control_flow(this);
}
inline bool
bblock_t::ends_with_control_flow() const
{
return bblock_ends_with_control_flow(this);
}
inline backend_instruction *
bblock_t::first_non_control_flow_inst()
{
return bblock_first_non_control_flow_inst(this);
}
inline backend_instruction *
bblock_t::last_non_control_flow_inst()
{
return bblock_last_non_control_flow_inst(this);
}
#endif
struct cfg_t {
#ifdef __cplusplus
DECLARE_RALLOC_CXX_OPERATORS(cfg_t)
cfg_t(const backend_shader *s, exec_list *instructions);
~cfg_t();
void remove_block(bblock_t *block);
bblock_t *first_block();
const bblock_t *first_block() const;
bblock_t *last_block();
const bblock_t *last_block() const;
bblock_t *new_block();
void set_next_block(bblock_t **cur, bblock_t *block, int ip);
void make_block_array();
void dump(FILE *file = stderr);
void dump_cfg();
#ifdef NDEBUG
void validate(UNUSED const char *stage_abbrev) { }
#else
void validate(const char *stage_abbrev);
#endif
/**
* Propagate bblock_t::end_ip_delta data through the CFG.
*/
inline void adjust_block_ips();
#endif
const struct backend_shader *s;
void *mem_ctx;
/** Ordered list (by ip) of basic blocks */
struct exec_list block_list;
struct bblock_t **blocks;
int num_blocks;
};
static inline struct bblock_t *
cfg_first_block(struct cfg_t *cfg)
{
return (struct bblock_t *)exec_list_get_head(&cfg->block_list);
}
static inline const struct bblock_t *
cfg_first_block_const(const struct cfg_t *cfg)
{
return (const struct bblock_t *)exec_list_get_head_const(&cfg->block_list);
}
static inline struct bblock_t *
cfg_last_block(struct cfg_t *cfg)
{
return (struct bblock_t *)exec_list_get_tail(&cfg->block_list);
}
static inline const struct bblock_t *
cfg_last_block_const(const struct cfg_t *cfg)
{
return (const struct bblock_t *)exec_list_get_tail_const(&cfg->block_list);
}
#ifdef __cplusplus
inline bblock_t *
cfg_t::first_block()
{
return cfg_first_block(this);
}
inline const bblock_t *
cfg_t::first_block() const
{
return cfg_first_block_const(this);
}
inline bblock_t *
cfg_t::last_block()
{
return cfg_last_block(this);
}
inline const bblock_t *
cfg_t::last_block() const
{
return cfg_last_block_const(this);
}
#endif
/* Note that this is implemented with a double for loop -- break will
* break from the inner loop only!
*/
#define foreach_block_and_inst(__block, __type, __inst, __cfg) \
foreach_block (__block, __cfg) \
foreach_inst_in_block (__type, __inst, __block)
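/* Illustrative caveat (sketch; should_stop() is a hypothetical predicate):
* because the macro expands to two nested loops, 'break' only leaves the
* inner per-instruction loop and the walk resumes with the next block:
*
*    foreach_block_and_inst(block, backend_instruction, inst, cfg) {
*       if (should_stop(inst))
*          break;   // skips to the next block, does NOT end the walk
*    }
*/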
/* Note that this is implemented with a double for loop -- break will
* break from the inner loop only!
*/
#define foreach_block_and_inst_safe(__block, __type, __inst, __cfg) \
foreach_block_safe (__block, __cfg) \
foreach_inst_in_block_safe (__type, __inst, __block)
#define foreach_block(__block, __cfg) \
foreach_list_typed (bblock_t, __block, link, &(__cfg)->block_list)
#define foreach_block_reverse(__block, __cfg) \
foreach_list_typed_reverse (bblock_t, __block, link, &(__cfg)->block_list)
#define foreach_block_safe(__block, __cfg) \
foreach_list_typed_safe (bblock_t, __block, link, &(__cfg)->block_list)
#define foreach_block_reverse_safe(__block, __cfg) \
foreach_list_typed_reverse_safe (bblock_t, __block, link, &(__cfg)->block_list)
#define foreach_inst_in_block(__type, __inst, __block) \
foreach_in_list(__type, __inst, &(__block)->instructions)
#define foreach_inst_in_block_safe(__type, __inst, __block) \
for (__type *__inst = (__type *)__block->instructions.head_sentinel.next, \
*__next = (__type *)__inst->next; \
__next != NULL; \
__inst = __next, \
__next = (__type *)__next->next)
#define foreach_inst_in_block_reverse(__type, __inst, __block) \
foreach_in_list_reverse(__type, __inst, &(__block)->instructions)
#define foreach_inst_in_block_reverse_safe(__type, __inst, __block) \
foreach_in_list_reverse_safe(__type, __inst, &(__block)->instructions)
#define foreach_inst_in_block_starting_from(__type, __scan_inst, __inst) \
for (__type *__scan_inst = (__type *)__inst->next; \
!__scan_inst->is_tail_sentinel(); \
__scan_inst = (__type *)__scan_inst->next)
#define foreach_inst_in_block_reverse_starting_from(__type, __scan_inst, __inst) \
for (__type *__scan_inst = (__type *)__inst->prev; \
!__scan_inst->is_head_sentinel(); \
__scan_inst = (__type *)__scan_inst->prev)
#ifdef __cplusplus
inline void
cfg_t::adjust_block_ips()
{
int delta = 0;
foreach_block(block, this) {
block->start_ip += delta;
block->end_ip += delta;
delta += block->end_ip_delta;
block->end_ip_delta = 0;
}
}
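/* Worked example (assuming a pass inserted one instruction into block B,
* updated B->end_ip itself and recorded B->end_ip_delta = 1): the loop
* above shifts the start_ip/end_ip of every block after B by one and
* clears the delta, restoring consistent IPs without rebuilding the CFG.
*/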
namespace brw {
/**
* Immediate dominator tree analysis of a shader.
*/
struct idom_tree {
idom_tree(const backend_shader *s);
~idom_tree();
bool
validate(const backend_shader *) const
{
/* FINISHME */
return true;
}
analysis_dependency_class
dependency_class() const
{
return DEPENDENCY_BLOCKS;
}
const bblock_t *
parent(const bblock_t *b) const
{
assert(unsigned(b->num) < num_parents);
return parents[b->num];
}
bblock_t *
parent(bblock_t *b) const
{
assert(unsigned(b->num) < num_parents);
return parents[b->num];
}
bblock_t *
intersect(bblock_t *b1, bblock_t *b2) const;
void
dump() const;
private:
unsigned num_parents;
bblock_t **parents;
};
}
#endif
#endif /* BRW_CFG_H */


@@ -0,0 +1,163 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#ifndef BRW_CLIP_H
#define BRW_CLIP_H
#include "brw_compiler.h"
#include "brw_eu.h"
/* Initial 3 verts, plus at most 6 additional verts from intersections
* with fixed planes, plus at most 8 additional verts from intersections
* with user clip planes
*/
#define MAX_VERTS (3+6+8)
#define PRIM_MASK (0x1f)
struct brw_clip_compile {
struct brw_codegen func;
struct brw_clip_prog_key key;
struct brw_clip_prog_data prog_data;
struct {
struct brw_reg R0;
struct brw_reg vertex[MAX_VERTS];
struct brw_reg t;
struct brw_reg t0, t1;
struct brw_reg dp0, dp1;
struct brw_reg dpPrev;
struct brw_reg dp;
struct brw_reg loopcount;
struct brw_reg nr_verts;
struct brw_reg planemask;
struct brw_reg inlist;
struct brw_reg outlist;
struct brw_reg freelist;
struct brw_reg dir;
struct brw_reg tmp0, tmp1;
struct brw_reg offset;
struct brw_reg fixed_planes;
struct brw_reg plane_equation;
struct brw_reg ff_sync;
/* Bitmask indicating which coordinate attribute should be used for
* comparison to each clipping plane. A 0 indicates that VARYING_SLOT_POS
* should be used, because it's one of the fixed +/- x/y/z planes that
* constitute the bounds of the view volume. A 1 indicates that
* VARYING_SLOT_CLIP_VERTEX should be used (if available) since it's a user-
* defined clipping plane.
*/
struct brw_reg vertex_src_mask;
/* Offset into the vertex of the current plane's clipdistance value */
struct brw_reg clipdistance_offset;
} reg;
/* Number of registers storing VUE data */
GLuint nr_regs;
GLuint first_tmp;
GLuint last_tmp;
bool need_direction;
struct intel_vue_map vue_map;
};
/**
* True if the given varying is one of the outputs of the vertex shader.
*/
static inline bool brw_clip_have_varying(struct brw_clip_compile *c,
GLuint varying)
{
return (c->key.attrs & BITFIELD64_BIT(varying)) != 0;
}
/* Points are only culled, so no clip routine is strictly needed; however,
* it works out easier to have a dummy one.
*/
void brw_emit_unfilled_clip( struct brw_clip_compile *c );
void brw_emit_tri_clip( struct brw_clip_compile *c );
void brw_emit_line_clip( struct brw_clip_compile *c );
void brw_emit_point_clip( struct brw_clip_compile *c );
/* brw_clip_tri.c, for use by the unfilled clip routine:
*/
void brw_clip_tri_init_vertices( struct brw_clip_compile *c );
void brw_clip_tri_flat_shade( struct brw_clip_compile *c );
void brw_clip_tri( struct brw_clip_compile *c );
void brw_clip_tri_emit_polygon( struct brw_clip_compile *c );
void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
GLuint nr_verts );
/* Utils:
*/
void brw_clip_interp_vertex( struct brw_clip_compile *c,
struct brw_indirect dest_ptr,
struct brw_indirect v0_ptr, /* from */
struct brw_indirect v1_ptr, /* to */
struct brw_reg t0,
bool force_edgeflag );
void brw_clip_init_planes( struct brw_clip_compile *c );
void brw_clip_emit_vue(struct brw_clip_compile *c,
struct brw_indirect vert,
enum brw_urb_write_flags flags,
GLuint header);
void brw_clip_kill_thread(struct brw_clip_compile *c);
struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c );
struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c );
void brw_clip_copy_flatshaded_attributes( struct brw_clip_compile *c,
GLuint to, GLuint from );
void brw_clip_init_clipmask( struct brw_clip_compile *c );
struct brw_reg get_tmp( struct brw_clip_compile *c );
void brw_clip_project_position(struct brw_clip_compile *c,
struct brw_reg pos );
void brw_clip_ff_sync(struct brw_clip_compile *c);
void brw_clip_init_ff_sync(struct brw_clip_compile *c);
#endif


@@ -0,0 +1,303 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include "brw_clip.h"
#include "brw_prim.h"
static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
{
const struct intel_device_info *devinfo = c->func.devinfo;
GLuint i = 0, j;
/* Register usage is static, precompute here:
*/
c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
if (c->key.nr_userclip) {
c->reg.fixed_planes = brw_vec4_grf(i, 0);
i += (6 + c->key.nr_userclip + 1) / 2;
c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2;
}
else
c->prog_data.curb_read_length = 0;
/* Payload vertices plus space for more generated vertices:
*/
for (j = 0; j < 4; j++) {
c->reg.vertex[j] = brw_vec4_grf(i, 0);
i += c->nr_regs;
}
c->reg.t = brw_vec1_grf(i, 0);
c->reg.t0 = brw_vec1_grf(i, 1);
c->reg.t1 = brw_vec1_grf(i, 2);
c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD);
c->reg.plane_equation = brw_vec4_grf(i, 4);
i++;
c->reg.dp0 = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */
c->reg.dp1 = brw_vec1_grf(i, 4);
i++;
if (!c->key.nr_userclip) {
c->reg.fixed_planes = brw_vec8_grf(i, 0);
i++;
}
c->reg.vertex_src_mask = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
c->reg.clipdistance_offset = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_W);
i++;
if (devinfo->ver == 5) {
c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
i++;
}
c->first_tmp = i;
c->last_tmp = i;
c->prog_data.urb_read_length = c->nr_regs; /* ? */
c->prog_data.total_grf = i;
}
/* Line clipping, more or less following the following algorithm:
*
* for (p=0;p<MAX_PLANES;p++) {
* if (clipmask & (1 << p)) {
* GLfloat dp0 = DOTPROD( vtx0, plane[p] );
* GLfloat dp1 = DOTPROD( vtx1, plane[p] );
*
* if (dp1 < 0.0f) {
* GLfloat t = dp1 / (dp1 - dp0);
* if (t > t1) t1 = t;
* } else {
* GLfloat t = dp0 / (dp0 - dp1);
* if (t > t0) t0 = t;
* }
*
* if (t0 + t1 >= 1.0)
* return;
* }
* }
*
* interp( ctx, newvtx0, vtx0, vtx1, t0 );
* interp( ctx, newvtx1, vtx1, vtx0, t1 );
*
*/
static void clip_and_emit_line( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_indirect vtx0 = brw_indirect(0, 0);
struct brw_indirect vtx1 = brw_indirect(1, 0);
struct brw_indirect newvtx0 = brw_indirect(2, 0);
struct brw_indirect newvtx1 = brw_indirect(3, 0);
struct brw_indirect plane_ptr = brw_indirect(4, 0);
struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD);
GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
GLint clipdist0_offset = c->key.nr_userclip
? brw_varying_to_offset(&c->vue_map, VARYING_SLOT_CLIP_DIST0)
: 0;
brw_MOV(p, get_addr_reg(vtx0), brw_address(c->reg.vertex[0]));
brw_MOV(p, get_addr_reg(vtx1), brw_address(c->reg.vertex[1]));
brw_MOV(p, get_addr_reg(newvtx0), brw_address(c->reg.vertex[2]));
brw_MOV(p, get_addr_reg(newvtx1), brw_address(c->reg.vertex[3]));
brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c));
/* Note: init t0, t1 together:
*/
brw_MOV(p, vec2(c->reg.t0), brw_imm_f(0));
brw_clip_init_planes(c);
brw_clip_init_clipmask(c);
/* -ve rhw workaround */
if (p->devinfo->has_negative_rhw_bug) {
brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
brw_imm_ud(1<<20));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
}
/* Set the initial vertex source mask: The first 6 planes are the bounds
* of the view volume; the next 8 planes are the user clipping planes.
*/
brw_MOV(p, c->reg.vertex_src_mask, brw_imm_ud(0x3fc0));
/* Set the initial clipdistance offset to be 6 floats before gl_ClipDistance[0].
* We'll increment 6 times before we start hitting actual user clipping. */
brw_MOV(p, c->reg.clipdistance_offset, brw_imm_d(clipdist0_offset - 6*sizeof(float)));
brw_DO(p, BRW_EXECUTE_1);
{
/* if (planemask & 1)
*/
brw_AND(p, v1_null_ud, c->reg.planemask, brw_imm_ud(1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
{
brw_AND(p, v1_null_ud, c->reg.vertex_src_mask, brw_imm_ud(1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
{
/* user clip distance: just fetch the correct float from each vertex */
struct brw_indirect temp_ptr = brw_indirect(7, 0);
brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx0), c->reg.clipdistance_offset);
brw_MOV(p, c->reg.dp0, deref_1f(temp_ptr, 0));
brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx1), c->reg.clipdistance_offset);
brw_MOV(p, c->reg.dp1, deref_1f(temp_ptr, 0));
}
brw_ELSE(p);
{
/* fixed plane: fetch the hpos, dp4 against the plane. */
if (c->key.nr_userclip)
brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
else
brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));
brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, hpos_offset), c->reg.plane_equation);
brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, hpos_offset), c->reg.plane_equation);
}
brw_ENDIF(p);
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, vec1(c->reg.dp1), brw_imm_f(0.0f));
brw_IF(p, BRW_EXECUTE_1);
{
/*
* Both can be negative on GM965/G965 due to RHW workaround
* if so, this object should be rejected.
*/
if (p->devinfo->has_negative_rhw_bug) {
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_kill_thread(c);
}
brw_ENDIF(p);
}
brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0));
brw_math_invert(p, c->reg.t, c->reg.t);
brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1);
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 );
brw_MOV(p, c->reg.t1, c->reg.t);
brw_inst_set_pred_control(p->devinfo, brw_last_inst,
BRW_PREDICATE_NORMAL);
}
brw_ELSE(p);
{
/* Coming back in. We know that both cannot be negative
* because the line would have been culled in that case.
*/
/* If both are positive, do nothing */
/* Only on GM965/G965 */
if (p->devinfo->has_negative_rhw_bug) {
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
brw_IF(p, BRW_EXECUTE_1);
}
{
brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1));
brw_math_invert(p, c->reg.t, c->reg.t);
brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0);
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 );
brw_MOV(p, c->reg.t0, c->reg.t);
brw_inst_set_pred_control(p->devinfo, brw_last_inst,
BRW_PREDICATE_NORMAL);
}
if (p->devinfo->has_negative_rhw_bug) {
brw_ENDIF(p);
}
}
brw_ENDIF(p);
}
brw_ENDIF(p);
/* plane_ptr++;
*/
brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));
   /* while ((planemask >>= 1) != 0)
    */
brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_SHR(p, c->reg.vertex_src_mask, c->reg.vertex_src_mask, brw_imm_ud(1));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_ADD(p, c->reg.clipdistance_offset, c->reg.clipdistance_offset, brw_imm_w(sizeof(float)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
}
brw_WHILE(p);
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_ADD(p, c->reg.t, c->reg.t0, c->reg.t1);
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.t, brw_imm_f(1.0));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_interp_vertex(c, newvtx0, vtx0, vtx1, c->reg.t0, false);
brw_clip_interp_vertex(c, newvtx1, vtx1, vtx0, c->reg.t1, false);
brw_clip_emit_vue(c, newvtx0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
(_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_START);
brw_clip_emit_vue(c, newvtx1, BRW_URB_WRITE_EOT_COMPLETE,
(_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END);
}
brw_ENDIF(p);
brw_clip_kill_thread(c);
}
void brw_emit_line_clip( struct brw_clip_compile *c )
{
brw_clip_line_alloc_regs(c);
brw_clip_init_ff_sync(c);
if (c->key.contains_flat_varying) {
if (c->key.pv_first)
brw_clip_copy_flatshaded_attributes(c, 1, 0);
else
brw_clip_copy_flatshaded_attributes(c, 0, 1);
}
clip_and_emit_line(c);
}


@ -0,0 +1,45 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include "brw_clip.h"
/* Point clipping, nothing to do?
*/
void brw_emit_point_clip( struct brw_clip_compile *c )
{
/* Send an empty message to kill the thread:
*/
brw_clip_tri_alloc_regs(c, 0);
brw_clip_init_ff_sync(c);
brw_clip_kill_thread(c);
}


@ -0,0 +1,659 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include "brw_clip.h"
#include "brw_prim.h"
static void release_tmps( struct brw_clip_compile *c )
{
c->last_tmp = c->first_tmp;
}
void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
GLuint nr_verts )
{
const struct intel_device_info *devinfo = c->func.devinfo;
GLuint i = 0,j;
/* Register usage is static, precompute here:
*/
c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
if (c->key.nr_userclip) {
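      /* Each GRF holds two vec4 plane equations, so pushing the six
       * view-volume planes plus nr_userclip user planes through the
       * CURBE takes ceil((6 + nr_userclip) / 2) registers.
       */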
c->reg.fixed_planes = brw_vec4_grf(i, 0);
i += (6 + c->key.nr_userclip + 1) / 2;
c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2;
}
else
c->prog_data.curb_read_length = 0;
/* Payload vertices plus space for more generated vertices:
*/
for (j = 0; j < nr_verts; j++) {
c->reg.vertex[j] = brw_vec4_grf(i, 0);
i += c->nr_regs;
}
if (c->vue_map.num_slots % 2 && nr_verts > 0) {
/* The VUE has an odd number of slots so the last register is only half
* used. Fill the second half with zero.
*/
for (j = 0; j < 3; j++) {
GLuint delta = brw_vue_slot_to_offset(c->vue_map.num_slots);
brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0));
}
}
c->reg.t = brw_vec1_grf(i, 0);
c->reg.loopcount = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_D);
c->reg.nr_verts = retype(brw_vec1_grf(i, 2), BRW_REGISTER_TYPE_UD);
c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD);
c->reg.plane_equation = brw_vec4_grf(i, 4);
i++;
c->reg.dpPrev = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */
c->reg.dp = brw_vec1_grf(i, 4);
i++;
c->reg.inlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
i++;
c->reg.outlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
i++;
c->reg.freelist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
i++;
if (!c->key.nr_userclip) {
c->reg.fixed_planes = brw_vec8_grf(i, 0);
i++;
}
if (c->key.do_unfilled) {
c->reg.dir = brw_vec4_grf(i, 0);
c->reg.offset = brw_vec4_grf(i, 4);
i++;
c->reg.tmp0 = brw_vec4_grf(i, 0);
c->reg.tmp1 = brw_vec4_grf(i, 4);
i++;
}
c->reg.vertex_src_mask = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
c->reg.clipdistance_offset = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_W);
i++;
if (devinfo->ver == 5) {
c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
i++;
}
c->first_tmp = i;
c->last_tmp = i;
c->prog_data.urb_read_length = c->nr_regs; /* ? */
c->prog_data.total_grf = i;
}
void brw_clip_tri_init_vertices( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
/* Initial list of indices for incoming vertices:
*/
brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_EQ,
tmp0,
brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));
   /* XXX: Is there an easier way to do this?  We need to reverse every
    * second tristrip element; can that be ignored sometimes?
    */
brw_IF(p, BRW_EXECUTE_1);
{
brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[1]) );
brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[0]) );
if (c->need_direction)
brw_MOV(p, c->reg.dir, brw_imm_f(-1));
}
brw_ELSE(p);
{
brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[0]) );
brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[1]) );
if (c->need_direction)
brw_MOV(p, c->reg.dir, brw_imm_f(1));
}
brw_ENDIF(p);
brw_MOV(p, get_element(c->reg.inlist, 2), brw_address(c->reg.vertex[2]) );
brw_MOV(p, brw_vec8_grf(c->reg.outlist.nr, 0), brw_imm_f(0));
brw_MOV(p, c->reg.nr_verts, brw_imm_ud(3));
}
void brw_clip_tri_flat_shade( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_EQ,
tmp0,
brw_imm_ud(_3DPRIM_POLYGON));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_copy_flatshaded_attributes(c, 1, 0);
brw_clip_copy_flatshaded_attributes(c, 2, 0);
}
brw_ELSE(p);
{
if (c->key.pv_first) {
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_EQ,
tmp0,
brw_imm_ud(_3DPRIM_TRIFAN));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_copy_flatshaded_attributes(c, 0, 1);
brw_clip_copy_flatshaded_attributes(c, 2, 1);
}
brw_ELSE(p);
{
brw_clip_copy_flatshaded_attributes(c, 1, 0);
brw_clip_copy_flatshaded_attributes(c, 2, 0);
}
brw_ENDIF(p);
}
else {
brw_clip_copy_flatshaded_attributes(c, 0, 2);
brw_clip_copy_flatshaded_attributes(c, 1, 2);
}
}
brw_ENDIF(p);
}
/**
* Loads the clip distance for a vertex into `dst`, and ends with
* a comparison of it to zero with the condition `cond`.
*
* - If using a fixed plane, the distance is dot(hpos, plane).
* - If using a user clip plane, the distance is directly available in the vertex.
*/
static inline void
load_clip_distance(struct brw_clip_compile *c, struct brw_indirect vtx,
struct brw_reg dst, GLuint hpos_offset, int cond)
{
struct brw_codegen *p = &c->func;
dst = vec4(dst);
brw_AND(p, vec1(brw_null_reg()), c->reg.vertex_src_mask, brw_imm_ud(1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
{
struct brw_indirect temp_ptr = brw_indirect(7, 0);
brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx), c->reg.clipdistance_offset);
brw_MOV(p, vec1(dst), deref_1f(temp_ptr, 0));
}
brw_ELSE(p);
{
brw_MOV(p, dst, deref_4f(vtx, hpos_offset));
brw_DP4(p, dst, dst, c->reg.plane_equation);
}
brw_ENDIF(p);
brw_CMP(p, brw_null_reg(), cond, vec1(dst), brw_imm_f(0.0f));
}
/* Use mesa's clipping algorithms, translated to GFX4 assembly.
*/
void brw_clip_tri( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_indirect vtx = brw_indirect(0, 0);
struct brw_indirect vtxPrev = brw_indirect(1, 0);
struct brw_indirect vtxOut = brw_indirect(2, 0);
struct brw_indirect plane_ptr = brw_indirect(3, 0);
struct brw_indirect inlist_ptr = brw_indirect(4, 0);
struct brw_indirect outlist_ptr = brw_indirect(5, 0);
struct brw_indirect freelist_ptr = brw_indirect(6, 0);
GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
GLint clipdist0_offset = c->key.nr_userclip
? brw_varying_to_offset(&c->vue_map, VARYING_SLOT_CLIP_DIST0)
: 0;
brw_MOV(p, get_addr_reg(vtxPrev), brw_address(c->reg.vertex[2]) );
brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c));
brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist));
brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));
brw_MOV(p, get_addr_reg(freelist_ptr), brw_address(c->reg.vertex[3]) );
/* Set the initial vertex source mask: The first 6 planes are the bounds
* of the view volume; the next 8 planes are the user clipping planes.
*/
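   /* 0x3fc0 sets bits 6..13, so the first six iterations (fixed planes)
    * see a zero bit and the next eight (user planes) a one.
    */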
brw_MOV(p, c->reg.vertex_src_mask, brw_imm_ud(0x3fc0));
/* Set the initial clipdistance offset to be 6 floats before gl_ClipDistance[0].
* We'll increment 6 times before we start hitting actual user clipping. */
brw_MOV(p, c->reg.clipdistance_offset, brw_imm_d(clipdist0_offset - 6*sizeof(float)));
brw_DO(p, BRW_EXECUTE_1);
{
/* if (planemask & 1)
*/
brw_AND(p, vec1(brw_null_reg()), c->reg.planemask, brw_imm_ud(1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
{
/* vtxOut = freelist_ptr++
*/
brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(freelist_ptr) );
brw_ADD(p, get_addr_reg(freelist_ptr), get_addr_reg(freelist_ptr), brw_imm_uw(c->nr_regs * REG_SIZE));
if (c->key.nr_userclip)
brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
else
brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));
brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
brw_MOV(p, c->reg.nr_verts, brw_imm_ud(0));
brw_DO(p, BRW_EXECUTE_1);
{
/* vtx = *input_ptr;
*/
brw_MOV(p, get_addr_reg(vtx), deref_1uw(inlist_ptr, 0));
load_clip_distance(c, vtxPrev, c->reg.dpPrev, hpos_offset, BRW_CONDITIONAL_L);
/* (prev < 0.0f) */
brw_IF(p, BRW_EXECUTE_1);
{
load_clip_distance(c, vtx, c->reg.dp, hpos_offset, BRW_CONDITIONAL_GE);
/* IS_POSITIVE(next)
*/
brw_IF(p, BRW_EXECUTE_1);
{
/* Coming back in.
*/
brw_ADD(p, c->reg.t, c->reg.dpPrev, negate(c->reg.dp));
brw_math_invert(p, c->reg.t, c->reg.t);
brw_MUL(p, c->reg.t, c->reg.t, c->reg.dpPrev);
/* If (vtxOut == 0) vtxOut = vtxPrev
*/
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtxPrev));
brw_inst_set_pred_control(p->devinfo, brw_last_inst,
BRW_PREDICATE_NORMAL);
brw_clip_interp_vertex(c, vtxOut, vtxPrev, vtx, c->reg.t, false);
/* *outlist_ptr++ = vtxOut;
* nr_verts++;
* vtxOut = 0;
*/
brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
}
brw_ENDIF(p);
}
brw_ELSE(p);
{
/* *outlist_ptr++ = vtxPrev;
* nr_verts++;
*/
brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxPrev));
brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
load_clip_distance(c, vtx, c->reg.dp, hpos_offset, BRW_CONDITIONAL_L);
/* (next < 0.0f)
*/
brw_IF(p, BRW_EXECUTE_1);
{
/* Going out of bounds. Avoid division by zero as we
* know dp != dpPrev from DIFFERENT_SIGNS, above.
*/
brw_ADD(p, c->reg.t, c->reg.dp, negate(c->reg.dpPrev));
brw_math_invert(p, c->reg.t, c->reg.t);
brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp);
/* If (vtxOut == 0) vtxOut = vtx
*/
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtx));
brw_inst_set_pred_control(p->devinfo, brw_last_inst,
BRW_PREDICATE_NORMAL);
brw_clip_interp_vertex(c, vtxOut, vtx, vtxPrev, c->reg.t, true);
/* *outlist_ptr++ = vtxOut;
* nr_verts++;
* vtxOut = 0;
*/
brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
}
brw_ENDIF(p);
}
brw_ENDIF(p);
/* vtxPrev = vtx;
* inlist_ptr++;
*/
brw_MOV(p, get_addr_reg(vtxPrev), get_addr_reg(vtx));
brw_ADD(p, get_addr_reg(inlist_ptr), get_addr_reg(inlist_ptr), brw_imm_uw(sizeof(short)));
/* while (--loopcount != 0)
*/
brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
}
brw_WHILE(p);
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
/* vtxPrev = *(outlist_ptr-1) OR: outlist[nr_verts-1]
* inlist = outlist
* inlist_ptr = &inlist[0]
* outlist_ptr = &outlist[0]
*/
brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_w(-2));
brw_MOV(p, get_addr_reg(vtxPrev), deref_1uw(outlist_ptr, 0));
brw_MOV(p, brw_vec8_grf(c->reg.inlist.nr, 0), brw_vec8_grf(c->reg.outlist.nr, 0));
brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist));
brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));
}
brw_ENDIF(p);
/* plane_ptr++;
*/
brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));
/* nr_verts >= 3
*/
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_GE,
c->reg.nr_verts,
brw_imm_ud(3));
brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL);
      /* && ((planemask >>= 1) != 0)
       */
brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_SHR(p, c->reg.vertex_src_mask, c->reg.vertex_src_mask, brw_imm_ud(1));
brw_ADD(p, c->reg.clipdistance_offset, c->reg.clipdistance_offset, brw_imm_w(sizeof(float)));
}
brw_WHILE(p);
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
void brw_clip_tri_emit_polygon(struct brw_clip_compile *c)
{
struct brw_codegen *p = &c->func;
/* for (loopcount = nr_verts-2; loopcount > 0; loopcount--)
*/
brw_ADD(p,
c->reg.loopcount,
c->reg.nr_verts,
brw_imm_d(-2));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_G);
brw_IF(p, BRW_EXECUTE_1);
{
struct brw_indirect v0 = brw_indirect(0, 0);
struct brw_indirect vptr = brw_indirect(1, 0);
brw_MOV(p, get_addr_reg(vptr), brw_address(c->reg.inlist));
brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
((_3DPRIM_TRIFAN << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_START));
brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2));
brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
brw_DO(p, BRW_EXECUTE_1);
{
brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
(_3DPRIM_TRIFAN << URB_WRITE_PRIM_TYPE_SHIFT));
brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2));
brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
}
brw_WHILE(p);
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_clip_emit_vue(c, v0, BRW_URB_WRITE_EOT_COMPLETE,
((_3DPRIM_TRIFAN << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END));
}
brw_ENDIF(p);
}
static void do_clip_tri( struct brw_clip_compile *c )
{
brw_clip_init_planes(c);
brw_clip_tri(c);
}
static void maybe_do_clip_tri( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0));
brw_IF(p, BRW_EXECUTE_1);
{
do_clip_tri(c);
}
brw_ENDIF(p);
}
static void brw_clip_test( struct brw_clip_compile *c )
{
struct brw_reg t = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
struct brw_reg t1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
struct brw_reg t2 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
struct brw_reg t3 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
struct brw_reg v0 = get_tmp(c);
struct brw_reg v1 = get_tmp(c);
struct brw_reg v2 = get_tmp(c);
struct brw_indirect vt0 = brw_indirect(0, 0);
struct brw_indirect vt1 = brw_indirect(1, 0);
struct brw_indirect vt2 = brw_indirect(2, 0);
struct brw_codegen *p = &c->func;
struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
GLuint hpos_offset = brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_POS);
brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0]));
brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1]));
brw_MOV(p, get_addr_reg(vt2), brw_address(c->reg.vertex[2]));
brw_MOV(p, v0, deref_4f(vt0, hpos_offset));
brw_MOV(p, v1, deref_4f(vt1, hpos_offset));
brw_MOV(p, v2, deref_4f(vt2, hpos_offset));
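   /* Clear the six view-volume bits of the planemask (5..0: xmin, xmax,
    * ymin, ymax, nearz, farz, matching the order they are set below);
    * they are recomputed from the per-vertex outcode tests that follow.
    */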
brw_AND(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(~0x3f));
/* test nearz, xmin, ymin plane */
/* clip.xyz < -clip.w */
brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, negate(get_element(v0, 3)));
brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, negate(get_element(v1, 3)));
brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, negate(get_element(v2, 3)));
/* All vertices are outside of a plane, rejected */
brw_AND(p, t, t1, t2);
brw_AND(p, t, t, t3);
brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
brw_OR(p, tmp0, tmp0, get_element(t, 2));
brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_kill_thread(c);
}
brw_ENDIF(p);
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   /* Some vertices are inside a plane, some are outside: need to clip. */
brw_XOR(p, t, t1, t2);
brw_XOR(p, t1, t2, t3);
brw_OR(p, t, t, t1);
brw_AND(p, t, t, brw_imm_ud(0x1));
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 0), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 1), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 2), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
/* test farz, xmax, ymax plane */
/* clip.xyz > clip.w */
brw_CMP(p, t1, BRW_CONDITIONAL_G, v0, get_element(v0, 3));
brw_CMP(p, t2, BRW_CONDITIONAL_G, v1, get_element(v1, 3));
brw_CMP(p, t3, BRW_CONDITIONAL_G, v2, get_element(v2, 3));
/* All vertices are outside of a plane, rejected */
brw_AND(p, t, t1, t2);
brw_AND(p, t, t, t3);
brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
brw_OR(p, tmp0, tmp0, get_element(t, 2));
brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_kill_thread(c);
}
brw_ENDIF(p);
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   /* Some vertices are inside a plane, some are outside: need to clip. */
brw_XOR(p, t, t1, t2);
brw_XOR(p, t1, t2, t3);
brw_OR(p, t, t, t1);
brw_AND(p, t, t, brw_imm_ud(0x1));
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 0), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 1), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 2), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
release_tmps(c);
}
void brw_emit_tri_clip( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
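   /* Three payload vertices, plus room for one extra vertex per clip
    * plane (6 fixed + nr_userclip user planes): each plane can add at
    * most one vertex to the polygon.
    */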
brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
brw_clip_tri_init_vertices(c);
brw_clip_init_clipmask(c);
brw_clip_init_ff_sync(c);
   /* If the negative-RHW workaround bit is set, do the clip test:
    */
if (p->devinfo->has_negative_rhw_bug) {
brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
brw_imm_ud(1<<20));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_test(c);
}
brw_ENDIF(p);
}
   /* Can't push this into do_clip_tri: with polygon (or quad)
    * flatshading, the flatshade must be applied here because we don't
    * respect the PV when converting to trifan for emit:
    */
if (c->key.contains_flat_varying)
brw_clip_tri_flat_shade(c);
if ((c->key.clip_mode == BRW_CLIP_MODE_NORMAL) ||
(c->key.clip_mode == BRW_CLIP_MODE_KERNEL_CLIP))
do_clip_tri(c);
else
maybe_do_clip_tri(c);
brw_clip_tri_emit_polygon(c);
/* Send an empty message to kill the thread:
*/
brw_clip_kill_thread(c);
}


@ -0,0 +1,528 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include "brw_clip.h"
#include "brw_prim.h"
/* This is performed against the original triangles, so no indirection
 * required... BZZZT!  (Not quite true: see the "inlist indirection"
 * note in brw_emit_unfilled_clip.)
 */
static void compute_tri_direction( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_reg e = c->reg.tmp0;
struct brw_reg f = c->reg.tmp1;
GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
struct brw_reg v0 = byte_offset(c->reg.vertex[0], hpos_offset);
struct brw_reg v1 = byte_offset(c->reg.vertex[1], hpos_offset);
struct brw_reg v2 = byte_offset(c->reg.vertex[2], hpos_offset);
struct brw_reg v0n = get_tmp(c);
struct brw_reg v1n = get_tmp(c);
struct brw_reg v2n = get_tmp(c);
/* Convert to NDC.
* NOTE: We can't modify the original vertex coordinates,
* as it may impact further operations.
* So, we have to keep normalized coordinates in temp registers.
*
* TBD-KC
* Try to optimize unnecessary MOV's.
*/
brw_MOV(p, v0n, v0);
brw_MOV(p, v1n, v1);
brw_MOV(p, v2n, v2);
brw_clip_project_position(c, v0n);
brw_clip_project_position(c, v1n);
brw_clip_project_position(c, v2n);
/* Calculate the vectors of two edges of the triangle:
*/
brw_ADD(p, e, v0n, negate(v2n));
brw_ADD(p, f, v1n, negate(v2n));
/* Take their crossproduct:
*/
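   /* cross(e, f) = e.yzx*f.zxy - e.zxy*f.yzx: the MUL primes the implicit
    * accumulator with the first product, and MAC accumulates the negated
    * second product.
    */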
brw_set_default_access_mode(p, BRW_ALIGN_16);
brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, BRW_SWIZZLE_YZXW),
brw_swizzle(f, BRW_SWIZZLE_ZXYW));
brw_MAC(p, vec4(e), negate(brw_swizzle(e, BRW_SWIZZLE_ZXYW)),
brw_swizzle(f, BRW_SWIZZLE_YZXW));
brw_set_default_access_mode(p, BRW_ALIGN_1);
brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e));
}
static void cull_direction( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
GLuint conditional;
assert (!(c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL &&
c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL));
if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL)
conditional = BRW_CONDITIONAL_GE;
else
conditional = BRW_CONDITIONAL_L;
brw_CMP(p,
vec1(brw_null_reg()),
conditional,
get_element(c->reg.dir, 2),
brw_imm_f(0));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_kill_thread(c);
}
brw_ENDIF(p);
}
static void copy_bfc( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
GLuint conditional;
/* Do we have any colors to copy?
*/
if (!(brw_clip_have_varying(c, VARYING_SLOT_COL0) &&
brw_clip_have_varying(c, VARYING_SLOT_BFC0)) &&
!(brw_clip_have_varying(c, VARYING_SLOT_COL1) &&
brw_clip_have_varying(c, VARYING_SLOT_BFC1)))
return;
/* In some weird degenerate cases we can end up testing the
* direction twice, once for culling and once for bfc copying. Oh
* well, that's what you get for setting weird GL state.
*/
if (c->key.copy_bfc_ccw)
conditional = BRW_CONDITIONAL_GE;
else
conditional = BRW_CONDITIONAL_L;
brw_CMP(p,
vec1(brw_null_reg()),
conditional,
get_element(c->reg.dir, 2),
brw_imm_f(0));
brw_IF(p, BRW_EXECUTE_1);
{
GLuint i;
for (i = 0; i < 3; i++) {
if (brw_clip_have_varying(c, VARYING_SLOT_COL0) &&
brw_clip_have_varying(c, VARYING_SLOT_BFC0))
brw_MOV(p,
byte_offset(c->reg.vertex[i],
brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_COL0)),
byte_offset(c->reg.vertex[i],
brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_BFC0)));
if (brw_clip_have_varying(c, VARYING_SLOT_COL1) &&
brw_clip_have_varying(c, VARYING_SLOT_BFC1))
brw_MOV(p,
byte_offset(c->reg.vertex[i],
brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_COL1)),
byte_offset(c->reg.vertex[i],
brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_BFC1)));
}
}
brw_ENDIF(p);
}
/*
GLfloat iz = 1.0 / dir.z;
GLfloat ac = dir.x * iz;
GLfloat bc = dir.y * iz;
offset = ctx->Polygon.OffsetUnits * DEPTH_SCALE;
offset += MAX2( abs(ac), abs(bc) ) * ctx->Polygon.OffsetFactor;
if (ctx->Polygon.OffsetClamp && isfinite(ctx->Polygon.OffsetClamp)) {
if (ctx->Polygon.OffsetClamp < 0)
offset = MAX2( offset, ctx->Polygon.OffsetClamp );
else
offset = MIN2( offset, ctx->Polygon.OffsetClamp );
}
offset *= MRD;
*/
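/* The MAX2/MIN2 and clamp selections below are realized as a CMP that
 * sets the flag register followed by a predicated SEL; 1/dir.z again
 * comes from the math unit via brw_math_invert.
 */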
static void compute_offset( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_reg off = c->reg.offset;
struct brw_reg dir = c->reg.dir;
brw_math_invert(p, get_element(off, 2), get_element(dir, 2));
brw_MUL(p, vec2(off), vec2(dir), get_element(off, 2));
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_GE,
brw_abs(get_element(off, 0)),
brw_abs(get_element(off, 1)));
brw_SEL(p, vec1(off),
brw_abs(get_element(off, 0)), brw_abs(get_element(off, 1)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_MUL(p, vec1(off), vec1(off), brw_imm_f(c->key.offset_factor));
brw_ADD(p, vec1(off), vec1(off), brw_imm_f(c->key.offset_units));
if (c->key.offset_clamp && isfinite(c->key.offset_clamp)) {
brw_CMP(p,
vec1(brw_null_reg()),
c->key.offset_clamp < 0 ? BRW_CONDITIONAL_GE : BRW_CONDITIONAL_L,
vec1(off),
brw_imm_f(c->key.offset_clamp));
brw_SEL(p, vec1(off), vec1(off), brw_imm_f(c->key.offset_clamp));
}
}
static void merge_edgeflags( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_reg tmp0 = get_element_ud(c->reg.tmp0, 0);
brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_EQ,
tmp0,
brw_imm_ud(_3DPRIM_POLYGON));
/* Get away with using reg.vertex because we know that this is not
* a _3DPRIM_TRISTRIP_REVERSE:
*/
brw_IF(p, BRW_EXECUTE_1);
{
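      /* R0.2 bits 8 and 9 flag whether this triangle carries the first
       * and last edge of the decomposed polygon; when a bit is clear the
       * corresponding edgeflag is zeroed, so unfilled mode draws only the
       * polygon's outline.
       */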
brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<8));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_EQ);
brw_MOV(p, byte_offset(c->reg.vertex[0],
brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_EDGE)),
brw_imm_f(0));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<9));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_EQ);
brw_MOV(p, byte_offset(c->reg.vertex[2],
brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_EDGE)),
brw_imm_f(0));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
}
brw_ENDIF(p);
}
static void apply_one_offset( struct brw_clip_compile *c,
struct brw_indirect vert )
{
struct brw_codegen *p = &c->func;
GLuint ndc_offset = brw_varying_to_offset(&c->vue_map,
BRW_VARYING_SLOT_NDC);
struct brw_reg z = deref_1f(vert, ndc_offset +
2 * type_sz(BRW_REGISTER_TYPE_F));
brw_ADD(p, z, z, vec1(c->reg.offset));
}
/***********************************************************************
* Output clipped polygon as an unfilled primitive:
*/
static void emit_lines(struct brw_clip_compile *c,
bool do_offset)
{
struct brw_codegen *p = &c->func;
struct brw_indirect v0 = brw_indirect(0, 0);
struct brw_indirect v1 = brw_indirect(1, 0);
struct brw_indirect v0ptr = brw_indirect(2, 0);
struct brw_indirect v1ptr = brw_indirect(3, 0);
/* Need a separate loop for offset:
*/
if (do_offset) {
brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
brw_DO(p, BRW_EXECUTE_1);
{
brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
apply_one_offset(c, v0);
brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_G);
}
brw_WHILE(p);
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
}
/* v1ptr = &inlist[nr_verts]
* *v1ptr = v0
*/
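   /* List entries are 2-byte pointers, so adding nr_verts twice yields
    * &inlist[nr_verts]; copying inlist[0] there closes the line loop.
    */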
brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v0ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW));
brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v1ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW));
brw_MOV(p, deref_1uw(v1ptr, 0), deref_1uw(v0ptr, 0));
brw_DO(p, BRW_EXECUTE_1);
{
brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
brw_MOV(p, get_addr_reg(v1), deref_1uw(v0ptr, 2));
brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
/* draw edge if edgeflag != 0 */
brw_CMP(p,
vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,
deref_1f(v0, brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_EDGE)),
brw_imm_f(0));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
(_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_START);
brw_clip_emit_vue(c, v1, BRW_URB_WRITE_ALLOCATE_COMPLETE,
(_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END);
}
brw_ENDIF(p);
brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
}
brw_WHILE(p);
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
}
static void emit_points(struct brw_clip_compile *c,
bool do_offset )
{
struct brw_codegen *p = &c->func;
struct brw_indirect v0 = brw_indirect(0, 0);
struct brw_indirect v0ptr = brw_indirect(2, 0);
brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
brw_DO(p, BRW_EXECUTE_1);
{
brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
/* draw if edgeflag != 0
*/
brw_CMP(p,
vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,
deref_1f(v0, brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_EDGE)),
brw_imm_f(0));
brw_IF(p, BRW_EXECUTE_1);
{
if (do_offset)
apply_one_offset(c, v0);
brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
(_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_START | URB_WRITE_PRIM_END);
}
brw_ENDIF(p);
brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
}
brw_WHILE(p);
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
}
static void emit_primitives( struct brw_clip_compile *c,
GLuint mode,
bool do_offset )
{
switch (mode) {
case BRW_CLIP_FILL_MODE_FILL:
brw_clip_tri_emit_polygon(c);
break;
case BRW_CLIP_FILL_MODE_LINE:
emit_lines(c, do_offset);
break;
case BRW_CLIP_FILL_MODE_POINT:
emit_points(c, do_offset);
break;
case BRW_CLIP_FILL_MODE_CULL:
unreachable("not reached");
}
}
static void emit_unfilled_primitives( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
/* Direction culling has already been done.
*/
if (c->key.fill_ccw != c->key.fill_cw &&
c->key.fill_ccw != BRW_CLIP_FILL_MODE_CULL &&
c->key.fill_cw != BRW_CLIP_FILL_MODE_CULL)
{
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_GE,
get_element(c->reg.dir, 2),
brw_imm_f(0));
brw_IF(p, BRW_EXECUTE_1);
{
emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
}
brw_ELSE(p);
{
emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
}
brw_ENDIF(p);
}
else if (c->key.fill_cw != BRW_CLIP_FILL_MODE_CULL) {
emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
}
else if (c->key.fill_ccw != BRW_CLIP_FILL_MODE_CULL) {
emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
}
}
static void check_nr_verts( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.nr_verts, brw_imm_d(3));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_kill_thread(c);
}
brw_ENDIF(p);
}
void brw_emit_unfilled_clip( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
c->need_direction = ((c->key.offset_ccw || c->key.offset_cw) ||
(c->key.fill_ccw != c->key.fill_cw) ||
c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL ||
c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL ||
c->key.copy_bfc_cw ||
c->key.copy_bfc_ccw);
brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
brw_clip_tri_init_vertices(c);
brw_clip_init_ff_sync(c);
assert(brw_clip_have_varying(c, VARYING_SLOT_EDGE));
if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL &&
c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL) {
brw_clip_kill_thread(c);
return;
}
merge_edgeflags(c);
/* Need to use the inlist indirection here:
*/
if (c->need_direction)
compute_tri_direction(c);
if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL ||
c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL)
cull_direction(c);
if (c->key.offset_ccw ||
c->key.offset_cw)
compute_offset(c);
if (c->key.copy_bfc_ccw ||
c->key.copy_bfc_cw)
copy_bfc(c);
/* Need to do this whether we clip or not:
*/
if (c->key.contains_flat_varying)
brw_clip_tri_flat_shade(c);
brw_clip_init_clipmask(c);
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_init_planes(c);
brw_clip_tri(c);
check_nr_verts(c);
}
brw_ENDIF(p);
emit_unfilled_primitives(c);
brw_clip_kill_thread(c);
}


@ -0,0 +1,464 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include "brw_clip.h"
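/* Temporaries are handed out stack-like starting at first_tmp;
 * release_tmp only reclaims the most recently allocated register.
 */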
struct brw_reg get_tmp( struct brw_clip_compile *c )
{
struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0);
if (++c->last_tmp > c->prog_data.total_grf)
c->prog_data.total_grf = c->last_tmp;
return tmp;
}
static void release_tmp( struct brw_clip_compile *c, struct brw_reg tmp )
{
if (tmp.nr == c->last_tmp-1)
c->last_tmp--;
}
static struct brw_reg make_plane_ud(GLuint x, GLuint y, GLuint z, GLuint w)
{
return brw_imm_ud((w<<24) | (z<<16) | (y<<8) | x);
}
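/* The fixed planes are stored one per dword as four signed bytes
 * (0xff == -1), so e.g. make_plane_ud(0, 0, 0xff, 1) encodes the plane
 * -z + w >= 0.
 */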
void brw_clip_init_planes( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
if (!c->key.nr_userclip) {
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 0), make_plane_ud( 0, 0, 0xff, 1));
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 1), make_plane_ud( 0, 0, 1, 1));
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 2), make_plane_ud( 0, 0xff, 0, 1));
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 3), make_plane_ud( 0, 1, 0, 1));
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 4), make_plane_ud(0xff, 0, 0, 1));
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 5), make_plane_ud( 1, 0, 0, 1));
}
}
#define W 3
/* Project 'pos' to screen space (or back again), overwrite with results:
*/
void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos )
{
struct brw_codegen *p = &c->func;
/* calc rhw
*/
brw_math_invert(p, get_element(pos, W), get_element(pos, W));
/* value.xyz *= value.rhw
*/
brw_set_default_access_mode(p, BRW_ALIGN_16);
brw_MUL(p, brw_writemask(pos, WRITEMASK_XYZ), pos,
brw_swizzle(pos, BRW_SWIZZLE_WWWW));
brw_set_default_access_mode(p, BRW_ALIGN_1);
}
static void brw_clip_project_vertex( struct brw_clip_compile *c,
struct brw_indirect vert_addr )
{
struct brw_codegen *p = &c->func;
struct brw_reg tmp = get_tmp(c);
GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
GLuint ndc_offset = brw_varying_to_offset(&c->vue_map,
BRW_VARYING_SLOT_NDC);
/* Fixup position. Extract from the original vertex and re-project
* to screen space:
*/
brw_MOV(p, tmp, deref_4f(vert_addr, hpos_offset));
brw_clip_project_position(c, tmp);
brw_MOV(p, deref_4f(vert_addr, ndc_offset), tmp);
release_tmp(c, tmp);
}
/* Interpolate between two vertices and put the result into a0.0.
* Increment a0.0 accordingly.
*
* Beware that dest_ptr can be equal to v0_ptr!
*/
void brw_clip_interp_vertex( struct brw_clip_compile *c,
struct brw_indirect dest_ptr,
struct brw_indirect v0_ptr, /* from */
struct brw_indirect v1_ptr, /* to */
struct brw_reg t0,
bool force_edgeflag)
{
struct brw_codegen *p = &c->func;
struct brw_reg t_nopersp, v0_ndc_copy;
GLuint slot;
/* Just copy the vertex header:
*/
   /*
    * After the CLIP stage, only the first 256 bits of the VUE are read
    * back on Ironlake, so we needn't change the rest.
    */
brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1);
/* First handle the 3D and NDC interpolation, in case we
* need noperspective interpolation. Doing it early has no
* performance impact in any case.
*/
/* Take a copy of the v0 NDC coordinates, in case dest == v0. */
if (c->key.contains_noperspective_varying) {
GLuint offset = brw_varying_to_offset(&c->vue_map,
BRW_VARYING_SLOT_NDC);
v0_ndc_copy = get_tmp(c);
brw_MOV(p, v0_ndc_copy, deref_4f(v0_ptr, offset));
}
/* Compute the new 3D position
*
* dest_hpos = v0_hpos * (1 - t0) + v1_hpos * t0
*/
{
GLuint delta = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
struct brw_reg tmp = get_tmp(c);
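      /* The MUL's result goes to the null register but implicitly primes
       * the accumulator with v1*t0; MAC then yields v1*t0 - v0*t0, and
       * the ADD produces v0 + t0*(v1 - v0).  The same pattern recurs in
       * the per-attribute loop below.
       */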
brw_MUL(p, vec4(brw_null_reg()), deref_4f(v1_ptr, delta), t0);
brw_MAC(p, tmp, negate(deref_4f(v0_ptr, delta)), t0);
brw_ADD(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta), tmp);
release_tmp(c, tmp);
}
/* Recreate the projected (NDC) coordinate in the new vertex header */
brw_clip_project_vertex(c, dest_ptr);
/* If we have noperspective attributes,
* we need to compute the screen-space t
*/
if (c->key.contains_noperspective_varying) {
GLuint delta = brw_varying_to_offset(&c->vue_map,
BRW_VARYING_SLOT_NDC);
struct brw_reg tmp = get_tmp(c);
t_nopersp = get_tmp(c);
/* t_nopersp = vec4(v1.xy, dest.xy) */
brw_MOV(p, t_nopersp, deref_4f(v1_ptr, delta));
brw_MOV(p, tmp, deref_4f(dest_ptr, delta));
brw_set_default_access_mode(p, BRW_ALIGN_16);
brw_MOV(p,
brw_writemask(t_nopersp, WRITEMASK_ZW),
brw_swizzle(tmp, BRW_SWIZZLE_XYXY));
/* t_nopersp = vec4(v1.xy, dest.xy) - v0.xyxy */
brw_ADD(p, t_nopersp, t_nopersp,
negate(brw_swizzle(v0_ndc_copy, BRW_SWIZZLE_XYXY)));
/* Add the absolute values of the X and Y deltas so that if
* the points aren't in the same place on the screen we get
* nonzero values to divide.
*
* After that, we have vert1 - vert0 in t_nopersp.x and
* vertnew - vert0 in t_nopersp.y
*
* t_nopersp = vec2(|v1.x -v0.x| + |v1.y -v0.y|,
* |dest.x-v0.x| + |dest.y-v0.y|)
*/
brw_ADD(p,
brw_writemask(t_nopersp, WRITEMASK_XY),
brw_abs(brw_swizzle(t_nopersp, BRW_SWIZZLE_XZXZ)),
brw_abs(brw_swizzle(t_nopersp, BRW_SWIZZLE_YWYW)));
brw_set_default_access_mode(p, BRW_ALIGN_1);
/* If the points are in the same place, just substitute a
* value to avoid divide-by-zero
*/
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ,
vec1(t_nopersp),
brw_imm_f(0));
brw_IF(p, BRW_EXECUTE_1);
brw_MOV(p, t_nopersp, brw_imm_vf4(brw_float_to_vf(1.0),
brw_float_to_vf(0.0),
brw_float_to_vf(0.0),
brw_float_to_vf(0.0)));
brw_ENDIF(p);
/* Now compute t_nopersp = t_nopersp.y/t_nopersp.x and broadcast it. */
brw_math_invert(p, get_element(t_nopersp, 0), get_element(t_nopersp, 0));
brw_MUL(p, vec1(t_nopersp), vec1(t_nopersp),
vec1(suboffset(t_nopersp, 1)));
brw_set_default_access_mode(p, BRW_ALIGN_16);
brw_MOV(p, t_nopersp, brw_swizzle(t_nopersp, BRW_SWIZZLE_XXXX));
brw_set_default_access_mode(p, BRW_ALIGN_1);
release_tmp(c, tmp);
release_tmp(c, v0_ndc_copy);
}
/* Now we can iterate over each attribute
* (could be done in pairs?)
*/
for (slot = 0; slot < c->vue_map.num_slots; slot++) {
int varying = c->vue_map.slot_to_varying[slot];
GLuint delta = brw_vue_slot_to_offset(slot);
/* HPOS, NDC already handled above */
if (varying == VARYING_SLOT_POS || varying == BRW_VARYING_SLOT_NDC)
continue;
if (varying == VARYING_SLOT_EDGE) {
if (force_edgeflag)
brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1));
else
brw_MOV(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta));
} else if (varying == VARYING_SLOT_PSIZ) {
/* PSIZ doesn't need interpolation because it isn't used by the
* fragment shader.
*/
} else if (varying < VARYING_SLOT_MAX) {
/* This is a true vertex result (and not a special value for the VUE
* header), so interpolate:
*
* New = attr0 + t*attr1 - t*attr0
*
* Unless the attribute is flat shaded -- in which case just copy
* from one of the sources (doesn't matter which; already copied from pv)
*/
GLuint interp = c->key.interp_mode[slot];
if (interp != INTERP_MODE_FLAT) {
struct brw_reg tmp = get_tmp(c);
struct brw_reg t =
interp == INTERP_MODE_NOPERSPECTIVE ? t_nopersp : t0;
brw_MUL(p,
vec4(brw_null_reg()),
deref_4f(v1_ptr, delta),
t);
brw_MAC(p,
tmp,
negate(deref_4f(v0_ptr, delta)),
t);
brw_ADD(p,
deref_4f(dest_ptr, delta),
deref_4f(v0_ptr, delta),
tmp);
release_tmp(c, tmp);
}
else {
brw_MOV(p,
deref_4f(dest_ptr, delta),
deref_4f(v0_ptr, delta));
}
}
}
if (c->vue_map.num_slots % 2) {
GLuint delta = brw_vue_slot_to_offset(c->vue_map.num_slots);
brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0));
}
if (c->key.contains_noperspective_varying)
release_tmp(c, t_nopersp);
}
void brw_clip_emit_vue(struct brw_clip_compile *c,
struct brw_indirect vert,
enum brw_urb_write_flags flags,
GLuint header)
{
struct brw_codegen *p = &c->func;
bool allocate = flags & BRW_URB_WRITE_ALLOCATE;
brw_clip_ff_sync(c);
/* Any URB entry that is allocated must subsequently be used or discarded,
* so it doesn't make sense to mark EOT and ALLOCATE at the same time.
*/
assert(!(allocate && (flags & BRW_URB_WRITE_EOT)));
/* Copy the vertex from vertn into m1..mN+1:
*/
brw_copy_from_indirect(p, brw_message_reg(1), vert, c->nr_regs);
/* Overwrite PrimType and PrimStart in the message header, for
* each vertex in turn:
*/
brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));
/* Send each vertex as a separate write to the urb. This
* is different to the concept in brw_sf_emit.c, where
* subsequent writes are used to build up a single urb
* entry. Each of these writes instantiates a separate
* urb entry - (I think... what about 'allocate'?)
*/
brw_urb_WRITE(p,
allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
0,
c->reg.R0,
flags,
c->nr_regs + 1, /* msg length */
allocate ? 1 : 0, /* response_length */
0, /* urb offset */
BRW_URB_SWIZZLE_NONE);
}
void brw_clip_kill_thread(struct brw_clip_compile *c)
{
struct brw_codegen *p = &c->func;
brw_clip_ff_sync(c);
/* Send an empty message to kill the thread and release any
* allocated urb entry:
*/
brw_urb_WRITE(p,
retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
0,
c->reg.R0,
BRW_URB_WRITE_UNUSED | BRW_URB_WRITE_EOT_COMPLETE,
1, /* msg len */
0, /* response len */
0,
BRW_URB_SWIZZLE_NONE);
}
struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c )
{
return brw_address(c->reg.fixed_planes);
}
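/* User clip planes are vec4 floats (16 bytes each); the fixed planes are
 * packed four signed bytes apiece by make_plane_ud, hence the 4-byte
 * stride.
 */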
struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c )
{
if (c->key.nr_userclip) {
return brw_imm_uw(16);
}
else {
return brw_imm_uw(4);
}
}
/* Distribute flatshaded attributes from provoking vertex prior to
* clipping.
*/
void brw_clip_copy_flatshaded_attributes( struct brw_clip_compile *c,
GLuint to, GLuint from )
{
struct brw_codegen *p = &c->func;
for (int i = 0; i < c->vue_map.num_slots; i++) {
if (c->key.interp_mode[i] == INTERP_MODE_FLAT) {
brw_MOV(p,
byte_offset(c->reg.vertex[to], brw_vue_slot_to_offset(i)),
byte_offset(c->reg.vertex[from], brw_vue_slot_to_offset(i)));
}
}
}
void brw_clip_init_clipmask( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_reg incoming = get_element_ud(c->reg.R0, 2);
/* Shift so that lowest outcode bit is rightmost:
*/
brw_SHR(p, c->reg.planemask, incoming, brw_imm_ud(26));
if (c->key.nr_userclip) {
struct brw_reg tmp = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UD);
/* Rearrange userclip outcodes so that they come directly after
* the fixed plane bits.
*/
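      /* G45 and Ironlake supply eight user-clip outcode bits starting at
       * bit 14; original Gfx4 supplies only six.
       */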
if (p->devinfo->ver == 5 || p->devinfo->verx10 == 45)
brw_AND(p, tmp, incoming, brw_imm_ud(0xff<<14));
else
brw_AND(p, tmp, incoming, brw_imm_ud(0x3f<<14));
brw_SHR(p, tmp, tmp, brw_imm_ud(8));
brw_OR(p, c->reg.planemask, c->reg.planemask, tmp);
release_tmp(c, tmp);
}
}
void brw_clip_ff_sync(struct brw_clip_compile *c)
{
struct brw_codegen *p = &c->func;
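   /* Ironlake requires a single FF_SYNC message before the thread's first
    * URB write; bit 0 of reg.ff_sync records whether it has been sent.
    */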
if (p->devinfo->ver == 5) {
brw_AND(p, brw_null_reg(), c->reg.ff_sync, brw_imm_ud(0x1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
brw_IF(p, BRW_EXECUTE_1);
{
brw_OR(p, c->reg.ff_sync, c->reg.ff_sync, brw_imm_ud(0x1));
brw_ff_sync(p,
c->reg.R0,
0,
c->reg.R0,
1, /* allocate */
1, /* response length */
0 /* eot */);
}
brw_ENDIF(p);
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
}
void brw_clip_init_ff_sync(struct brw_clip_compile *c)
{
struct brw_codegen *p = &c->func;
if (p->devinfo->ver == 5) {
brw_MOV(p, c->reg.ff_sync, brw_imm_ud(0));
}
}


@ -0,0 +1,97 @@
/*
* Copyright © 2006 - 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_clip.h"
#include "brw_disasm.h"
#include "dev/intel_debug.h"
const unsigned *
brw_compile_clip(const struct brw_compiler *compiler,
void *mem_ctx,
const struct brw_clip_prog_key *key,
struct brw_clip_prog_data *prog_data,
struct intel_vue_map *vue_map,
unsigned *final_assembly_size)
{
struct brw_clip_compile c;
memset(&c, 0, sizeof(c));
/* Begin the compilation:
*/
brw_init_codegen(&compiler->isa, &c.func, mem_ctx);
c.func.single_program_flow = 1;
c.key = *key;
c.vue_map = *vue_map;
/* nr_regs is the number of registers filled by reading data from the VUE.
* This program accesses the entire VUE, so nr_regs needs to be the size of
* the VUE (measured in pairs, since two slots are stored in each
* register).
*/
c.nr_regs = (c.vue_map.num_slots + 1)/2;
c.prog_data.clip_mode = c.key.clip_mode; /* XXX */
/* For some reason the thread is spawned with only 4 channels
* unmasked.
*/
brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE);
/* Would ideally have the option of producing a program which could
* do all three:
*/
switch (key->primitive) {
case MESA_PRIM_TRIANGLES:
if (key->do_unfilled)
brw_emit_unfilled_clip( &c );
else
brw_emit_tri_clip( &c );
break;
case MESA_PRIM_LINES:
brw_emit_line_clip( &c );
break;
case MESA_PRIM_POINTS:
brw_emit_point_clip( &c );
break;
default:
unreachable("not reached");
}
brw_compact_instructions(&c.func, 0, NULL);
*prog_data = c.prog_data;
const unsigned *program = brw_get_program(&c.func, final_assembly_size);
if (INTEL_DEBUG(DEBUG_CLIP)) {
fprintf(stderr, "clip:\n");
brw_disassemble_with_labels(&compiler->isa,
program, 0, *final_assembly_size, stderr);
fprintf(stderr, "\n");
}
return program;
}


@ -0,0 +1,662 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include "brw_compiler.h"
#include "brw_disasm.h"
#include "brw_eu.h"
#include "brw_prim.h"
#include "dev/intel_debug.h"
#define MAX_GS_VERTS (4)
struct brw_ff_gs_compile {
struct brw_codegen func;
struct brw_ff_gs_prog_key key;
struct brw_ff_gs_prog_data *prog_data;
struct {
struct brw_reg R0;
/**
* Register holding streamed vertex buffer pointers -- see the Sandy
* Bridge PRM, volume 2 part 1, section 4.4.2 (GS Thread Payload
* [DevSNB]). These pointers are delivered in GRF 1.
*/
struct brw_reg SVBI;
struct brw_reg vertex[MAX_GS_VERTS];
struct brw_reg header;
struct brw_reg temp;
/**
* Register holding destination indices for streamed buffer writes.
* Only used for SOL programs.
*/
struct brw_reg destination_indices;
} reg;
/* Number of registers used to store vertex data */
GLuint nr_regs;
struct intel_vue_map vue_map;
};
/**
* Allocate registers for GS.
*
* If sol_program is true, then:
*
* - The thread will be spawned with the "SVBI Payload Enable" bit set, so GRF
* 1 needs to be set aside to hold the streamed vertex buffer indices.
*
* - The thread will need to use the destination_indices register.
*/
static void brw_ff_gs_alloc_regs(struct brw_ff_gs_compile *c,
GLuint nr_verts,
bool sol_program)
{
GLuint i = 0,j;
/* Register usage is static, precompute here:
*/
c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
/* Streamed vertex buffer indices */
if (sol_program)
c->reg.SVBI = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
/* Payload vertices plus space for more generated vertices:
*/
for (j = 0; j < nr_verts; j++) {
c->reg.vertex[j] = brw_vec4_grf(i, 0);
i += c->nr_regs;
}
c->reg.header = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
c->reg.temp = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
if (sol_program) {
c->reg.destination_indices =
retype(brw_vec4_grf(i++, 0), BRW_REGISTER_TYPE_UD);
}
c->prog_data->urb_read_length = c->nr_regs;
c->prog_data->total_grf = i;
}
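/* For example (illustrative numbers, derived from the code above): a
 * non-SOL program with nr_verts = 4 and nr_regs = 2 puts R0 in g0, the
 * four vertices in g1/g3/g5/g7 (two GRFs each), the header in g9 and
 * temp in g10, so total_grf = 11 and urb_read_length = 2.
 */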
/**
* Set up the initial value of c->reg.header register based on c->reg.R0.
*
* The following information is passed to the GS thread in R0, and needs to be
* included in the first URB_WRITE or FF_SYNC message sent by the GS:
*
* - DWORD 0 [31:0] handle info (Gen4 only)
* - DWORD 5 [7:0] FFTID
* - DWORD 6 [31:0] Debug info
* - DWORD 7 [31:0] Debug info
*
* This function sets up the above data by copying the contents of
* R0 to the header register.
*/
static void brw_ff_gs_initialize_header(struct brw_ff_gs_compile *c)
{
struct brw_codegen *p = &c->func;
brw_MOV(p, c->reg.header, c->reg.R0);
}
/**
* Overwrite DWORD 2 of c->reg.header with the given immediate unsigned value.
*
* In URB_WRITE messages, DWORD 2 contains the fields PrimType, PrimStart,
* PrimEnd, Increment CL_INVOCATIONS, and SONumPrimsWritten, many of which we
* need to be able to update on a per-vertex basis.
*/
static void brw_ff_gs_overwrite_header_dw2(struct brw_ff_gs_compile *c,
unsigned dw2)
{
struct brw_codegen *p = &c->func;
brw_MOV(p, get_element_ud(c->reg.header, 2), brw_imm_ud(dw2));
}
/**
* Overwrite DWORD 2 of c->reg.header with the primitive type from c->reg.R0.
*
* When the thread is spawned, GRF 0 contains the primitive type in bits 4:0
* of DWORD 2. URB_WRITE messages need the primitive type in bits 6:2 of
* DWORD 2. So this function extracts the primitive type field, bitshifts it
* appropriately, and stores it in c->reg.header.
*/
static void brw_ff_gs_overwrite_header_dw2_from_r0(struct brw_ff_gs_compile *c)
{
struct brw_codegen *p = &c->func;
brw_AND(p, get_element_ud(c->reg.header, 2), get_element_ud(c->reg.R0, 2),
brw_imm_ud(0x1f));
brw_SHL(p, get_element_ud(c->reg.header, 2),
get_element_ud(c->reg.header, 2), brw_imm_ud(2));
}
/**
* Apply an additive offset to DWORD 2 of c->reg.header.
*
* This is used to set/unset the "PrimStart" and "PrimEnd" flags appropriately
* for each vertex.
*/
static void brw_ff_gs_offset_header_dw2(struct brw_ff_gs_compile *c,
int offset)
{
struct brw_codegen *p = &c->func;
brw_ADD(p, get_element_d(c->reg.header, 2), get_element_d(c->reg.header, 2),
brw_imm_d(offset));
}
/**
* Emit a vertex using the URB_WRITE message. Use the contents of
* c->reg.header for the message header, and the registers starting at \c vert
* for the vertex data.
*
* If \c last is true, then this is the last vertex, so no further URB space
* should be allocated, and this message should end the thread.
*
* If \c last is false, then a new URB entry will be allocated, and its handle
* will be stored in DWORD 0 of c->reg.header for use in the next URB_WRITE
* message.
*/
static void brw_ff_gs_emit_vue(struct brw_ff_gs_compile *c,
struct brw_reg vert,
bool last)
{
struct brw_codegen *p = &c->func;
int write_offset = 0;
bool complete = false;
do {
/* We can't write more than 14 registers at a time to the URB */
int write_len = MIN2(c->nr_regs - write_offset, 14);
if (write_len == c->nr_regs - write_offset)
complete = true;
/* Copy the vertex from vert[n] into m1..mN+1:
*/
brw_copy8(p, brw_message_reg(1), offset(vert, write_offset), write_len);
/* Send the vertex data to the URB. If this is the last write for this
* vertex, then we mark it as complete, and either end the thread or
* allocate another vertex URB entry (depending whether this is the last
* vertex).
*/
enum brw_urb_write_flags flags;
if (!complete)
flags = BRW_URB_WRITE_NO_FLAGS;
else if (last)
flags = BRW_URB_WRITE_EOT_COMPLETE;
else
flags = BRW_URB_WRITE_ALLOCATE_COMPLETE;
brw_urb_WRITE(p,
(flags & BRW_URB_WRITE_ALLOCATE) ? c->reg.temp
: retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
0,
c->reg.header,
flags,
write_len + 1, /* msg length */
(flags & BRW_URB_WRITE_ALLOCATE) ? 1
: 0, /* response length */
write_offset, /* urb offset */
BRW_URB_SWIZZLE_NONE);
write_offset += write_len;
} while (!complete);
if (!last) {
brw_MOV(p, get_element_ud(c->reg.header, 0),
get_element_ud(c->reg.temp, 0));
}
}
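/* For example, if nr_regs were 20, the loop above would emit a
 * 14-register write at URB offset 0 with no flags, then a 6-register
 * write at offset 14 marked complete, which either ends the thread or
 * allocates the next URB entry depending on \c last.
 */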
/**
* Send an FF_SYNC message to ensure that all previously spawned GS threads
* have finished sending primitives down the pipeline, and to allocate a URB
* entry for the first output vertex. Only needed on Ironlake+.
*
* This function modifies c->reg.header: in DWORD 1, it stores num_prim (which
* is needed by the FF_SYNC message), and in DWORD 0, it stores the handle to
* the allocated URB entry (which will be needed by the URB_WRITE message that
* follows).
*/
static void brw_ff_gs_ff_sync(struct brw_ff_gs_compile *c, int num_prim)
{
struct brw_codegen *p = &c->func;
brw_MOV(p, get_element_ud(c->reg.header, 1), brw_imm_ud(num_prim));
brw_ff_sync(p,
c->reg.temp,
0,
c->reg.header,
1, /* allocate */
1, /* response length */
0 /* eot */);
brw_MOV(p, get_element_ud(c->reg.header, 0),
get_element_ud(c->reg.temp, 0));
}
static void
brw_ff_gs_quads(struct brw_ff_gs_compile *c,
const struct brw_ff_gs_prog_key *key)
{
brw_ff_gs_alloc_regs(c, 4, false);
brw_ff_gs_initialize_header(c);
/* Use polygons for correct edgeflag behaviour. Note that vertex 3
* is the PV for quads, but vertex 0 for polygons:
*/
if (c->func.devinfo->ver == 5)
brw_ff_gs_ff_sync(c, 1);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_START));
if (key->pv_first) {
brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
brw_ff_gs_overwrite_header_dw2(
c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END));
brw_ff_gs_emit_vue(c, c->reg.vertex[3], 1);
}
else {
brw_ff_gs_emit_vue(c, c->reg.vertex[3], 0);
brw_ff_gs_overwrite_header_dw2(
c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END));
brw_ff_gs_emit_vue(c, c->reg.vertex[2], 1);
}
}
static void
brw_ff_gs_quad_strip(struct brw_ff_gs_compile *c,
const struct brw_ff_gs_prog_key *key)
{
brw_ff_gs_alloc_regs(c, 4, false);
brw_ff_gs_initialize_header(c);
if (c->func.devinfo->ver == 5)
brw_ff_gs_ff_sync(c, 1);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_START));
if (key->pv_first) {
brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
brw_ff_gs_overwrite_header_dw2(
c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END));
brw_ff_gs_emit_vue(c, c->reg.vertex[3], 1);
}
else {
brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
brw_ff_gs_overwrite_header_dw2(
c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
brw_ff_gs_emit_vue(c, c->reg.vertex[3], 0);
brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END));
brw_ff_gs_emit_vue(c, c->reg.vertex[1], 1);
}
}
static void brw_ff_gs_lines(struct brw_ff_gs_compile *c)
{
brw_ff_gs_alloc_regs(c, 2, false);
brw_ff_gs_initialize_header(c);
if (c->func.devinfo->ver == 5)
brw_ff_gs_ff_sync(c, 1);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_START));
brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END));
brw_ff_gs_emit_vue(c, c->reg.vertex[1], 1);
}
/**
* Generate the geometry shader program used on Gen6 to perform stream output
* (transform feedback).
*/
static void
gfx6_sol_program(struct brw_ff_gs_compile *c, const struct brw_ff_gs_prog_key *key,
unsigned num_verts, bool check_edge_flags)
{
struct brw_codegen *p = &c->func;
brw_inst *inst;
c->prog_data->svbi_postincrement_value = num_verts;
brw_ff_gs_alloc_regs(c, num_verts, true);
brw_ff_gs_initialize_header(c);
if (key->num_transform_feedback_bindings > 0) {
unsigned vertex, binding;
struct brw_reg destination_indices_uw =
vec8(retype(c->reg.destination_indices, BRW_REGISTER_TYPE_UW));
/* Note: since we use the binding table to keep track of buffer offsets
* and stride, the GS doesn't need to keep track of a separate pointer
* into each buffer; it uses a single pointer which increments by 1 for
* each vertex. So we use SVBI0 for this pointer, regardless of whether
* transform feedback is in interleaved or separate attribs mode.
*
* Make sure that the buffers have enough room for all the vertices.
*/
brw_ADD(p, get_element_ud(c->reg.temp, 0),
get_element_ud(c->reg.SVBI, 0), brw_imm_ud(num_verts));
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE,
get_element_ud(c->reg.temp, 0),
get_element_ud(c->reg.SVBI, 4));
brw_IF(p, BRW_EXECUTE_1);
/* Compute the destination indices to write to. Usually we use SVBI[0]
* + (0, 1, 2). However, for odd-numbered triangles in tristrips, the
* vertices come down the pipeline in reversed winding order, so we need
* to flip the order when writing to the transform feedback buffer. To
* ensure that flatshading accuracy is preserved, we need to write them
* in order SVBI[0] + (0, 2, 1) if we're using the first provoking
* vertex convention, and in order SVBI[0] + (1, 0, 2) if we're using
* the last provoking vertex convention.
*
* Note: since brw_imm_v can only be used in instructions in
* packed-word execution mode, and SVBI is a double-word, we need to
* first move the appropriate immediate constant ((0, 1, 2), (0, 2, 1),
* or (1, 0, 2)) to the destination_indices register, and then add SVBI
* using a separate instruction. Also, since the immediate constant is
* expressed as packed words, and we need to load double-words into
* destination_indices, we need to intersperse zeros to fill the upper
* halves of each double-word.
*/
brw_MOV(p, destination_indices_uw,
brw_imm_v(0x00020100)); /* (0, 1, 2) */
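/* The V-type immediate expands to eight 4-bit values, read here as
 * eight words: 0x00020100 becomes (0, 0, 1, 0, 2, 0, 0, 0), i.e. the
 * dwords (0, 1, 2, 0) once the interspersed zeros fill the upper
 * halves.
 */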
if (num_verts == 3) {
/* Get primitive type into temp register. */
brw_AND(p, get_element_ud(c->reg.temp, 0),
get_element_ud(c->reg.R0, 2), brw_imm_ud(0x1f));
/* Test if primitive type is TRISTRIP_REVERSE. We need to do this as
* an 8-wide comparison so that the conditional MOV that follows
* moves all 8 words correctly.
*/
brw_CMP(p, vec8(brw_null_reg()), BRW_CONDITIONAL_EQ,
get_element_ud(c->reg.temp, 0),
brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));
/* If so, then overwrite destination_indices_uw with the appropriate
* reordering.
*/
inst = brw_MOV(p, destination_indices_uw,
brw_imm_v(key->pv_first ? 0x00010200 /* (0, 2, 1) */
: 0x00020001)); /* (1, 0, 2) */
brw_inst_set_pred_control(p->devinfo, inst, BRW_PREDICATE_NORMAL);
}
assert(c->reg.destination_indices.width == BRW_EXECUTE_4);
brw_push_insn_state(p);
brw_set_default_exec_size(p, BRW_EXECUTE_4);
brw_ADD(p, c->reg.destination_indices,
c->reg.destination_indices, get_element_ud(c->reg.SVBI, 0));
brw_pop_insn_state(p);
/* For each vertex, generate code to output each varying using the
* appropriate binding table entry.
*/
for (vertex = 0; vertex < num_verts; ++vertex) {
/* Set up the correct destination index for this vertex */
brw_MOV(p, get_element_ud(c->reg.header, 5),
get_element_ud(c->reg.destination_indices, vertex));
for (binding = 0; binding < key->num_transform_feedback_bindings;
++binding) {
unsigned char varying =
key->transform_feedback_bindings[binding];
unsigned char slot = c->vue_map.varying_to_slot[varying];
/* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
*
* "Prior to End of Thread with a URB_WRITE, the kernel must
* ensure that all writes are complete by sending the final
* write as a committed write."
*/
bool final_write =
binding == key->num_transform_feedback_bindings - 1 &&
vertex == num_verts - 1;
struct brw_reg vertex_slot = c->reg.vertex[vertex];
vertex_slot.nr += slot / 2;
vertex_slot.subnr = (slot % 2) * 16;
/* gl_PointSize is stored in VARYING_SLOT_PSIZ.w. */
vertex_slot.swizzle = varying == VARYING_SLOT_PSIZ
? BRW_SWIZZLE_WWWW : key->transform_feedback_swizzles[binding];
brw_set_default_access_mode(p, BRW_ALIGN_16);
brw_push_insn_state(p);
brw_set_default_exec_size(p, BRW_EXECUTE_4);
brw_MOV(p, stride(c->reg.header, 4, 4, 1),
retype(vertex_slot, BRW_REGISTER_TYPE_UD));
brw_pop_insn_state(p);
brw_set_default_access_mode(p, BRW_ALIGN_1);
brw_svb_write(p,
final_write ? c->reg.temp : brw_null_reg(), /* dest */
1, /* msg_reg_nr */
c->reg.header, /* src0 */
BRW_GFX6_SOL_BINDING_START + binding, /* binding_table_index */
final_write); /* send_commit_msg */
}
}
brw_ENDIF(p);
/* Now, reinitialize the header register from R0 to restore the parts of
* the register that we overwrote while streaming out transform feedback
* data.
*/
brw_ff_gs_initialize_header(c);
/* Finally, wait for the write commit to occur so that we can proceed to
* other things safely.
*
* From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
*
* The write commit does not modify the destination register, but
* merely clears the dependency associated with the destination
* register. Thus, a simple mov instruction using the register as a
* source is sufficient to wait for the write commit to occur.
*/
brw_MOV(p, c->reg.temp, c->reg.temp);
}
brw_ff_gs_ff_sync(c, 1);
brw_ff_gs_overwrite_header_dw2_from_r0(c);
switch (num_verts) {
case 1:
brw_ff_gs_offset_header_dw2(c,
URB_WRITE_PRIM_START | URB_WRITE_PRIM_END);
brw_ff_gs_emit_vue(c, c->reg.vertex[0], true);
break;
case 2:
brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
brw_ff_gs_emit_vue(c, c->reg.vertex[0], false);
brw_ff_gs_offset_header_dw2(c,
URB_WRITE_PRIM_END - URB_WRITE_PRIM_START);
brw_ff_gs_emit_vue(c, c->reg.vertex[1], true);
break;
case 3:
if (check_edge_flags) {
/* Only emit vertices 0 and 1 if this is the first triangle of the
* polygon. Otherwise they are redundant.
*/
brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
get_element_ud(c->reg.R0, 2),
brw_imm_ud(BRW_GS_EDGE_INDICATOR_0));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
}
brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
brw_ff_gs_emit_vue(c, c->reg.vertex[0], false);
brw_ff_gs_offset_header_dw2(c, -URB_WRITE_PRIM_START);
brw_ff_gs_emit_vue(c, c->reg.vertex[1], false);
if (check_edge_flags) {
brw_ENDIF(p);
/* Only emit vertex 2 in PRIM_END mode if this is the last triangle
* of the polygon. Otherwise leave the primitive incomplete because
* there are more polygon vertices coming.
*/
brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
get_element_ud(c->reg.R0, 2),
brw_imm_ud(BRW_GS_EDGE_INDICATOR_1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL);
}
brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_END);
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
brw_ff_gs_emit_vue(c, c->reg.vertex[2], true);
break;
}
}
const unsigned *
brw_compile_ff_gs_prog(struct brw_compiler *compiler,
void *mem_ctx,
const struct brw_ff_gs_prog_key *key,
struct brw_ff_gs_prog_data *prog_data,
struct intel_vue_map *vue_map,
unsigned *final_assembly_size)
{
struct brw_ff_gs_compile c;
const GLuint *program;
memset(&c, 0, sizeof(c));
c.key = *key;
c.vue_map = *vue_map;
c.nr_regs = (c.vue_map.num_slots + 1)/2;
c.prog_data = prog_data;
mem_ctx = ralloc_context(NULL);
/* Begin the compilation:
*/
brw_init_codegen(&compiler->isa, &c.func, mem_ctx);
c.func.single_program_flow = 1;
/* For some reason the thread is spawned with only 4 channels
* unmasked.
*/
brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE);
if (compiler->devinfo->ver >= 6) {
unsigned num_verts;
bool check_edge_flag;
/* On Sandybridge, we use the GS for implementing transform feedback
* (called "Stream Out" in the PRM).
*/
switch (key->primitive) {
case _3DPRIM_POINTLIST:
num_verts = 1;
check_edge_flag = false;
break;
case _3DPRIM_LINELIST:
case _3DPRIM_LINESTRIP:
case _3DPRIM_LINELOOP:
num_verts = 2;
check_edge_flag = false;
break;
case _3DPRIM_TRILIST:
case _3DPRIM_TRIFAN:
case _3DPRIM_TRISTRIP:
case _3DPRIM_RECTLIST:
num_verts = 3;
check_edge_flag = false;
break;
case _3DPRIM_QUADLIST:
case _3DPRIM_QUADSTRIP:
case _3DPRIM_POLYGON:
num_verts = 3;
check_edge_flag = true;
break;
default:
unreachable("Unexpected primitive type in Gen6 SOL program.");
}
gfx6_sol_program(&c, key, num_verts, check_edge_flag);
} else {
/* On Gen4-5, we use the GS to decompose certain types of primitives.
* Note that primitives which don't require a GS program have already
* been weeded out by now.
*/
switch (key->primitive) {
case _3DPRIM_QUADLIST:
brw_ff_gs_quads( &c, key );
break;
case _3DPRIM_QUADSTRIP:
brw_ff_gs_quad_strip( &c, key );
break;
case _3DPRIM_LINELOOP:
brw_ff_gs_lines( &c );
break;
default:
return NULL;
}
}
brw_compact_instructions(&c.func, 0, NULL);
/* get the program
*/
program = brw_get_program(&c.func, final_assembly_size);
if (INTEL_DEBUG(DEBUG_GS)) {
fprintf(stderr, "gs:\n");
brw_disassemble_with_labels(&compiler->isa, c.func.store,
0, *final_assembly_size, stderr);
fprintf(stderr, "\n");
}
return program;
}

View file

@ -0,0 +1,881 @@
/*
* Copyright © 2006 - 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_compiler.h"
#include "brw_disasm.h"
#include "brw_eu.h"
#include "brw_prim.h"
#include "dev/intel_debug.h"
struct brw_sf_compile {
struct brw_codegen func;
struct brw_sf_prog_key key;
struct brw_sf_prog_data prog_data;
struct brw_reg pv;
struct brw_reg det;
struct brw_reg dx0;
struct brw_reg dx2;
struct brw_reg dy0;
struct brw_reg dy2;
/* z and 1/w passed in separately:
*/
struct brw_reg z[3];
struct brw_reg inv_w[3];
/* The vertices:
*/
struct brw_reg vert[3];
/* Temporaries, allocated after last vertex reg.
*/
struct brw_reg inv_det;
struct brw_reg a1_sub_a0;
struct brw_reg a2_sub_a0;
struct brw_reg tmp;
struct brw_reg m1Cx;
struct brw_reg m2Cy;
struct brw_reg m3C0;
GLuint nr_verts;
GLuint nr_attr_regs;
GLuint nr_setup_regs;
int urb_entry_read_offset;
/** The last known value of the f0.0 flag register. */
unsigned flag_value;
struct intel_vue_map vue_map;
};
/**
* Determine the vue slot corresponding to the given half of the given register.
*/
static inline int vert_reg_to_vue_slot(struct brw_sf_compile *c, GLuint reg,
int half)
{
return (reg + c->urb_entry_read_offset) * 2 + half;
}
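/* For example, with urb_entry_read_offset = 1, reg 0 holds VUE slots 2
 * (half 0) and 3 (half 1); get_vue_slot() below inverts this mapping.
 */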
/**
* Determine the varying corresponding to the given half of the given
* register. half=0 means the first half of a register, half=1 means the
* second half.
*/
static inline int vert_reg_to_varying(struct brw_sf_compile *c, GLuint reg,
int half)
{
int vue_slot = vert_reg_to_vue_slot(c, reg, half);
return c->vue_map.slot_to_varying[vue_slot];
}
/**
* Determine the register corresponding to the given vue slot
*/
static struct brw_reg get_vue_slot(struct brw_sf_compile *c,
struct brw_reg vert,
int vue_slot)
{
GLuint off = vue_slot / 2 - c->urb_entry_read_offset;
GLuint sub = vue_slot % 2;
return brw_vec4_grf(vert.nr + off, sub * 4);
}
/**
* Determine the register corresponding to the given varying.
*/
static struct brw_reg get_varying(struct brw_sf_compile *c,
struct brw_reg vert,
GLuint varying)
{
int vue_slot = c->vue_map.varying_to_slot[varying];
assert(vue_slot >= c->urb_entry_read_offset);
return get_vue_slot(c, vert, vue_slot);
}
static bool
have_attr(struct brw_sf_compile *c, GLuint attr)
{
return (c->key.attrs & BITFIELD64_BIT(attr)) ? 1 : 0;
}
/***********************************************************************
* Twoside lighting
*/
static void copy_bfc( struct brw_sf_compile *c,
struct brw_reg vert )
{
struct brw_codegen *p = &c->func;
GLuint i;
for (i = 0; i < 2; i++) {
if (have_attr(c, VARYING_SLOT_COL0+i) &&
have_attr(c, VARYING_SLOT_BFC0+i))
brw_MOV(p,
get_varying(c, vert, VARYING_SLOT_COL0+i),
get_varying(c, vert, VARYING_SLOT_BFC0+i));
}
}
static void do_twoside_color( struct brw_sf_compile *c )
{
struct brw_codegen *p = &c->func;
GLuint backface_conditional = c->key.frontface_ccw ? BRW_CONDITIONAL_G : BRW_CONDITIONAL_L;
/* Already done in clip program:
*/
if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS)
return;
/* If the vertex shader provides backface color, do the selection. The VS
* promises to set up the front color if the backface color is provided, but
* it may contain junk if never written to.
*/
if (!(have_attr(c, VARYING_SLOT_COL0) && have_attr(c, VARYING_SLOT_BFC0)) &&
!(have_attr(c, VARYING_SLOT_COL1) && have_attr(c, VARYING_SLOT_BFC1)))
return;
/* Need to use BRW_EXECUTE_4 and also do a 4-wide compare in order
* to get all channels active inside the IF. In the clipping code
* we run with NoMask, so it's not an option and we can use
* BRW_EXECUTE_1 for all comparisons.
*/
brw_CMP(p, vec4(brw_null_reg()), backface_conditional, c->det, brw_imm_f(0));
brw_IF(p, BRW_EXECUTE_4);
{
switch (c->nr_verts) {
case 3: copy_bfc(c, c->vert[2]); FALLTHROUGH;
case 2: copy_bfc(c, c->vert[1]); FALLTHROUGH;
case 1: copy_bfc(c, c->vert[0]);
}
}
brw_ENDIF(p);
}
/***********************************************************************
* Flat shading
*/
static void copy_flatshaded_attributes(struct brw_sf_compile *c,
struct brw_reg dst,
struct brw_reg src)
{
struct brw_codegen *p = &c->func;
int i;
for (i = 0; i < c->vue_map.num_slots; i++) {
if (c->key.interp_mode[i] == INTERP_MODE_FLAT) {
brw_MOV(p,
get_vue_slot(c, dst, i),
get_vue_slot(c, src, i));
}
}
}
static int count_flatshaded_attributes(struct brw_sf_compile *c)
{
int i;
int count = 0;
for (i = 0; i < c->vue_map.num_slots; i++)
if (c->key.interp_mode[i] == INTERP_MODE_FLAT)
count++;
return count;
}
/* Need to use a computed jump to copy flatshaded attributes as the
* vertices are ordered according to y-coordinate before reaching this
* point, so the PV could be anywhere.
*/
static void do_flatshade_triangle( struct brw_sf_compile *c )
{
struct brw_codegen *p = &c->func;
GLuint nr;
GLuint jmpi = 1;
/* Already done in clip program:
*/
if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS)
return;
if (p->devinfo->ver == 5)
jmpi = 2;
nr = count_flatshaded_attributes(c);
brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr*2+1)));
brw_JMPI(p, c->pv, BRW_PREDICATE_NONE);
copy_flatshaded_attributes(c, c->vert[1], c->vert[0]);
copy_flatshaded_attributes(c, c->vert[2], c->vert[0]);
brw_JMPI(p, brw_imm_d(jmpi*(nr*4+1)), BRW_PREDICATE_NONE);
copy_flatshaded_attributes(c, c->vert[0], c->vert[1]);
copy_flatshaded_attributes(c, c->vert[2], c->vert[1]);
brw_JMPI(p, brw_imm_d(jmpi*nr*2), BRW_PREDICATE_NONE);
copy_flatshaded_attributes(c, c->vert[0], c->vert[2]);
copy_flatshaded_attributes(c, c->vert[1], c->vert[2]);
}
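/* Each per-PV block above is nr*2 MOVs plus a trailing JMPI (the last
 * block omits the jump), so scaling c->pv by (nr*2 + 1) skips exactly
 * zero, one or two blocks: e.g. with nr = 2 and jmpi = 1, pv = 1 jumps
 * over the five instructions of the vert[0] block. The jmpi factor of 2
 * on Gen5 reflects that its jump offsets are counted in 64-bit units
 * rather than whole 128-bit instructions.
 */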
static void do_flatshade_line( struct brw_sf_compile *c )
{
struct brw_codegen *p = &c->func;
GLuint nr;
GLuint jmpi = 1;
/* Already done in clip program:
*/
if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS)
return;
if (p->devinfo->ver == 5)
jmpi = 2;
nr = count_flatshaded_attributes(c);
brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr+1)));
brw_JMPI(p, c->pv, BRW_PREDICATE_NONE);
copy_flatshaded_attributes(c, c->vert[1], c->vert[0]);
brw_JMPI(p, brw_imm_ud(jmpi*nr), BRW_PREDICATE_NONE);
copy_flatshaded_attributes(c, c->vert[0], c->vert[1]);
}
/***********************************************************************
* Triangle setup.
*/
static void alloc_regs( struct brw_sf_compile *c )
{
GLuint reg, i;
/* Values computed by fixed function unit:
*/
c->pv = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_D);
c->det = brw_vec1_grf(1, 2);
c->dx0 = brw_vec1_grf(1, 3);
c->dx2 = brw_vec1_grf(1, 4);
c->dy0 = brw_vec1_grf(1, 5);
c->dy2 = brw_vec1_grf(1, 6);
/* z and 1/w passed in separately:
*/
c->z[0] = brw_vec1_grf(2, 0);
c->inv_w[0] = brw_vec1_grf(2, 1);
c->z[1] = brw_vec1_grf(2, 2);
c->inv_w[1] = brw_vec1_grf(2, 3);
c->z[2] = brw_vec1_grf(2, 4);
c->inv_w[2] = brw_vec1_grf(2, 5);
/* The vertices:
*/
reg = 3;
for (i = 0; i < c->nr_verts; i++) {
c->vert[i] = brw_vec8_grf(reg, 0);
reg += c->nr_attr_regs;
}
/* Temporaries, allocated after last vertex reg.
*/
c->inv_det = brw_vec1_grf(reg, 0); reg++;
c->a1_sub_a0 = brw_vec8_grf(reg, 0); reg++;
c->a2_sub_a0 = brw_vec8_grf(reg, 0); reg++;
c->tmp = brw_vec8_grf(reg, 0); reg++;
/* Note grf allocation:
*/
c->prog_data.total_grf = reg;
/* Outputs of this program - interpolation coefficients for
* rasterization:
*/
c->m1Cx = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 1, 0);
c->m2Cy = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 2, 0);
c->m3C0 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 3, 0);
}
static void copy_z_inv_w( struct brw_sf_compile *c )
{
struct brw_codegen *p = &c->func;
GLuint i;
/* Copy both scalars with a single MOV:
*/
for (i = 0; i < c->nr_verts; i++)
brw_MOV(p, vec2(suboffset(c->vert[i], 2)), vec2(c->z[i]));
}
static void invert_det( struct brw_sf_compile *c)
{
/* Looks like we invert all 8 elements just to get 1/det in
* position 2 !?!
*/
gfx4_math(&c->func,
c->inv_det,
BRW_MATH_FUNCTION_INV,
0,
c->det,
BRW_MATH_PRECISION_FULL);
}
static bool
calculate_masks(struct brw_sf_compile *c,
GLuint reg,
GLushort *pc,
GLushort *pc_persp,
GLushort *pc_linear)
{
bool is_last_attr = (reg == c->nr_setup_regs - 1);
enum glsl_interp_mode interp;
*pc_persp = 0;
*pc_linear = 0;
*pc = 0xf;
interp = c->key.interp_mode[vert_reg_to_vue_slot(c, reg, 0)];
if (interp == INTERP_MODE_SMOOTH) {
*pc_linear = 0xf;
*pc_persp = 0xf;
} else if (interp == INTERP_MODE_NOPERSPECTIVE)
*pc_linear = 0xf;
/* Maybe only process one attribute on the final round:
*/
if (vert_reg_to_varying(c, reg, 1) != BRW_VARYING_SLOT_COUNT) {
*pc |= 0xf0;
interp = c->key.interp_mode[vert_reg_to_vue_slot(c, reg, 1)];
if (interp == INTERP_MODE_SMOOTH) {
*pc_linear |= 0xf0;
*pc_persp |= 0xf0;
} else if (interp == INTERP_MODE_NOPERSPECTIVE)
*pc_linear |= 0xf0;
}
return is_last_attr;
}
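/* For example, a register whose two attribute halves are both
 * INTERP_MODE_SMOOTH yields pc = 0xff and pc_persp = pc_linear = 0xff,
 * while two flat halves yield pc = 0xff with zero persp/linear masks
 * (flat attributes still need the constant term written).
 */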
/* Calculates the predicate control for which channels of a reg
* (containing 2 attrs) to do point sprite coordinate replacement on.
*/
static uint16_t
calculate_point_sprite_mask(struct brw_sf_compile *c, GLuint reg)
{
int varying1, varying2;
uint16_t pc = 0;
varying1 = vert_reg_to_varying(c, reg, 0);
if (varying1 >= VARYING_SLOT_TEX0 && varying1 <= VARYING_SLOT_TEX7) {
if (c->key.point_sprite_coord_replace & (1 << (varying1 - VARYING_SLOT_TEX0)))
pc |= 0x0f;
}
if (varying1 == BRW_VARYING_SLOT_PNTC)
pc |= 0x0f;
varying2 = vert_reg_to_varying(c, reg, 1);
if (varying2 >= VARYING_SLOT_TEX0 && varying2 <= VARYING_SLOT_TEX7) {
if (c->key.point_sprite_coord_replace & (1 << (varying2 -
VARYING_SLOT_TEX0)))
pc |= 0xf0;
}
if (varying2 == BRW_VARYING_SLOT_PNTC)
pc |= 0xf0;
return pc;
}
static void
set_predicate_control_flag_value(struct brw_codegen *p,
struct brw_sf_compile *c,
unsigned value)
{
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
if (value != 0xff) {
if (value != c->flag_value) {
brw_MOV(p, brw_flag_reg(0, 0), brw_imm_uw(value));
c->flag_value = value;
}
brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL);
}
}
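/* A value of 0xff enables all eight channels, so no predication is
 * needed at all; any other mask is loaded into f0.0 only when it
 * differs from the cached c->flag_value, avoiding redundant MOVs
 * between attributes.
 */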
static void brw_emit_tri_setup(struct brw_sf_compile *c, bool allocate)
{
struct brw_codegen *p = &c->func;
GLuint i;
c->flag_value = 0xff;
c->nr_verts = 3;
if (allocate)
alloc_regs(c);
invert_det(c);
copy_z_inv_w(c);
if (c->key.do_twoside_color)
do_twoside_color(c);
if (c->key.contains_flat_varying)
do_flatshade_triangle(c);
for (i = 0; i < c->nr_setup_regs; i++)
{
/* Pair of incoming attributes:
*/
struct brw_reg a0 = offset(c->vert[0], i);
struct brw_reg a1 = offset(c->vert[1], i);
struct brw_reg a2 = offset(c->vert[2], i);
GLushort pc, pc_persp, pc_linear;
bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
if (pc_persp)
{
set_predicate_control_flag_value(p, c, pc_persp);
brw_MUL(p, a0, a0, c->inv_w[0]);
brw_MUL(p, a1, a1, c->inv_w[1]);
brw_MUL(p, a2, a2, c->inv_w[2]);
}
/* Calculate coefficients for interpolated values:
*/
if (pc_linear)
{
set_predicate_control_flag_value(p, c, pc_linear);
brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
brw_ADD(p, c->a2_sub_a0, a2, negate(a0));
/* calculate dA/dx
*/
brw_MUL(p, brw_null_reg(), c->a1_sub_a0, c->dy2);
brw_MAC(p, c->tmp, c->a2_sub_a0, negate(c->dy0));
brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);
/* calculate dA/dy
*/
brw_MUL(p, brw_null_reg(), c->a2_sub_a0, c->dx0);
brw_MAC(p, c->tmp, c->a1_sub_a0, negate(c->dx2));
brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
}
{
set_predicate_control_flag_value(p, c, pc);
/* start point for interpolation
*/
brw_MOV(p, c->m3C0, a0);
/* Copy m0..m3 to URB. m0 is implicitly copied from r0 in
* the send instruction:
*/
brw_urb_WRITE(p,
brw_null_reg(),
0,
brw_vec8_grf(0, 0), /* r0, will be copied to m0 */
last ? BRW_URB_WRITE_EOT_COMPLETE
: BRW_URB_WRITE_NO_FLAGS,
4, /* msg len */
0, /* response len */
i*4, /* offset */
BRW_URB_SWIZZLE_TRANSPOSE); /* XXX: Swizzle control "SF to windower" */
}
}
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
static void brw_emit_line_setup(struct brw_sf_compile *c, bool allocate)
{
struct brw_codegen *p = &c->func;
GLuint i;
c->flag_value = 0xff;
c->nr_verts = 2;
if (allocate)
alloc_regs(c);
invert_det(c);
copy_z_inv_w(c);
if (c->key.contains_flat_varying)
do_flatshade_line(c);
for (i = 0; i < c->nr_setup_regs; i++)
{
/* Pair of incoming attributes:
*/
struct brw_reg a0 = offset(c->vert[0], i);
struct brw_reg a1 = offset(c->vert[1], i);
GLushort pc, pc_persp, pc_linear;
bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
if (pc_persp)
{
set_predicate_control_flag_value(p, c, pc_persp);
brw_MUL(p, a0, a0, c->inv_w[0]);
brw_MUL(p, a1, a1, c->inv_w[1]);
}
/* Calculate coefficients for position, color:
*/
if (pc_linear) {
set_predicate_control_flag_value(p, c, pc_linear);
brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0);
brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);
brw_MUL(p, c->tmp, c->a1_sub_a0, c->dy0);
brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
}
{
set_predicate_control_flag_value(p, c, pc);
/* start point for interpolation
*/
brw_MOV(p, c->m3C0, a0);
/* Copy m0..m3 to URB.
*/
brw_urb_WRITE(p,
brw_null_reg(),
0,
brw_vec8_grf(0, 0),
last ? BRW_URB_WRITE_EOT_COMPLETE
: BRW_URB_WRITE_NO_FLAGS,
4, /* msg len */
0, /* response len */
i*4, /* urb destination offset */
BRW_URB_SWIZZLE_TRANSPOSE);
}
}
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
static void brw_emit_point_sprite_setup(struct brw_sf_compile *c, bool allocate)
{
struct brw_codegen *p = &c->func;
GLuint i;
c->flag_value = 0xff;
c->nr_verts = 1;
if (allocate)
alloc_regs(c);
copy_z_inv_w(c);
for (i = 0; i < c->nr_setup_regs; i++)
{
struct brw_reg a0 = offset(c->vert[0], i);
GLushort pc, pc_persp, pc_linear, pc_coord_replace;
bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
pc_coord_replace = calculate_point_sprite_mask(c, i);
pc_persp &= ~pc_coord_replace;
if (pc_persp) {
set_predicate_control_flag_value(p, c, pc_persp);
brw_MUL(p, a0, a0, c->inv_w[0]);
}
/* Point sprite coordinate replacement: A texcoord with this
* enabled gets replaced with the value (x, y, 0, 1) where x and
* y vary from 0 to 1 across the horizontal and vertical of the
* point.
*/
if (pc_coord_replace) {
set_predicate_control_flag_value(p, c, pc_coord_replace);
/* Calculate 1.0/PointWidth */
gfx4_math(&c->func,
c->tmp,
BRW_MATH_FUNCTION_INV,
0,
c->dx0,
BRW_MATH_PRECISION_FULL);
brw_set_default_access_mode(p, BRW_ALIGN_16);
/* dA/dx, dA/dy */
brw_MOV(p, c->m1Cx, brw_imm_f(0.0));
brw_MOV(p, c->m2Cy, brw_imm_f(0.0));
brw_MOV(p, brw_writemask(c->m1Cx, WRITEMASK_X), c->tmp);
if (c->key.sprite_origin_lower_left) {
brw_MOV(p, brw_writemask(c->m2Cy, WRITEMASK_Y), negate(c->tmp));
} else {
brw_MOV(p, brw_writemask(c->m2Cy, WRITEMASK_Y), c->tmp);
}
/* attribute constant offset */
brw_MOV(p, c->m3C0, brw_imm_f(0.0));
if (c->key.sprite_origin_lower_left) {
brw_MOV(p, brw_writemask(c->m3C0, WRITEMASK_YW), brw_imm_f(1.0));
} else {
brw_MOV(p, brw_writemask(c->m3C0, WRITEMASK_W), brw_imm_f(1.0));
}
brw_set_default_access_mode(p, BRW_ALIGN_1);
}
if (pc & ~pc_coord_replace) {
set_predicate_control_flag_value(p, c, pc & ~pc_coord_replace);
brw_MOV(p, c->m1Cx, brw_imm_ud(0));
brw_MOV(p, c->m2Cy, brw_imm_ud(0));
brw_MOV(p, c->m3C0, a0); /* constant value */
}
set_predicate_control_flag_value(p, c, pc);
/* Copy m0..m3 to URB. */
brw_urb_WRITE(p,
brw_null_reg(),
0,
brw_vec8_grf(0, 0),
last ? BRW_URB_WRITE_EOT_COMPLETE
: BRW_URB_WRITE_NO_FLAGS,
4, /* msg len */
0, /* response len */
i*4, /* urb destination offset */
BRW_URB_SWIZZLE_TRANSPOSE);
}
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
/* Points setup - several simplifications as all attributes are
* constant across the face of the point (point sprites excluded!)
*/
static void brw_emit_point_setup(struct brw_sf_compile *c, bool allocate)
{
struct brw_codegen *p = &c->func;
GLuint i;
c->flag_value = 0xff;
c->nr_verts = 1;
if (allocate)
alloc_regs(c);
copy_z_inv_w(c);
brw_MOV(p, c->m1Cx, brw_imm_ud(0)); /* zero - move out of loop */
brw_MOV(p, c->m2Cy, brw_imm_ud(0)); /* zero - move out of loop */
for (i = 0; i < c->nr_setup_regs; i++)
{
struct brw_reg a0 = offset(c->vert[0], i);
GLushort pc, pc_persp, pc_linear;
bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
if (pc_persp)
{
/* This seems odd as the values are all constant, but the
* fragment shader will be expecting it:
*/
set_predicate_control_flag_value(p, c, pc_persp);
brw_MUL(p, a0, a0, c->inv_w[0]);
}
/* The delta values are always zero, just send the starting
* coordinate. Again, this is to fit in with the interpolation
* code in the fragment shader.
*/
{
set_predicate_control_flag_value(p, c, pc);
brw_MOV(p, c->m3C0, a0); /* constant value */
/* Copy m0..m3 to URB.
*/
brw_urb_WRITE(p,
brw_null_reg(),
0,
brw_vec8_grf(0, 0),
last ? BRW_URB_WRITE_EOT_COMPLETE
: BRW_URB_WRITE_NO_FLAGS,
4, /* msg len */
0, /* response len */
i*4, /* urb destination offset */
BRW_URB_SWIZZLE_TRANSPOSE);
}
}
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
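/* Emit a setup program that handles any primitive type, selecting the
 * triangle, line, point-sprite or point path at run time from the
 * primitive-type and sprite-enable fields in the payload. Registers are
 * allocated once up front, so each path below is emitted with
 * allocate = false.
 */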
static void brw_emit_anyprim_setup( struct brw_sf_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_reg payload_prim = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0);
struct brw_reg payload_attr = get_element_ud(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), 0);
struct brw_reg primmask;
int jmp;
struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
c->nr_verts = 3;
alloc_regs(c);
primmask = retype(get_element(c->tmp, 0), BRW_REGISTER_TYPE_UD);
brw_MOV(p, primmask, brw_imm_ud(1));
brw_SHL(p, primmask, primmask, payload_prim);
brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_TRILIST) |
(1<<_3DPRIM_TRISTRIP) |
(1<<_3DPRIM_TRIFAN) |
(1<<_3DPRIM_TRISTRIP_REVERSE) |
(1<<_3DPRIM_POLYGON) |
(1<<_3DPRIM_RECTLIST) |
(1<<_3DPRIM_TRIFAN_NOSTIPPLE)));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
brw_emit_tri_setup(c, false);
brw_land_fwd_jump(p, jmp);
brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_LINELIST) |
(1<<_3DPRIM_LINESTRIP) |
(1<<_3DPRIM_LINELOOP) |
(1<<_3DPRIM_LINESTRIP_CONT) |
(1<<_3DPRIM_LINESTRIP_BF) |
(1<<_3DPRIM_LINESTRIP_CONT_BF)));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
brw_emit_line_setup(c, false);
brw_land_fwd_jump(p, jmp);
brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
brw_emit_point_sprite_setup(c, false);
brw_land_fwd_jump(p, jmp);
brw_emit_point_setup( c, false );
}
const unsigned *
brw_compile_sf(const struct brw_compiler *compiler,
void *mem_ctx,
const struct brw_sf_prog_key *key,
struct brw_sf_prog_data *prog_data,
struct intel_vue_map *vue_map,
unsigned *final_assembly_size)
{
struct brw_sf_compile c;
memset(&c, 0, sizeof(c));
/* Begin the compilation:
*/
brw_init_codegen(&compiler->isa, &c.func, mem_ctx);
c.key = *key;
c.vue_map = *vue_map;
if (c.key.do_point_coord) {
/*
* gl_PointCoord is an FS builtin rather than a VS one, so it is not
* included in the c.vue_map generated at the VS stage. Add it manually
* here so the SF shader generates the interpolation coefficients the
* FS needs.
*/
c.vue_map.varying_to_slot[BRW_VARYING_SLOT_PNTC] = c.vue_map.num_slots;
c.vue_map.slot_to_varying[c.vue_map.num_slots++] = BRW_VARYING_SLOT_PNTC;
}
c.urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
c.nr_attr_regs = (c.vue_map.num_slots + 1)/2 - c.urb_entry_read_offset;
c.nr_setup_regs = c.nr_attr_regs;
c.prog_data.urb_read_length = c.nr_attr_regs;
c.prog_data.urb_entry_size = c.nr_setup_regs * 2;
/* Which primitive? Or all three?
*/
switch (key->primitive) {
case BRW_SF_PRIM_TRIANGLES:
c.nr_verts = 3;
brw_emit_tri_setup( &c, true );
break;
case BRW_SF_PRIM_LINES:
c.nr_verts = 2;
brw_emit_line_setup( &c, true );
break;
case BRW_SF_PRIM_POINTS:
c.nr_verts = 1;
if (key->do_point_sprite)
brw_emit_point_sprite_setup( &c, true );
else
brw_emit_point_setup( &c, true );
break;
case BRW_SF_PRIM_UNFILLED_TRIS:
c.nr_verts = 3;
brw_emit_anyprim_setup( &c );
break;
default:
unreachable("not reached");
}
/* FINISHME: SF programs use calculated jumps (i.e., JMPI with a register
* source). Compacting would be difficult.
*/
/* brw_compact_instructions(&c.func, 0, 0, NULL); */
*prog_data = c.prog_data;
const unsigned *program = brw_get_program(&c.func, final_assembly_size);
if (INTEL_DEBUG(DEBUG_SF)) {
fprintf(stderr, "sf:\n");
brw_disassemble_with_labels(&compiler->isa,
program, 0, *final_assembly_size, stderr);
fprintf(stderr, "\n");
}
return program;
}

View file

@ -0,0 +1,370 @@
/*
* Copyright © 2015-2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_compiler.h"
#include "brw_shader.h"
#include "brw_eu.h"
#include "brw_nir.h"
#include "dev/intel_debug.h"
#include "compiler/nir/nir.h"
#include "util/u_debug.h"
#define COMMON_OPTIONS \
.has_uclz = true, \
.lower_fdiv = true, \
.lower_scmp = true, \
.lower_flrp16 = true, \
.lower_fmod = true, \
.lower_ufind_msb = true, \
.lower_uadd_carry = true, \
.lower_usub_borrow = true, \
.lower_flrp64 = true, \
.lower_fisnormal = true, \
.lower_isign = true, \
.lower_ldexp = true, \
.lower_bitfield_extract = true, \
.lower_bitfield_insert = true, \
.lower_device_index_to_zero = true, \
.vectorize_io = true, \
.vectorize_tess_levels = true, \
.use_interpolated_input_intrinsics = true, \
.lower_insert_byte = true, \
.lower_insert_word = true, \
.vertex_id_zero_based = true, \
.lower_base_vertex = true, \
.support_16bit_alu = true, \
.lower_uniforms_to_ubo = true
#define COMMON_SCALAR_OPTIONS \
.lower_to_scalar = true, \
.lower_pack_half_2x16 = true, \
.lower_pack_snorm_2x16 = true, \
.lower_pack_snorm_4x8 = true, \
.lower_pack_unorm_2x16 = true, \
.lower_pack_unorm_4x8 = true, \
.lower_unpack_half_2x16 = true, \
.lower_unpack_snorm_2x16 = true, \
.lower_unpack_snorm_4x8 = true, \
.lower_unpack_unorm_2x16 = true, \
.lower_unpack_unorm_4x8 = true, \
.lower_hadd64 = true, \
.avoid_ternary_with_two_constants = true, \
.has_pack_32_4x8 = true, \
.max_unroll_iterations = 32, \
.force_indirect_unrolling = nir_var_function_temp, \
.divergence_analysis_options = \
(nir_divergence_single_patch_per_tcs_subgroup | \
nir_divergence_single_patch_per_tes_subgroup | \
nir_divergence_shader_record_ptr_uniform)
const struct nir_shader_compiler_options brw_scalar_nir_options = {
COMMON_OPTIONS,
COMMON_SCALAR_OPTIONS,
};
const struct nir_shader_compiler_options brw_vector_nir_options = {
COMMON_OPTIONS,
/* In the vec4 backend, our dpN instruction replicates its result to all the
* components of a vec4. We would like NIR to give us replicated fdot
* instructions because it can optimize better for us.
*/
.fdot_replicates = true,
.lower_usub_sat = true,
.lower_pack_snorm_2x16 = true,
.lower_pack_unorm_2x16 = true,
.lower_unpack_snorm_2x16 = true,
.lower_unpack_unorm_2x16 = true,
.lower_extract_byte = true,
.lower_extract_word = true,
.intel_vec4 = true,
.max_unroll_iterations = 32,
};
struct brw_compiler *
brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
{
struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);
compiler->devinfo = devinfo;
brw_init_isa_info(&compiler->isa, devinfo);
brw_fs_alloc_reg_sets(compiler);
if (devinfo->ver < 8)
brw_vec4_alloc_reg_set(compiler);
compiler->precise_trig = debug_get_bool_option("INTEL_PRECISE_TRIG", false);
compiler->use_tcs_multi_patch = devinfo->ver >= 12;
/* Default to the sampler since that's what we've done since forever */
compiler->indirect_ubos_use_sampler = true;
compiler->lower_dpas = devinfo->verx10 < 125 ||
intel_device_info_is_mtl(devinfo) ||
(intel_device_info_is_arl(devinfo) &&
devinfo->platform != INTEL_PLATFORM_ARL_H) ||
debug_get_bool_option("INTEL_LOWER_DPAS", false);
/* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. */
for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) {
compiler->scalar_stage[i] = devinfo->ver >= 8 ||
i == MESA_SHADER_FRAGMENT || i == MESA_SHADER_COMPUTE;
}
for (int i = MESA_SHADER_TASK; i < MESA_VULKAN_SHADER_STAGES; i++)
compiler->scalar_stage[i] = true;
nir_lower_int64_options int64_options =
nir_lower_imul64 |
nir_lower_isign64 |
nir_lower_divmod64 |
nir_lower_imul_high64 |
nir_lower_find_lsb64 |
nir_lower_ufind_msb64 |
nir_lower_bit_count64;
nir_lower_doubles_options fp64_options =
nir_lower_drcp |
nir_lower_dsqrt |
nir_lower_drsq |
nir_lower_dtrunc |
nir_lower_dfloor |
nir_lower_dceil |
nir_lower_dfract |
nir_lower_dround_even |
nir_lower_dmod |
nir_lower_dsub |
nir_lower_ddiv;
if (!devinfo->has_64bit_float || INTEL_DEBUG(DEBUG_SOFT64))
fp64_options |= nir_lower_fp64_full_software;
if (!devinfo->has_64bit_int)
int64_options |= (nir_lower_int64_options)~0;
/* The Bspec's section titled "Instruction_multiply[DevBDW+]" claims that
* destination type can be Quadword and source type Doubleword for Gfx8 and
* Gfx9. So, lower 64 bit multiply instruction on rest of the platforms.
*/
if (devinfo->ver < 8 || devinfo->ver > 9)
int64_options |= nir_lower_imul_2x32_64;
/* We want the GLSL compiler to emit code that uses condition codes */
for (int i = 0; i < MESA_ALL_SHADER_STAGES; i++) {
struct nir_shader_compiler_options *nir_options =
rzalloc(compiler, struct nir_shader_compiler_options);
bool is_scalar = compiler->scalar_stage[i];
if (is_scalar) {
*nir_options = brw_scalar_nir_options;
int64_options |= nir_lower_usub_sat64;
} else {
*nir_options = brw_vector_nir_options;
}
/* Prior to Gfx6, there are no three source operations, and Gfx11 loses
* LRP.
*/
nir_options->lower_ffma16 = devinfo->ver < 6;
nir_options->lower_ffma32 = devinfo->ver < 6;
nir_options->lower_ffma64 = devinfo->ver < 6;
nir_options->lower_flrp32 = devinfo->ver < 6 || devinfo->ver >= 11;
nir_options->lower_fpow = devinfo->ver >= 12;
nir_options->has_bfe = devinfo->ver >= 7;
nir_options->has_bfm = devinfo->ver >= 7;
nir_options->has_bfi = devinfo->ver >= 7;
nir_options->has_rotate16 = devinfo->ver >= 11;
nir_options->has_rotate32 = devinfo->ver >= 11;
nir_options->lower_bitfield_reverse = devinfo->ver < 7;
nir_options->lower_find_lsb = devinfo->ver < 7;
nir_options->lower_ifind_msb = devinfo->ver < 7;
nir_options->has_iadd3 = devinfo->verx10 >= 125;
nir_options->has_sdot_4x8 = devinfo->ver >= 12;
nir_options->has_udot_4x8 = devinfo->ver >= 12;
nir_options->has_sudot_4x8 = devinfo->ver >= 12;
nir_options->has_sdot_4x8_sat = devinfo->ver >= 12;
nir_options->has_udot_4x8_sat = devinfo->ver >= 12;
nir_options->has_sudot_4x8_sat = devinfo->ver >= 12;
nir_options->lower_int64_options = int64_options;
nir_options->lower_doubles_options = fp64_options;
nir_options->unify_interfaces = i < MESA_SHADER_FRAGMENT;
nir_options->force_indirect_unrolling |=
brw_nir_no_indirect_mask(compiler, i);
nir_options->force_indirect_unrolling_sampler = devinfo->ver < 7;
if (compiler->use_tcs_multi_patch) {
/* TCS MULTI_PATCH mode has multiple patches per subgroup */
nir_options->divergence_analysis_options &=
~nir_divergence_single_patch_per_tcs_subgroup;
}
if (devinfo->ver < 12)
nir_options->divergence_analysis_options |=
nir_divergence_single_prim_per_subgroup;
compiler->nir_options[i] = nir_options;
}
compiler->mesh.mue_header_packing =
(unsigned)debug_get_num_option("INTEL_MESH_HEADER_PACKING", 3);
compiler->mesh.mue_compaction =
debug_get_bool_option("INTEL_MESH_COMPACTION", true);
return compiler;
}
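/* Typical driver usage (a sketch; ctx and devinfo are assumed to come
 * from the caller):
 *
 *    void *ctx = ralloc_context(NULL);
 *    struct brw_compiler *compiler = brw_compiler_create(ctx, devinfo);
 *    const struct nir_shader_compiler_options *opts =
 *       compiler->nir_options[MESA_SHADER_FRAGMENT];
 */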
static void
insert_u64_bit(uint64_t *val, bool add)
{
*val = (*val << 1) | !!add;
}
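/* For example, three successive calls with true, false, true turn
 * *val = 0 into 0b101: each call shifts left and ORs in the new bit.
 */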
uint64_t
brw_get_compiler_config_value(const struct brw_compiler *compiler)
{
uint64_t config = 0;
unsigned bits = 0;
insert_u64_bit(&config, compiler->precise_trig);
bits++;
insert_u64_bit(&config, compiler->lower_dpas);
bits++;
insert_u64_bit(&config, compiler->mesh.mue_compaction);
bits++;
uint64_t mask = DEBUG_DISK_CACHE_MASK;
bits += util_bitcount64(mask);
u_foreach_bit64(bit, mask)
insert_u64_bit(&config, INTEL_DEBUG(1ULL << bit));
mask = SIMD_DISK_CACHE_MASK;
bits += util_bitcount64(mask);
u_foreach_bit64(bit, mask)
insert_u64_bit(&config, (intel_simd & (1ULL << bit)) != 0);
mask = 3;
bits += util_bitcount64(mask);
u_foreach_bit64(bit, mask)
insert_u64_bit(&config, (compiler->mesh.mue_header_packing & (1ULL << bit)) != 0);
assert(bits <= util_bitcount64(UINT64_MAX));
return config;
}
void
brw_device_sha1(char *hex,
const struct intel_device_info *devinfo)
{
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
brw_device_sha1_update(&ctx, devinfo);
unsigned char result[20];
_mesa_sha1_final(&ctx, result);
_mesa_sha1_format(hex, result);
}
unsigned
brw_prog_data_size(gl_shader_stage stage)
{
static const size_t stage_sizes[] = {
[MESA_SHADER_VERTEX] = sizeof(struct brw_vs_prog_data),
[MESA_SHADER_TESS_CTRL] = sizeof(struct brw_tcs_prog_data),
[MESA_SHADER_TESS_EVAL] = sizeof(struct brw_tes_prog_data),
[MESA_SHADER_GEOMETRY] = sizeof(struct brw_gs_prog_data),
[MESA_SHADER_FRAGMENT] = sizeof(struct brw_wm_prog_data),
[MESA_SHADER_COMPUTE] = sizeof(struct brw_cs_prog_data),
[MESA_SHADER_TASK] = sizeof(struct brw_task_prog_data),
[MESA_SHADER_MESH] = sizeof(struct brw_mesh_prog_data),
[MESA_SHADER_RAYGEN] = sizeof(struct brw_bs_prog_data),
[MESA_SHADER_ANY_HIT] = sizeof(struct brw_bs_prog_data),
[MESA_SHADER_CLOSEST_HIT] = sizeof(struct brw_bs_prog_data),
[MESA_SHADER_MISS] = sizeof(struct brw_bs_prog_data),
[MESA_SHADER_INTERSECTION] = sizeof(struct brw_bs_prog_data),
[MESA_SHADER_CALLABLE] = sizeof(struct brw_bs_prog_data),
[MESA_SHADER_KERNEL] = sizeof(struct brw_cs_prog_data),
};
assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_sizes));
return stage_sizes[stage];
}
unsigned
brw_prog_key_size(gl_shader_stage stage)
{
static const size_t stage_sizes[] = {
[MESA_SHADER_VERTEX] = sizeof(struct brw_vs_prog_key),
[MESA_SHADER_TESS_CTRL] = sizeof(struct brw_tcs_prog_key),
[MESA_SHADER_TESS_EVAL] = sizeof(struct brw_tes_prog_key),
[MESA_SHADER_GEOMETRY] = sizeof(struct brw_gs_prog_key),
[MESA_SHADER_FRAGMENT] = sizeof(struct brw_wm_prog_key),
[MESA_SHADER_COMPUTE] = sizeof(struct brw_cs_prog_key),
[MESA_SHADER_TASK] = sizeof(struct brw_task_prog_key),
[MESA_SHADER_MESH] = sizeof(struct brw_mesh_prog_key),
[MESA_SHADER_RAYGEN] = sizeof(struct brw_bs_prog_key),
[MESA_SHADER_ANY_HIT] = sizeof(struct brw_bs_prog_key),
[MESA_SHADER_CLOSEST_HIT] = sizeof(struct brw_bs_prog_key),
[MESA_SHADER_MISS] = sizeof(struct brw_bs_prog_key),
[MESA_SHADER_INTERSECTION] = sizeof(struct brw_bs_prog_key),
[MESA_SHADER_CALLABLE] = sizeof(struct brw_bs_prog_key),
[MESA_SHADER_KERNEL] = sizeof(struct brw_cs_prog_key),
};
assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_sizes));
return stage_sizes[stage];
}
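/* Patch relocation values into a compiled program. For example, a
 * driver holding a reloc id can supply the final value at upload time
 * (reloc_id and addr are illustrative):
 *
 *    struct brw_shader_reloc_value v = { .id = reloc_id, .value = addr };
 *    brw_write_shader_relocs(isa, program, prog_data, &v, 1);
 */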
void
brw_write_shader_relocs(const struct brw_isa_info *isa,
void *program,
const struct brw_stage_prog_data *prog_data,
struct brw_shader_reloc_value *values,
unsigned num_values)
{
for (unsigned i = 0; i < prog_data->num_relocs; i++) {
assert(prog_data->relocs[i].offset % 8 == 0);
void *dst = program + prog_data->relocs[i].offset;
for (unsigned j = 0; j < num_values; j++) {
if (prog_data->relocs[i].id == values[j].id) {
uint32_t value = values[j].value + prog_data->relocs[i].delta;
switch (prog_data->relocs[i].type) {
case BRW_SHADER_RELOC_TYPE_U32:
*(uint32_t *)dst = value;
break;
case BRW_SHADER_RELOC_TYPE_MOV_IMM:
brw_update_reloc_imm(isa, dst, value);
break;
default:
unreachable("Invalid relocation type");
}
break;
}
}
}
}

File diff suppressed because it is too large

View file

@ -0,0 +1,121 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/** @file brw_dead_control_flow.cpp
*
* This file implements the dead control flow elimination optimization pass.
*/
#include "brw_shader.h"
#include "brw_cfg.h"
using namespace brw;
/* Look for and eliminate dead control flow:
*
* - if/endif
* - else in else/endif
* - then in if/else/endif
*/
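/* These patterns typically remain after other passes (dead code
 * elimination, copy propagation) have emptied one side of a
 * conditional: once every instruction between an IF and its ENDIF is
 * gone, the IF/ENDIF pair itself guards nothing and can be removed too.
 */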
bool
dead_control_flow_eliminate(backend_shader *s)
{
bool progress = false;
foreach_block_safe (block, s->cfg) {
bblock_t *prev_block = block->prev();
if (!prev_block)
continue;
backend_instruction *const inst = block->start();
backend_instruction *const prev_inst = prev_block->end();
/* ENDIF instructions, by definition, can only be found at the start of
* basic blocks.
*/
if (inst->opcode == BRW_OPCODE_ENDIF &&
prev_inst->opcode == BRW_OPCODE_ELSE) {
bblock_t *const else_block = prev_block;
backend_instruction *const else_inst = prev_inst;
else_inst->remove(else_block);
progress = true;
} else if (inst->opcode == BRW_OPCODE_ENDIF &&
prev_inst->opcode == BRW_OPCODE_IF) {
bblock_t *const endif_block = block;
bblock_t *const if_block = prev_block;
backend_instruction *const endif_inst = inst;
backend_instruction *const if_inst = prev_inst;
bblock_t *earlier_block = NULL, *later_block = NULL;
if (if_block->start_ip == if_block->end_ip) {
earlier_block = if_block->prev();
} else {
earlier_block = if_block;
}
if_inst->remove(if_block);
if (endif_block->start_ip == endif_block->end_ip) {
later_block = endif_block->next();
} else {
later_block = endif_block;
}
endif_inst->remove(endif_block);
assert((earlier_block == NULL) == (later_block == NULL));
if (earlier_block && earlier_block->can_combine_with(later_block)) {
earlier_block->combine_with(later_block);
/* If ENDIF was in its own block, then we've now deleted it and
* merged the two surrounding blocks, the latter of which the
* __next block pointer was pointing to.
*/
if (endif_block != later_block) {
__next = earlier_block->next();
}
}
progress = true;
} else if (inst->opcode == BRW_OPCODE_ELSE &&
prev_inst->opcode == BRW_OPCODE_IF) {
bblock_t *const else_block = block;
backend_instruction *const if_inst = prev_inst;
backend_instruction *const else_inst = inst;
/* Since the else-branch is becoming the new then-branch, the
* condition has to be inverted.
*/
if_inst->predicate_inverse = !if_inst->predicate_inverse;
else_inst->remove(else_block);
progress = true;
}
}
if (progress)
s->invalidate_analysis(DEPENDENCY_BLOCKS | DEPENDENCY_INSTRUCTIONS);
return progress;
}

View file

@ -0,0 +1,31 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_DEAD_CONTROL_FLOW_H
#define BRW_DEAD_CONTROL_FLOW_H
#include "brw_shader.h"
bool dead_control_flow_eliminate(backend_shader *s);
#endif /* BRW_DEAD_CONTROL_FLOW_H */

View file

@ -0,0 +1,238 @@
/*
* Copyright © 2019 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* @file brw_debug_recompiles.c
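*
* Helpers that log which program-key fields differ between two compiles of
* the same shader, to help explain why a recompile happened.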
*/
#include <stdio.h>
#include "brw_compiler.h"
static bool
key_debug(const struct brw_compiler *c, void *log,
const char *name, int a, int b)
{
if (a != b) {
brw_shader_perf_log(c, log, " %s %d->%d\n", name, a, b);
return true;
}
return false;
}
static bool
key_debug_float(const struct brw_compiler *c, void *log,
const char *name, float a, float b)
{
if (a != b) {
brw_shader_perf_log(c, log, " %s %f->%f\n", name, a, b);
return true;
}
return false;
}
#define check(name, field) \
key_debug(c, log, name, old_key->field, key->field)
#define check_float(name, field) \
key_debug_float(c, log, name, old_key->field, key->field)
static bool
debug_sampler_recompile(const struct brw_compiler *c, void *log,
const struct brw_sampler_prog_key_data *old_key,
const struct brw_sampler_prog_key_data *key)
{
bool found = false;
found |= check("gather channel quirk", gather_channel_quirk_mask);
for (unsigned i = 0; i < BRW_MAX_SAMPLERS; i++) {
found |= check("EXT_texture_swizzle or DEPTH_TEXTURE_MODE", swizzles[i]);
found |= check("textureGather workarounds", gfx6_gather_wa[i]);
}
for (unsigned i = 0; i < 3; i++) {
found |= check("GL_CLAMP enabled on any texture unit", gl_clamp_mask[i]);
}
return found;
}
static bool
debug_base_recompile(const struct brw_compiler *c, void *log,
const struct brw_base_prog_key *old_key,
const struct brw_base_prog_key *key)
{
return debug_sampler_recompile(c, log, &old_key->tex, &key->tex);
}
static void
debug_vs_recompile(const struct brw_compiler *c, void *log,
const struct brw_vs_prog_key *old_key,
const struct brw_vs_prog_key *key)
{
bool found = debug_base_recompile(c, log, &old_key->base, &key->base);
for (unsigned i = 0; i < VERT_ATTRIB_MAX; i++) {
found |= check("vertex attrib w/a flags", gl_attrib_wa_flags[i]);
}
found |= check("legacy user clipping", nr_userclip_plane_consts);
found |= check("copy edgeflag", copy_edgeflag);
found |= check("pointcoord replace", point_coord_replace);
found |= check("vertex color clamping", clamp_vertex_color);
if (!found) {
brw_shader_perf_log(c, log, " something else\n");
}
}
static void
debug_tcs_recompile(const struct brw_compiler *c, void *log,
const struct brw_tcs_prog_key *old_key,
const struct brw_tcs_prog_key *key)
{
bool found = debug_base_recompile(c, log, &old_key->base, &key->base);
found |= check("input vertices", input_vertices);
found |= check("outputs written", outputs_written);
found |= check("patch outputs written", patch_outputs_written);
found |= check("tes primitive mode", _tes_primitive_mode);
found |= check("quads and equal_spacing workaround", quads_workaround);
if (!found) {
brw_shader_perf_log(c, log, " something else\n");
}
}
static void
debug_tes_recompile(const struct brw_compiler *c, void *log,
const struct brw_tes_prog_key *old_key,
const struct brw_tes_prog_key *key)
{
bool found = debug_base_recompile(c, log, &old_key->base, &key->base);
found |= check("inputs read", inputs_read);
found |= check("patch inputs read", patch_inputs_read);
if (!found) {
brw_shader_perf_log(c, log, " something else\n");
}
}
static void
debug_gs_recompile(const struct brw_compiler *c, void *log,
const struct brw_gs_prog_key *old_key,
const struct brw_gs_prog_key *key)
{
bool found = debug_base_recompile(c, log, &old_key->base, &key->base);
if (!found) {
brw_shader_perf_log(c, log, " something else\n");
}
}
static void
debug_fs_recompile(const struct brw_compiler *c, void *log,
const struct brw_wm_prog_key *old_key,
const struct brw_wm_prog_key *key)
{
bool found = false;
found |= check("alphatest, computed depth, depth test, or depth write",
iz_lookup);
found |= check("depth statistics", stats_wm);
found |= check("flat shading", flat_shade);
found |= check("number of color buffers", nr_color_regions);
found |= check("MRT alpha test", alpha_test_replicate_alpha);
found |= check("alpha to coverage", alpha_to_coverage);
found |= check("fragment color clamping", clamp_fragment_color);
found |= check("per-sample interpolation", persample_interp);
found |= check("multisampled FBO", multisample_fbo);
found |= check("line smoothing", line_aa);
found |= check("force dual color blending", force_dual_color_blend);
found |= check("coherent fb fetch", coherent_fb_fetch);
found |= check("ignore sample mask out", ignore_sample_mask_out);
found |= check("coarse pixel", coarse_pixel);
found |= check("input slots valid", input_slots_valid);
found |= check("mrt alpha test function", alpha_test_func);
found |= check("mrt alpha test reference value", alpha_test_ref);
found |= debug_base_recompile(c, log, &old_key->base, &key->base);
if (!found) {
brw_shader_perf_log(c, log, " something else\n");
}
}
static void
debug_cs_recompile(const struct brw_compiler *c, void *log,
const struct brw_cs_prog_key *old_key,
const struct brw_cs_prog_key *key)
{
bool found = debug_base_recompile(c, log, &old_key->base, &key->base);
if (!found) {
brw_shader_perf_log(c, log, " something else\n");
}
}
void
brw_debug_key_recompile(const struct brw_compiler *c, void *log,
gl_shader_stage stage,
const struct brw_base_prog_key *old_key,
const struct brw_base_prog_key *key)
{
if (!old_key) {
brw_shader_perf_log(c, log, " No previous compile found...\n");
return;
}
switch (stage) {
case MESA_SHADER_VERTEX:
debug_vs_recompile(c, log, (const struct brw_vs_prog_key *)old_key,
(const struct brw_vs_prog_key *)key);
break;
case MESA_SHADER_TESS_CTRL:
debug_tcs_recompile(c, log, (const struct brw_tcs_prog_key *)old_key,
(const struct brw_tcs_prog_key *)key);
break;
case MESA_SHADER_TESS_EVAL:
debug_tes_recompile(c, log, (const struct brw_tes_prog_key *)old_key,
(const struct brw_tes_prog_key *)key);
break;
case MESA_SHADER_GEOMETRY:
debug_gs_recompile(c, log, (const struct brw_gs_prog_key *)old_key,
(const struct brw_gs_prog_key *)key);
break;
case MESA_SHADER_FRAGMENT:
debug_fs_recompile(c, log, (const struct brw_wm_prog_key *)old_key,
(const struct brw_wm_prog_key *)key);
break;
case MESA_SHADER_COMPUTE:
debug_cs_recompile(c, log, (const struct brw_cs_prog_key *)old_key,
(const struct brw_cs_prog_key *)key);
break;
default:
break;
}
}


@@ -0,0 +1,74 @@
#!/usr/bin/env python3
COPYRIGHT = """\
/*
* Copyright 2024 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
"""
import argparse
import os
import sys
from mako.template import Template
from mako import exceptions
sys.path.append(f"{os.path.dirname(sys.argv[0])}/../dev")
import intel_device_info
template = COPYRIGHT + """
/* DO NOT EDIT - This file is generated automatically by the intel_device_serialize_c.py script */
#include "dev/intel_device_info.h"
#include "brw_compiler.h"
#define SHA_UPDATE_FIELD(field) _mesa_sha1_update(ctx, &devinfo->field, sizeof(devinfo->field))
void
brw_device_sha1_update(struct mesa_sha1 *ctx,
const struct intel_device_info *devinfo) {
% for member in compiler_fields:
SHA_UPDATE_FIELD(${member.name});
% endfor
}
#undef SHA_UPDATE_FIELD
"""
def main():
"""print intel_device_serialize.c at the specified path"""
parser = argparse.ArgumentParser()
parser.add_argument('--outdir', required=True,
help='Directory to put the generated files in')
args = parser.parse_args()
path = os.path.join(args.outdir, 'brw_device_sha1_gen.c')
device_members = intel_device_info.TYPES_BY_NAME["intel_device_info"].members
compiler_fields = [field for field in device_members if field.compiler_field]
with open(path, 'w', encoding='utf-8') as f:
try:
f.write(Template(template).render(compiler_fields=compiler_fields))
except Exception:
print(exceptions.text_error_template().render())
sys.exit(1)
if __name__ == "__main__":
main()

File diff suppressed because it is too large


@@ -0,0 +1,42 @@
/*
* Copyright 2024 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#ifndef BRW_DISASM_H
#define BRW_DISASM_H
#include <stdio.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
struct brw_isa_info;
struct brw_inst;
const struct brw_label *brw_find_label(const struct brw_label *root, int offset);
void brw_create_label(struct brw_label **labels, int offset, void *mem_ctx);
int brw_disassemble_inst(FILE *file, const struct brw_isa_info *isa,
const struct brw_inst *inst, bool is_compacted,
int offset, const struct brw_label *root_label);
const struct brw_label *brw_label_assembly(const struct brw_isa_info *isa,
const void *assembly, int start, int end,
void *mem_ctx);
void brw_disassemble_with_labels(const struct brw_isa_info *isa,
const void *assembly, int start, int end, FILE *out);
void brw_disassemble(const struct brw_isa_info *isa,
const void *assembly, int start, int end,
const struct brw_label *root_label, FILE *out);
int brw_disassemble_find_end(const struct brw_isa_info *isa,
const void *assembly, int start);
void brw_disassemble_with_errors(const struct brw_isa_info *isa,
const void *assembly, int start, FILE *out);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* BRW_DISASM_H */


@@ -0,0 +1,207 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_cfg.h"
#include "brw_eu.h"
#include "brw_disasm.h"
#include "brw_disasm_info.h"
#include "dev/intel_debug.h"
#include "compiler/nir/nir.h"
__attribute__((weak)) void nir_print_instr(UNUSED const nir_instr *instr,
UNUSED FILE *fp) {}
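/* Print the disassembly of [start_offset, end_offset) to stderr,
* interleaved with basic-block boundaries, optional NIR annotations, and
* any validation errors recorded in `disasm`.
*/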
void
dump_assembly(void *assembly, int start_offset, int end_offset,
struct disasm_info *disasm, const unsigned *block_latency)
{
const struct brw_isa_info *isa = disasm->isa;
const char *last_annotation_string = NULL;
const void *last_annotation_ir = NULL;
void *mem_ctx = ralloc_context(NULL);
const struct brw_label *root_label =
brw_label_assembly(isa, assembly, start_offset, end_offset, mem_ctx);
foreach_list_typed(struct inst_group, group, link, &disasm->group_list) {
struct exec_node *next_node = exec_node_get_next(&group->link);
if (exec_node_is_tail_sentinel(next_node))
break;
struct inst_group *next =
exec_node_data(struct inst_group, next_node, link);
int start_offset = group->offset;
int end_offset = next->offset;
if (group->block_start) {
fprintf(stderr, " START B%d", group->block_start->num);
foreach_list_typed(struct bblock_link, predecessor_link, link,
&group->block_start->parents) {
struct bblock_t *predecessor_block = predecessor_link->block;
fprintf(stderr, " <-B%d", predecessor_block->num);
}
if (block_latency)
fprintf(stderr, " (%u cycles)",
block_latency[group->block_start->num]);
fprintf(stderr, "\n");
}
if (last_annotation_ir != group->ir) {
last_annotation_ir = group->ir;
if (last_annotation_ir) {
fprintf(stderr, " ");
nir_print_instr(group->ir, stderr);
fprintf(stderr, "\n");
}
}
if (last_annotation_string != group->annotation) {
last_annotation_string = group->annotation;
if (last_annotation_string)
fprintf(stderr, " %s\n", last_annotation_string);
}
brw_disassemble(isa, assembly, start_offset, end_offset,
root_label, stderr);
if (group->error) {
fputs(group->error, stderr);
}
if (group->block_end) {
fprintf(stderr, " END B%d", group->block_end->num);
foreach_list_typed(struct bblock_link, successor_link, link,
&group->block_end->children) {
struct bblock_t *successor_block = successor_link->block;
fprintf(stderr, " ->B%d", successor_block->num);
}
fprintf(stderr, "\n");
}
}
fprintf(stderr, "\n");
ralloc_free(mem_ctx);
}
struct disasm_info *
disasm_initialize(const struct brw_isa_info *isa,
const struct cfg_t *cfg)
{
struct disasm_info *disasm = ralloc(NULL, struct disasm_info);
exec_list_make_empty(&disasm->group_list);
disasm->isa = isa;
disasm->cfg = cfg;
disasm->cur_block = 0;
disasm->use_tail = false;
return disasm;
}
struct inst_group *
disasm_new_inst_group(struct disasm_info *disasm, unsigned next_inst_offset)
{
struct inst_group *tail = rzalloc(disasm, struct inst_group);
tail->offset = next_inst_offset;
exec_list_push_tail(&disasm->group_list, &tail->link);
return tail;
}
void
disasm_annotate(struct disasm_info *disasm,
struct backend_instruction *inst, unsigned offset)
{
const struct intel_device_info *devinfo = disasm->isa->devinfo;
const struct cfg_t *cfg = disasm->cfg;
struct inst_group *group;
if (!disasm->use_tail) {
group = disasm_new_inst_group(disasm, offset);
} else {
disasm->use_tail = false;
group = exec_node_data(struct inst_group,
exec_list_get_tail_raw(&disasm->group_list), link);
}
if (INTEL_DEBUG(DEBUG_ANNOTATION)) {
group->ir = inst->ir;
group->annotation = inst->annotation;
}
if (bblock_start(cfg->blocks[disasm->cur_block]) == inst) {
group->block_start = cfg->blocks[disasm->cur_block];
}
/* There is no hardware DO instruction on Gfx6+, so since DO always
* starts a basic block, we need to set the .block_start of the next
* instruction's annotation with a pointer to the bblock started by
* the DO.
*
* The only complication is that this emits an annotation without a
* corresponding hardware instruction to disassemble.
*/
if (devinfo->ver >= 6 && inst->opcode == BRW_OPCODE_DO) {
disasm->use_tail = true;
}
if (bblock_end(cfg->blocks[disasm->cur_block]) == inst) {
group->block_end = cfg->blocks[disasm->cur_block];
disasm->cur_block++;
}
}
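/* Attach an error message to the instruction group containing the
* instruction at `offset`. If the group extends past that instruction,
* split it so the error lands only on the offending range; the trailing
* part keeps the original group's block_end marker.
*/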
void
disasm_insert_error(struct disasm_info *disasm, unsigned offset,
unsigned inst_size, const char *error)
{
foreach_list_typed(struct inst_group, cur, link, &disasm->group_list) {
struct exec_node *next_node = exec_node_get_next(&cur->link);
if (exec_node_is_tail_sentinel(next_node))
break;
struct inst_group *next =
exec_node_data(struct inst_group, next_node, link);
if (next->offset <= offset)
continue;
if (offset + inst_size != next->offset) {
struct inst_group *new = ralloc(disasm, struct inst_group);
memcpy(new, cur, sizeof(struct inst_group));
cur->error = NULL;
cur->error_length = 0;
cur->block_end = NULL;
new->offset = offset + inst_size;
new->block_start = NULL;
exec_node_insert_after(&cur->link, &new->link);
}
if (cur->error)
ralloc_strcat(&cur->error, error);
else
cur->error = ralloc_strdup(disasm, error);
return;
}
}


@@ -0,0 +1,90 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef _INTEL_ASM_ANNOTATION_H
#define _INTEL_ASM_ANNOTATION_H
#include "compiler/glsl/list.h"
#ifdef __cplusplus
extern "C" {
#endif
struct cfg_t;
struct backend_instruction;
struct intel_device_info;
struct inst_group {
struct exec_node link;
int offset;
size_t error_length;
char *error;
/* Pointers to the basic block in the CFG if the instruction group starts
* or ends a basic block.
*/
struct bblock_t *block_start;
struct bblock_t *block_end;
/* Annotation for the generated IR. One of the two can be set. */
const void *ir;
const char *annotation;
};
struct disasm_info {
struct exec_list group_list;
const struct brw_isa_info *isa;
const struct cfg_t *cfg;
/** Block index in the cfg. */
int cur_block;
bool use_tail;
};
void
dump_assembly(void *assembly, int start_offset, int end_offset,
struct disasm_info *disasm, const unsigned *block_latency);
struct disasm_info *
disasm_initialize(const struct brw_isa_info *isa,
const struct cfg_t *cfg);
struct inst_group *
disasm_new_inst_group(struct disasm_info *disasm, unsigned offset);
void
disasm_annotate(struct disasm_info *disasm,
struct backend_instruction *inst, unsigned offset);
void
disasm_insert_error(struct disasm_info *disasm, unsigned offset,
unsigned inst_size, const char *error);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* _INTEL_ASM_ANNOTATION_H */


@@ -0,0 +1,242 @@
/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include "compiler/brw_disasm.h"
#include "compiler/brw_isa_info.h"
#include "dev/intel_device_info.h"
#include "util/u_dynarray.h"
enum opt_input_type {
OPT_INPUT_BINARY,
OPT_INPUT_C_LITERAL,
};
static enum opt_input_type input_type = OPT_INPUT_BINARY;
/* Return the size, in bytes, of the file pointed to by fp */
static long
i965_disasm_get_file_size(FILE *fp)
{
long size;
fseek(fp, 0L, SEEK_END);
size = ftell(fp);
fseek(fp, 0L, SEEK_SET);
return size;
}
/* Read a hex file in the following format, for example:
* { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }
*/
static void *
i965_disasm_read_c_literal_file(FILE *fp, size_t *end)
{
struct util_dynarray assembly = {};
uint32_t temp[2];
if (fscanf(fp, " { ") == EOF) {
fprintf(stderr, "Couldn't find opening `{`\n");
return NULL;
}
if (fscanf(fp, "0x%x , 0x%x", &temp[0], &temp[1]) == 2) {
util_dynarray_append(&assembly, uint32_t, temp[0]);
util_dynarray_append(&assembly, uint32_t, temp[1]);
} else {
fprintf(stderr, "Couldn't read hex values\n");
return NULL;
}
while (fscanf(fp, " , 0x%x , 0x%x ", &temp[0], &temp[1]) == 2) {
util_dynarray_append(&assembly, uint32_t, temp[0]);
util_dynarray_append(&assembly, uint32_t, temp[1]);
}
if (fscanf(fp, "}") == EOF) {
fprintf(stderr, "Couldn't find closing `}`\n");
return NULL;
}
*end = assembly.size;
return assembly.data;
}
static void *
i965_disasm_read_binary(FILE *fp, size_t *end)
{
size_t size;
void *assembly;
long sz = i965_disasm_get_file_size(fp);
if (sz < 0)
return NULL;
*end = (size_t)sz;
if (!*end)
return NULL;
assembly = malloc(*end + 1);
if (assembly == NULL)
return NULL;
size = fread(assembly, *end, 1, fp);
if (!size) {
free(assembly);
return NULL;
}
return assembly;
}
static void
print_help(const char *progname, FILE *file)
{
fprintf(file,
"Usage: %s [OPTION]...\n"
"Disassemble i965 instructions from binary file.\n\n"
" --help display this help and exit\n"
" --input-path=PATH read binary file from binary file PATH\n"
" --type=INPUT_TYPE INPUT_TYPE can be 'bin' (default if omitted),\n"
" 'c_literal'.\n"
" --gen=platform disassemble instructions for given \n"
" platform (3 letter platform name)\n",
progname);
}
int main(int argc, char *argv[])
{
FILE *fp = NULL;
void *assembly = NULL;
char *file_path = NULL;
size_t start = 0, end = 0;
uint16_t pci_id = 0;
int c;
int result = EXIT_FAILURE;
bool help = false;
const struct option i965_disasm_opts[] = {
{ "help", no_argument, (int *) &help, true },
{ "input-path", required_argument, NULL, 'i' },
{ "type", required_argument, NULL, 't' },
{ "gen", required_argument, NULL, 'g'},
{ NULL, 0, NULL, 0 }
};
while ((c = getopt_long(argc, argv, ":i:t:g:h", i965_disasm_opts, NULL)) != -1) {
switch (c) {
case 'g': {
const int id = intel_device_name_to_pci_device_id(optarg);
if (id < 0) {
fprintf(stderr, "can't parse gen: '%s', expected 3 letter "
"platform name\n", optarg);
goto end;
} else {
pci_id = id;
}
break;
}
case 'i':
file_path = strdup(optarg);
fp = fopen(file_path, "r");
if (!fp) {
fprintf(stderr, "Unable to read input file : %s\n",
file_path);
goto end;
}
break;
case 't':
if (strcmp(optarg, "c_literal") == 0) {
input_type = OPT_INPUT_C_LITERAL;
} else if (strcmp(optarg, "bin") == 0) {
input_type = OPT_INPUT_BINARY;
} else {
fprintf(stderr, "invalid value for --type: %s\n", optarg);
goto end;
}
break;
case 'h':
help = true;
print_help(argv[0], stderr);
goto end;
case 0:
break;
case ':':
fprintf(stderr, "%s: option `-%c' requires an argument\n",
argv[0], optopt);
goto end;
case '?':
default:
fprintf(stderr, "%s: option `-%c' is invalid: ignored\n",
argv[0], optopt);
goto end;
}
}
if (help || !file_path || !pci_id) {
print_help(argv[0], stderr);
exit(0);
}
struct intel_device_info devinfo;
if (!intel_get_device_info_from_pci_id(pci_id, &devinfo)) {
fprintf(stderr, "can't find device information: pci_id=0x%x\n", pci_id);
exit(EXIT_FAILURE);
}
struct brw_isa_info isa;
brw_init_isa_info(&isa, &devinfo);
if (input_type == OPT_INPUT_BINARY)
assembly = i965_disasm_read_binary(fp, &end);
else if (input_type == OPT_INPUT_C_LITERAL)
assembly = i965_disasm_read_c_literal_file(fp, &end);
if (!assembly) {
if (end)
fprintf(stderr, "Unable to allocate buffer to read input file\n");
else
fprintf(stderr, "Failed to read input file\n");
goto end;
}
/* Disassemble i965 instructions from buffer assembly */
brw_disassemble_with_labels(&isa, assembly, start, end, stdout);
result = EXIT_SUCCESS;
end:
if (fp)
fclose(fp);
free(file_path);
free(assembly);
exit(result);
}


@@ -0,0 +1,856 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include "brw_disasm.h"
#include "brw_eu_defines.h"
#include "brw_eu.h"
#include "brw_shader.h"
#include "intel_gfx_ver_enum.h"
#include "dev/intel_debug.h"
#include "util/u_debug.h"
#include "util/ralloc.h"
/* Returns a conditional modifier that negates the condition. */
enum brw_conditional_mod
brw_negate_cmod(enum brw_conditional_mod cmod)
{
switch (cmod) {
case BRW_CONDITIONAL_Z:
return BRW_CONDITIONAL_NZ;
case BRW_CONDITIONAL_NZ:
return BRW_CONDITIONAL_Z;
case BRW_CONDITIONAL_G:
return BRW_CONDITIONAL_LE;
case BRW_CONDITIONAL_GE:
return BRW_CONDITIONAL_L;
case BRW_CONDITIONAL_L:
return BRW_CONDITIONAL_GE;
case BRW_CONDITIONAL_LE:
return BRW_CONDITIONAL_G;
default:
unreachable("Can't negate this cmod");
}
}
/* Returns the corresponding conditional mod for swapping src0 and
* src1 in e.g. CMP.
*/
enum brw_conditional_mod
brw_swap_cmod(enum brw_conditional_mod cmod)
{
switch (cmod) {
case BRW_CONDITIONAL_Z:
case BRW_CONDITIONAL_NZ:
return cmod;
case BRW_CONDITIONAL_G:
return BRW_CONDITIONAL_L;
case BRW_CONDITIONAL_GE:
return BRW_CONDITIONAL_LE;
case BRW_CONDITIONAL_L:
return BRW_CONDITIONAL_G;
case BRW_CONDITIONAL_LE:
return BRW_CONDITIONAL_GE;
default:
return BRW_CONDITIONAL_NONE;
}
}
/**
* Get the least significant bit offset of the i+1-th component of immediate
* type \p type. For \p i equal to the two's complement of j, return the
* offset of the j-th component starting from the end of the vector. For
* scalar register types return zero.
*/
static unsigned
imm_shift(enum brw_reg_type type, unsigned i)
{
assert(type != BRW_REGISTER_TYPE_UV && type != BRW_REGISTER_TYPE_V &&
"Not implemented.");
if (type == BRW_REGISTER_TYPE_VF)
return 8 * (i & 3);
else
return 0;
}
/**
* Swizzle an arbitrary immediate \p x of the given type according to the
* permutation specified as \p swz.
*/
uint32_t
brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz)
{
if (imm_shift(type, 1)) {
const unsigned n = 32 / imm_shift(type, 1);
uint32_t y = 0;
for (unsigned i = 0; i < n; i++) {
/* Shift the specified component all the way to the right and left to
* discard any undesired L/MSBs, then shift it right into component i.
*/
y |= x >> imm_shift(type, (i & ~3) + BRW_GET_SWZ(swz, i & 3))
<< imm_shift(type, ~0u)
>> imm_shift(type, ~0u - i);
}
return y;
} else {
return x;
}
}
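/* Worked example (illustrative): for a VF immediate 0x44332211, whose 8-bit
* components are {x=0x11, y=0x22, z=0x33, w=0x44}, a swizzle selecting
* (y, x, w, z) produces {0x22, 0x11, 0x44, 0x33}, i.e. 0x33441122.
*/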
unsigned
brw_get_default_exec_size(struct brw_codegen *p)
{
return p->current->exec_size;
}
unsigned
brw_get_default_group(struct brw_codegen *p)
{
return p->current->group;
}
unsigned
brw_get_default_access_mode(struct brw_codegen *p)
{
return p->current->access_mode;
}
struct tgl_swsb
brw_get_default_swsb(struct brw_codegen *p)
{
return p->current->swsb;
}
void
brw_set_default_exec_size(struct brw_codegen *p, unsigned value)
{
p->current->exec_size = value;
}
void brw_set_default_predicate_control(struct brw_codegen *p, enum brw_predicate pc)
{
p->current->predicate = pc;
}
void brw_set_default_predicate_inverse(struct brw_codegen *p, bool predicate_inverse)
{
p->current->pred_inv = predicate_inverse;
}
void brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg)
{
assert(subreg < 2);
p->current->flag_subreg = reg * 2 + subreg;
}
void brw_set_default_access_mode( struct brw_codegen *p, unsigned access_mode )
{
p->current->access_mode = access_mode;
}
void
brw_set_default_compression_control(struct brw_codegen *p,
enum brw_compression compression_control)
{
switch (compression_control) {
case BRW_COMPRESSION_NONE:
/* This is the "use the first set of bits of dmask/vmask/arf
* according to execsize" option.
*/
p->current->group = 0;
break;
case BRW_COMPRESSION_2NDHALF:
/* For SIMD8, this is "use the second set of 8 bits." */
p->current->group = 8;
break;
case BRW_COMPRESSION_COMPRESSED:
/* For SIMD16 instruction compression, use the first set of 16 bits
* since we don't do SIMD32 dispatch.
*/
p->current->group = 0;
break;
default:
unreachable("not reached");
}
if (p->devinfo->ver <= 6) {
p->current->compressed =
(compression_control == BRW_COMPRESSION_COMPRESSED);
}
}
/**
* Enable or disable instruction compression on the given instruction leaving
* the currently selected channel enable group untouched.
*/
void
brw_inst_set_compression(const struct intel_device_info *devinfo,
brw_inst *inst, bool on)
{
if (devinfo->ver >= 6) {
/* No-op, the EU will figure out for us whether the instruction needs to
* be compressed.
*/
} else {
/* The channel group and compression controls are non-orthogonal, there
* are two possible representations for uncompressed instructions and we
* may need to preserve the current one to avoid changing the selected
* channel group inadvertently.
*/
if (on)
brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_COMPRESSED);
else if (brw_inst_qtr_control(devinfo, inst)
== BRW_COMPRESSION_COMPRESSED)
brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
}
}
void
brw_set_default_compression(struct brw_codegen *p, bool on)
{
p->current->compressed = on;
}
/**
* Apply the range of channel enable signals given by
* [group, group + exec_size) to the instruction passed as argument.
*/
void
brw_inst_set_group(const struct intel_device_info *devinfo,
brw_inst *inst, unsigned group)
{
if (devinfo->ver >= 20) {
assert(group % 8 == 0 && group < 32);
brw_inst_set_qtr_control(devinfo, inst, group / 8);
} else if (devinfo->ver >= 7) {
assert(group % 4 == 0 && group < 32);
brw_inst_set_qtr_control(devinfo, inst, group / 8);
brw_inst_set_nib_control(devinfo, inst, (group / 4) % 2);
} else if (devinfo->ver == 6) {
assert(group % 8 == 0 && group < 32);
brw_inst_set_qtr_control(devinfo, inst, group / 8);
} else {
assert(group % 8 == 0 && group < 16);
/* The channel group and compression controls are non-orthogonal, there
* are two possible representations for group zero and we may need to
* preserve the current one to avoid changing the selected compression
* enable inadvertently.
*/
if (group == 8)
brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_2NDHALF);
else if (brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_2NDHALF)
brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
}
}
void
brw_set_default_group(struct brw_codegen *p, unsigned group)
{
p->current->group = group;
}
void brw_set_default_mask_control( struct brw_codegen *p, unsigned value )
{
p->current->mask_control = value;
}
void brw_set_default_saturate( struct brw_codegen *p, bool enable )
{
p->current->saturate = enable;
}
void brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value)
{
p->current->acc_wr_control = value;
}
void brw_set_default_swsb(struct brw_codegen *p, struct tgl_swsb value)
{
p->current->swsb = value;
}
void brw_push_insn_state( struct brw_codegen *p )
{
assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]);
*(p->current + 1) = *p->current;
p->current++;
}
void brw_pop_insn_state( struct brw_codegen *p )
{
assert(p->current != p->stack);
p->current--;
}
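/* Typical usage (illustrative sketch; dst/src are placeholder registers):
* bracket a temporary change to the default instruction state so it does
* not leak into later instructions:
*
*    brw_push_insn_state(p);
*    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
*    brw_MOV(p, dst, src);
*    brw_pop_insn_state(p);
*/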
/***********************************************************************
*/
void
brw_init_codegen(const struct brw_isa_info *isa,
struct brw_codegen *p, void *mem_ctx)
{
memset(p, 0, sizeof(*p));
p->isa = isa;
p->devinfo = isa->devinfo;
p->automatic_exec_sizes = true;
/*
* Set the initial instruction store array size to 1024. If that turns
* out not to be enough, brw_next_insn() will keep doubling the store
* size until we run out of memory.
*/
p->store_size = 1024;
p->store = rzalloc_array(mem_ctx, brw_inst, p->store_size);
p->nr_insn = 0;
p->current = p->stack;
memset(p->current, 0, sizeof(p->current[0]));
p->mem_ctx = mem_ctx;
/* Some defaults?
*/
brw_set_default_exec_size(p, BRW_EXECUTE_8);
brw_set_default_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */
brw_set_default_saturate(p, 0);
brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
/* Set up control flow stack */
p->if_stack_depth = 0;
p->if_stack_array_size = 16;
p->if_stack = rzalloc_array(mem_ctx, int, p->if_stack_array_size);
p->loop_stack_depth = 0;
p->loop_stack_array_size = 16;
p->loop_stack = rzalloc_array(mem_ctx, int, p->loop_stack_array_size);
p->if_depth_in_loop = rzalloc_array(mem_ctx, int, p->loop_stack_array_size);
}
const unsigned *brw_get_program( struct brw_codegen *p,
unsigned *sz )
{
*sz = p->next_insn_offset;
return (const unsigned *)p->store;
}
const struct brw_shader_reloc *
brw_get_shader_relocs(struct brw_codegen *p, unsigned *num_relocs)
{
*num_relocs = p->num_relocs;
return p->relocs;
}
DEBUG_GET_ONCE_OPTION(shader_bin_dump_path, "INTEL_SHADER_BIN_DUMP_PATH", NULL);
bool brw_should_dump_shader_bin(void)
{
return debug_get_option_shader_bin_dump_path() != NULL;
}
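/* Write [start_offset, end_offset) of `assembly` to
* $INTEL_SHADER_BIN_DUMP_PATH/<identifier>.bin, retrying short writes.
*/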
void brw_dump_shader_bin(void *assembly, int start_offset, int end_offset,
const char *identifier)
{
char *name = ralloc_asprintf(NULL, "%s/%s.bin",
debug_get_option_shader_bin_dump_path(),
identifier);
int fd = open(name, O_CREAT | O_WRONLY, 0777);
ralloc_free(name);
if (fd < 0)
return;
struct stat sb;
if (fstat(fd, &sb) != 0 || (!S_ISREG(sb.st_mode))) {
close(fd);
return;
}
size_t to_write = end_offset - start_offset;
void *write_ptr = assembly + start_offset;
while (to_write) {
ssize_t ret = write(fd, write_ptr, to_write);
if (ret <= 0) {
close(fd);
return;
}
to_write -= ret;
write_ptr += ret;
}
close(fd);
}
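/* If INTEL_SHADER_ASM_READ_PATH is set and contains <identifier>.bin,
* replace everything from start_offset onward with that file's contents
* and validate the substituted instructions.
*/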
bool brw_try_override_assembly(struct brw_codegen *p, int start_offset,
const char *identifier)
{
const char *read_path = getenv("INTEL_SHADER_ASM_READ_PATH");
if (!read_path) {
return false;
}
char *name = ralloc_asprintf(NULL, "%s/%s.bin", read_path, identifier);
int fd = open(name, O_RDONLY);
ralloc_free(name);
if (fd == -1) {
return false;
}
struct stat sb;
if (fstat(fd, &sb) != 0 || (!S_ISREG(sb.st_mode))) {
close(fd);
return false;
}
p->nr_insn -= (p->next_insn_offset - start_offset) / sizeof(brw_inst);
p->nr_insn += sb.st_size / sizeof(brw_inst);
p->next_insn_offset = start_offset + sb.st_size;
p->store_size = (start_offset + sb.st_size) / sizeof(brw_inst);
p->store = (brw_inst *)reralloc_size(p->mem_ctx, p->store, p->next_insn_offset);
assert(p->store);
ssize_t ret = read(fd, (char *)p->store + start_offset, sb.st_size);
close(fd);
if (ret != sb.st_size) {
return false;
}
ASSERTED bool valid =
brw_validate_instructions(p->isa, p->store,
start_offset, p->next_insn_offset,
NULL);
assert(valid);
return true;
}
const struct brw_label *
brw_find_label(const struct brw_label *root, int offset)
{
for (const struct brw_label *curr = root; curr != NULL; curr = curr->next) {
if (curr->offset == offset)
return curr;
}
return NULL;
}
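/* Insert a label for `offset` into the list unless one already exists,
* numbering it one past the current tail's number.
*/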
void
brw_create_label(struct brw_label **labels, int offset, void *mem_ctx)
{
if (*labels != NULL) {
struct brw_label *curr = *labels;
struct brw_label *prev;
do {
prev = curr;
if (curr->offset == offset)
return;
curr = curr->next;
} while (curr != NULL);
curr = ralloc(mem_ctx, struct brw_label);
curr->offset = offset;
curr->number = prev->number + 1;
curr->next = NULL;
prev->next = curr;
} else {
struct brw_label *root = ralloc(mem_ctx, struct brw_label);
root->number = 0;
root->offset = offset;
root->next = NULL;
*labels = root;
}
}
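/* Scan the assembly in [start, end) and create a label at every JIP/UIP
* jump target, so that the disassembly can print LABELn: markers.
*/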
const struct brw_label *
brw_label_assembly(const struct brw_isa_info *isa,
const void *assembly, int start, int end, void *mem_ctx)
{
const struct intel_device_info *const devinfo = isa->devinfo;
struct brw_label *root_label = NULL;
int to_bytes_scale = sizeof(brw_inst) / brw_jump_scale(devinfo);
for (int offset = start; offset < end;) {
const brw_inst *inst = (const brw_inst *) ((const char *) assembly + offset);
brw_inst uncompacted;
bool is_compact = brw_inst_cmpt_control(devinfo, inst);
if (is_compact) {
brw_compact_inst *compacted = (brw_compact_inst *)inst;
brw_uncompact_instruction(isa, &uncompacted, compacted);
inst = &uncompacted;
}
if (brw_has_uip(devinfo, brw_inst_opcode(isa, inst))) {
/* Instructions that have UIP also have JIP. */
brw_create_label(&root_label,
offset + brw_inst_uip(devinfo, inst) * to_bytes_scale, mem_ctx);
brw_create_label(&root_label,
offset + brw_inst_jip(devinfo, inst) * to_bytes_scale, mem_ctx);
} else if (brw_has_jip(devinfo, brw_inst_opcode(isa, inst))) {
int jip;
if (devinfo->ver >= 7) {
jip = brw_inst_jip(devinfo, inst);
} else {
jip = brw_inst_gfx6_jump_count(devinfo, inst);
}
brw_create_label(&root_label, offset + jip * to_bytes_scale, mem_ctx);
}
if (is_compact) {
offset += sizeof(brw_compact_inst);
} else {
offset += sizeof(brw_inst);
}
}
return root_label;
}
void
brw_disassemble_with_labels(const struct brw_isa_info *isa,
const void *assembly, int start, int end, FILE *out)
{
void *mem_ctx = ralloc_context(NULL);
const struct brw_label *root_label =
brw_label_assembly(isa, assembly, start, end, mem_ctx);
brw_disassemble(isa, assembly, start, end, root_label, out);
ralloc_free(mem_ctx);
}
void
brw_disassemble(const struct brw_isa_info *isa,
const void *assembly, int start, int end,
const struct brw_label *root_label, FILE *out)
{
const struct intel_device_info *devinfo = isa->devinfo;
bool dump_hex = INTEL_DEBUG(DEBUG_HEX);
for (int offset = start; offset < end;) {
const brw_inst *insn = (const brw_inst *)((char *)assembly + offset);
brw_inst uncompacted;
if (root_label != NULL) {
const struct brw_label *label = brw_find_label(root_label, offset);
if (label != NULL) {
fprintf(out, "\nLABEL%d:\n", label->number);
}
}
bool compacted = brw_inst_cmpt_control(devinfo, insn);
if (0)
fprintf(out, "0x%08x: ", offset);
if (compacted) {
brw_compact_inst *compacted = (brw_compact_inst *)insn;
if (dump_hex) {
unsigned char * insn_ptr = ((unsigned char *)&insn[0]);
const unsigned int blank_spaces = 24;
for (int i = 0 ; i < 8; i = i + 4) {
fprintf(out, "%02x %02x %02x %02x ",
insn_ptr[i],
insn_ptr[i + 1],
insn_ptr[i + 2],
insn_ptr[i + 3]);
}
/* Pad the hex output of compacted instructions so it stays vertically
* aligned with the hex output of uncompacted instructions.
*/
fprintf(out, "%*c", blank_spaces, ' ');
}
brw_uncompact_instruction(isa, &uncompacted, compacted);
insn = &uncompacted;
} else {
if (dump_hex) {
unsigned char * insn_ptr = ((unsigned char *)&insn[0]);
for (int i = 0 ; i < 16; i = i + 4) {
fprintf(out, "%02x %02x %02x %02x ",
insn_ptr[i],
insn_ptr[i + 1],
insn_ptr[i + 2],
insn_ptr[i + 3]);
}
}
}
brw_disassemble_inst(out, isa, insn, compacted, offset, root_label);
if (compacted) {
offset += sizeof(brw_compact_inst);
} else {
offset += sizeof(brw_inst);
}
}
}
static const struct opcode_desc opcode_descs[] = {
/* IR, HW, name, nsrc, ndst, gfx_vers */
{ BRW_OPCODE_ILLEGAL, 0, "illegal", 0, 0, GFX_ALL },
{ BRW_OPCODE_SYNC, 1, "sync", 1, 0, GFX_GE(GFX12) },
{ BRW_OPCODE_MOV, 1, "mov", 1, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_MOV, 97, "mov", 1, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_SEL, 2, "sel", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_SEL, 98, "sel", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_MOVI, 3, "movi", 2, 1, GFX_GE(GFX45) & GFX_LT(GFX12) },
{ BRW_OPCODE_MOVI, 99, "movi", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_NOT, 4, "not", 1, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_NOT, 100, "not", 1, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_AND, 5, "and", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_AND, 101, "and", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_OR, 6, "or", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_OR, 102, "or", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_XOR, 7, "xor", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_XOR, 103, "xor", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_SHR, 8, "shr", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_SHR, 104, "shr", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_SHL, 9, "shl", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_SHL, 105, "shl", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_DIM, 10, "dim", 1, 1, GFX75 },
{ BRW_OPCODE_SMOV, 10, "smov", 0, 0, GFX_GE(GFX8) & GFX_LT(GFX12) },
{ BRW_OPCODE_SMOV, 106, "smov", 0, 0, GFX_GE(GFX12) },
{ BRW_OPCODE_ASR, 12, "asr", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_ASR, 108, "asr", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_ROR, 14, "ror", 2, 1, GFX11 },
{ BRW_OPCODE_ROR, 110, "ror", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_ROL, 15, "rol", 2, 1, GFX11 },
{ BRW_OPCODE_ROL, 111, "rol", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_CMP, 16, "cmp", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_CMP, 112, "cmp", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_CMPN, 17, "cmpn", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_CMPN, 113, "cmpn", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_CSEL, 18, "csel", 3, 1, GFX_GE(GFX8) & GFX_LT(GFX12) },
{ BRW_OPCODE_CSEL, 114, "csel", 3, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_F32TO16, 19, "f32to16", 1, 1, GFX7 | GFX75 },
{ BRW_OPCODE_F16TO32, 20, "f16to32", 1, 1, GFX7 | GFX75 },
{ BRW_OPCODE_BFREV, 23, "bfrev", 1, 1, GFX_GE(GFX7) & GFX_LT(GFX12) },
{ BRW_OPCODE_BFREV, 119, "bfrev", 1, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_BFE, 24, "bfe", 3, 1, GFX_GE(GFX7) & GFX_LT(GFX12) },
{ BRW_OPCODE_BFE, 120, "bfe", 3, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_BFI1, 25, "bfi1", 2, 1, GFX_GE(GFX7) & GFX_LT(GFX12) },
{ BRW_OPCODE_BFI1, 121, "bfi1", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_BFI2, 26, "bfi2", 3, 1, GFX_GE(GFX7) & GFX_LT(GFX12) },
{ BRW_OPCODE_BFI2, 122, "bfi2", 3, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_JMPI, 32, "jmpi", 0, 0, GFX_ALL },
{ BRW_OPCODE_BRD, 33, "brd", 0, 0, GFX_GE(GFX7) },
{ BRW_OPCODE_IF, 34, "if", 0, 0, GFX_ALL },
{ BRW_OPCODE_IFF, 35, "iff", 0, 0, GFX_LE(GFX5) },
{ BRW_OPCODE_BRC, 35, "brc", 0, 0, GFX_GE(GFX7) },
{ BRW_OPCODE_ELSE, 36, "else", 0, 0, GFX_ALL },
{ BRW_OPCODE_ENDIF, 37, "endif", 0, 0, GFX_ALL },
{ BRW_OPCODE_DO, 38, "do", 0, 0, GFX_LE(GFX5) },
{ BRW_OPCODE_CASE, 38, "case", 0, 0, GFX6 },
{ BRW_OPCODE_WHILE, 39, "while", 0, 0, GFX_ALL },
{ BRW_OPCODE_BREAK, 40, "break", 0, 0, GFX_ALL },
{ BRW_OPCODE_CONTINUE, 41, "cont", 0, 0, GFX_ALL },
{ BRW_OPCODE_HALT, 42, "halt", 0, 0, GFX_ALL },
{ BRW_OPCODE_CALLA, 43, "calla", 0, 0, GFX_GE(GFX75) },
{ BRW_OPCODE_MSAVE, 44, "msave", 0, 0, GFX_LE(GFX5) },
{ BRW_OPCODE_CALL, 44, "call", 0, 0, GFX_GE(GFX6) },
{ BRW_OPCODE_MREST, 45, "mrest", 0, 0, GFX_LE(GFX5) },
{ BRW_OPCODE_RET, 45, "ret", 0, 0, GFX_GE(GFX6) },
{ BRW_OPCODE_PUSH, 46, "push", 0, 0, GFX_LE(GFX5) },
{ BRW_OPCODE_FORK, 46, "fork", 0, 0, GFX6 },
{ BRW_OPCODE_GOTO, 46, "goto", 0, 0, GFX_GE(GFX8) },
{ BRW_OPCODE_POP, 47, "pop", 2, 0, GFX_LE(GFX5) },
{ BRW_OPCODE_WAIT, 48, "wait", 0, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_SEND, 49, "send", 1, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_SENDC, 50, "sendc", 1, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_SEND, 49, "send", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_SENDC, 50, "sendc", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_SENDS, 51, "sends", 2, 1, GFX_GE(GFX9) & GFX_LT(GFX12) },
{ BRW_OPCODE_SENDSC, 52, "sendsc", 2, 1, GFX_GE(GFX9) & GFX_LT(GFX12) },
{ BRW_OPCODE_MATH, 56, "math", 2, 1, GFX_GE(GFX6) },
{ BRW_OPCODE_ADD, 64, "add", 2, 1, GFX_ALL },
{ BRW_OPCODE_MUL, 65, "mul", 2, 1, GFX_ALL },
{ BRW_OPCODE_AVG, 66, "avg", 2, 1, GFX_ALL },
{ BRW_OPCODE_FRC, 67, "frc", 1, 1, GFX_ALL },
{ BRW_OPCODE_RNDU, 68, "rndu", 1, 1, GFX_ALL },
{ BRW_OPCODE_RNDD, 69, "rndd", 1, 1, GFX_ALL },
{ BRW_OPCODE_RNDE, 70, "rnde", 1, 1, GFX_ALL },
{ BRW_OPCODE_RNDZ, 71, "rndz", 1, 1, GFX_ALL },
{ BRW_OPCODE_MAC, 72, "mac", 2, 1, GFX_ALL },
{ BRW_OPCODE_MACH, 73, "mach", 2, 1, GFX_ALL },
{ BRW_OPCODE_LZD, 74, "lzd", 1, 1, GFX_ALL },
{ BRW_OPCODE_FBH, 75, "fbh", 1, 1, GFX_GE(GFX7) },
{ BRW_OPCODE_FBL, 76, "fbl", 1, 1, GFX_GE(GFX7) },
{ BRW_OPCODE_CBIT, 77, "cbit", 1, 1, GFX_GE(GFX7) },
{ BRW_OPCODE_ADDC, 78, "addc", 2, 1, GFX_GE(GFX7) },
{ BRW_OPCODE_SUBB, 79, "subb", 2, 1, GFX_GE(GFX7) },
{ BRW_OPCODE_SAD2, 80, "sad2", 2, 1, GFX_ALL },
{ BRW_OPCODE_SADA2, 81, "sada2", 2, 1, GFX_ALL },
{ BRW_OPCODE_ADD3, 82, "add3", 3, 1, GFX_GE(GFX125) },
{ BRW_OPCODE_DP4, 84, "dp4", 2, 1, GFX_LT(GFX11) },
{ BRW_OPCODE_DPH, 85, "dph", 2, 1, GFX_LT(GFX11) },
{ BRW_OPCODE_DP3, 86, "dp3", 2, 1, GFX_LT(GFX11) },
{ BRW_OPCODE_DP2, 87, "dp2", 2, 1, GFX_LT(GFX11) },
{ BRW_OPCODE_DP4A, 88, "dp4a", 3, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_LINE, 89, "line", 2, 1, GFX_LE(GFX10) },
{ BRW_OPCODE_DPAS, 89, "dpas", 3, 1, GFX_GE(GFX125) },
{ BRW_OPCODE_PLN, 90, "pln", 2, 1, GFX_GE(GFX45) & GFX_LE(GFX10) },
{ BRW_OPCODE_MAD, 91, "mad", 3, 1, GFX_GE(GFX6) },
{ BRW_OPCODE_LRP, 92, "lrp", 3, 1, GFX_GE(GFX6) & GFX_LE(GFX10) },
{ BRW_OPCODE_MADM, 93, "madm", 3, 1, GFX_GE(GFX8) },
{ BRW_OPCODE_NENOP, 125, "nenop", 0, 0, GFX45 },
{ BRW_OPCODE_NOP, 126, "nop", 0, 0, GFX_LT(GFX12) },
{ BRW_OPCODE_NOP, 96, "nop", 0, 0, GFX_GE(GFX12) }
};
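/* Build the per-device IR<->HW opcode lookup tables from opcode_descs,
* keeping only entries whose gfx_vers mask matches this device's
* generation.
*/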
void
brw_init_isa_info(struct brw_isa_info *isa,
const struct intel_device_info *devinfo)
{
isa->devinfo = devinfo;
enum gfx_ver ver = gfx_ver_from_devinfo(devinfo);
memset(isa->ir_to_descs, 0, sizeof(isa->ir_to_descs));
memset(isa->hw_to_descs, 0, sizeof(isa->hw_to_descs));
for (unsigned i = 0; i < ARRAY_SIZE(opcode_descs); i++) {
if (opcode_descs[i].gfx_vers & ver) {
const unsigned e = opcode_descs[i].ir;
const unsigned h = opcode_descs[i].hw;
assert(e < ARRAY_SIZE(isa->ir_to_descs) && !isa->ir_to_descs[e]);
assert(h < ARRAY_SIZE(isa->hw_to_descs) && !isa->hw_to_descs[h]);
isa->ir_to_descs[e] = &opcode_descs[i];
isa->hw_to_descs[h] = &opcode_descs[i];
}
}
}
/**
* Return the matching opcode_desc for the specified IR opcode and hardware
* generation, or NULL if the opcode is not supported by the device.
*/
const struct opcode_desc *
brw_opcode_desc(const struct brw_isa_info *isa, enum opcode op)
{
return op < ARRAY_SIZE(isa->ir_to_descs) ? isa->ir_to_descs[op] : NULL;
}
/**
* Return the matching opcode_desc for the specified HW opcode and hardware
* generation, or NULL if the opcode is not supported by the device.
*/
const struct opcode_desc *
brw_opcode_desc_from_hw(const struct brw_isa_info *isa, unsigned hw)
{
return hw < ARRAY_SIZE(isa->hw_to_descs) ? isa->hw_to_descs[hw] : NULL;
}
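/* Return the number of sources read by `inst`, resolving the cases (MATH,
* and SEND before Gfx6) where the count depends on more than the opcode
* table entry.
*/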
unsigned
brw_num_sources_from_inst(const struct brw_isa_info *isa,
const brw_inst *inst)
{
const struct intel_device_info *devinfo = isa->devinfo;
const struct opcode_desc *desc =
brw_opcode_desc(isa, brw_inst_opcode(isa, inst));
unsigned math_function;
if (brw_inst_opcode(isa, inst) == BRW_OPCODE_MATH) {
math_function = brw_inst_math_function(devinfo, inst);
} else if (devinfo->ver < 6 &&
brw_inst_opcode(isa, inst) == BRW_OPCODE_SEND) {
if (brw_inst_sfid(devinfo, inst) == BRW_SFID_MATH) {
/* src1 must be a descriptor (including the information to determine
* that the SEND is doing an extended math operation), but src0 can
* actually be null since it serves as the source of the implicit GRF
* to MRF move.
*
* If we stop using that functionality, we'll have to revisit this.
*/
return 2;
} else {
/* Send instructions are allowed to have null sources since they use
* the base_mrf field to specify the message register source.
*/
return 0;
}
} else {
assert(desc->nsrc < 4);
return desc->nsrc;
}
switch (math_function) {
case BRW_MATH_FUNCTION_INV:
case BRW_MATH_FUNCTION_LOG:
case BRW_MATH_FUNCTION_EXP:
case BRW_MATH_FUNCTION_SQRT:
case BRW_MATH_FUNCTION_RSQ:
case BRW_MATH_FUNCTION_SIN:
case BRW_MATH_FUNCTION_COS:
case BRW_MATH_FUNCTION_SINCOS:
case GFX8_MATH_FUNCTION_INVM:
case GFX8_MATH_FUNCTION_RSQRTM:
return 1;
case BRW_MATH_FUNCTION_FDIV:
case BRW_MATH_FUNCTION_POW:
case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
return 2;
default:
unreachable("not reached");
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,119 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include "brw_eu_defines.h"
#include "brw_eu.h"
void brw_math_invert( struct brw_codegen *p,
struct brw_reg dst,
struct brw_reg src)
{
gfx4_math(p,
dst,
BRW_MATH_FUNCTION_INV,
0,
src,
BRW_MATH_PRECISION_FULL);
}
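/* Copy `count` 32-byte chunks from src to dst, as pairs of 16-byte vec4
* MOVs.
*/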
void brw_copy4(struct brw_codegen *p,
struct brw_reg dst,
struct brw_reg src,
unsigned count)
{
unsigned i;
dst = vec4(dst);
src = vec4(src);
for (i = 0; i < count; i++)
{
unsigned delta = i*32;
brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta));
brw_MOV(p, byte_offset(dst, delta+16), byte_offset(src, delta+16));
}
}
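/* Copy `count` 32-byte chunks from src to dst, one 32-byte vec8 MOV each. */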
void brw_copy8(struct brw_codegen *p,
struct brw_reg dst,
struct brw_reg src,
unsigned count)
{
unsigned i;
dst = vec8(dst);
src = vec8(src);
for (i = 0; i < count; i++)
{
unsigned delta = i*32;
brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta));
}
}
void brw_copy_indirect_to_indirect(struct brw_codegen *p,
struct brw_indirect dst_ptr,
struct brw_indirect src_ptr,
unsigned count)
{
unsigned i;
for (i = 0; i < count; i++)
{
unsigned delta = i*32;
brw_MOV(p, deref_4f(dst_ptr, delta), deref_4f(src_ptr, delta));
brw_MOV(p, deref_4f(dst_ptr, delta+16), deref_4f(src_ptr, delta+16));
}
}
void brw_copy_from_indirect(struct brw_codegen *p,
struct brw_reg dst,
struct brw_indirect ptr,
unsigned count)
{
unsigned i;
dst = vec4(dst);
for (i = 0; i < count; i++)
{
unsigned delta = i*32;
brw_MOV(p, byte_offset(dst, delta), deref_4f(ptr, delta));
brw_MOV(p, byte_offset(dst, delta+16), deref_4f(ptr, delta+16));
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,637 @@
/*
* Copyright © 2010 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#ifndef BRW_FS_H
#define BRW_FS_H
#include "brw_shader.h"
#include "brw_ir_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_ir_performance.h"
#include "compiler/nir/nir.h"
struct bblock_t;
namespace {
struct acp_entry;
}
class fs_visitor;
namespace brw {
/**
* Register pressure analysis of a shader. Estimates how many registers
* are live at any point of the program in GRF units.
*/
struct register_pressure {
register_pressure(const fs_visitor *v);
~register_pressure();
analysis_dependency_class
dependency_class() const
{
return (DEPENDENCY_INSTRUCTION_IDENTITY |
DEPENDENCY_INSTRUCTION_DATA_FLOW |
DEPENDENCY_VARIABLES);
}
bool
validate(const fs_visitor *) const
{
/* FINISHME */
return true;
}
unsigned *regs_live_at_ip;
};
}
struct brw_gs_compile;
namespace brw {
class fs_builder;
}
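/* Per-compile statistics gathered during code generation, e.g. for
* shader-db style reporting.
*/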
struct shader_stats {
const char *scheduler_mode;
unsigned promoted_constants;
unsigned spill_count;
unsigned fill_count;
unsigned max_register_pressure;
};
/** Register numbers for thread payload fields. */
struct thread_payload {
/** The number of thread payload registers the hardware will supply. */
uint8_t num_regs;
virtual ~thread_payload() = default;
protected:
thread_payload() : num_regs() {}
};
struct vs_thread_payload : public thread_payload {
vs_thread_payload(const fs_visitor &v);
fs_reg urb_handles;
};
struct tcs_thread_payload : public thread_payload {
tcs_thread_payload(const fs_visitor &v);
fs_reg patch_urb_output;
fs_reg primitive_id;
fs_reg icp_handle_start;
};
struct tes_thread_payload : public thread_payload {
tes_thread_payload(const fs_visitor &v);
fs_reg patch_urb_input;
fs_reg primitive_id;
fs_reg coords[3];
fs_reg urb_output;
};
struct gs_thread_payload : public thread_payload {
gs_thread_payload(fs_visitor &v);
fs_reg urb_handles;
fs_reg primitive_id;
fs_reg instance_id;
fs_reg icp_handle_start;
};
struct fs_thread_payload : public thread_payload {
fs_thread_payload(const fs_visitor &v,
bool &source_depth_to_render_target,
bool &runtime_check_aads_emit);
uint8_t subspan_coord_reg[2];
uint8_t source_depth_reg[2];
uint8_t source_w_reg[2];
uint8_t aa_dest_stencil_reg[2];
uint8_t dest_depth_reg[2];
uint8_t sample_pos_reg[2];
uint8_t sample_mask_in_reg[2];
uint8_t depth_w_coef_reg;
uint8_t barycentric_coord_reg[BRW_BARYCENTRIC_MODE_COUNT][2];
};
struct cs_thread_payload : public thread_payload {
cs_thread_payload(const fs_visitor &v);
void load_subgroup_id(const brw::fs_builder &bld, fs_reg &dest) const;
fs_reg local_invocation_id[3];
protected:
fs_reg subgroup_id_;
};
struct task_mesh_thread_payload : public cs_thread_payload {
task_mesh_thread_payload(fs_visitor &v);
fs_reg extended_parameter_0;
fs_reg local_index;
fs_reg inline_parameter;
fs_reg urb_output;
/* URB handle used to read Task memory inputs. Only valid in the MESH stage. */
fs_reg task_urb_input;
};
struct bs_thread_payload : public thread_payload {
bs_thread_payload(const fs_visitor &v);
fs_reg global_arg_ptr;
fs_reg local_arg_ptr;
void load_shader_type(const brw::fs_builder &bld, fs_reg &dest) const;
};
class fs_instruction_scheduler;
/**
* The fragment shader front-end.
*
* Translates NIR into FS IR.
*/
class fs_visitor : public backend_shader
{
public:
fs_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const brw_base_prog_key *key,
struct brw_stage_prog_data *prog_data,
const nir_shader *shader,
unsigned dispatch_width,
bool needs_register_pressure,
bool debug_enabled);
fs_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const brw_wm_prog_key *key,
struct brw_wm_prog_data *prog_data,
const nir_shader *shader,
unsigned dispatch_width,
unsigned num_polygons,
bool needs_register_pressure,
bool debug_enabled);
fs_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
struct brw_gs_compile *gs_compile,
struct brw_gs_prog_data *prog_data,
const nir_shader *shader,
bool needs_register_pressure,
bool debug_enabled);
void init();
~fs_visitor();
fs_reg vgrf(const glsl_type *const type);
void import_uniforms(fs_visitor *v);
void VARYING_PULL_CONSTANT_LOAD(const brw::fs_builder &bld,
const fs_reg &dst,
const fs_reg &surface,
const fs_reg &surface_handle,
const fs_reg &varying_offset,
uint32_t const_offset,
uint8_t alignment,
unsigned components);
void DEP_RESOLVE_MOV(const brw::fs_builder &bld, int grf);
bool run_fs(bool allow_spilling, bool do_rep_send);
bool run_vs();
bool run_tcs();
bool run_tes();
bool run_gs();
bool run_cs(bool allow_spilling);
bool run_bs(bool allow_spilling);
bool run_task(bool allow_spilling);
bool run_mesh(bool allow_spilling);
void optimize();
void allocate_registers(bool allow_spilling);
uint32_t compute_max_register_pressure();
bool fixup_sends_duplicate_payload();
void fixup_3src_null_dest();
void emit_dummy_memory_fence_before_eot();
void emit_dummy_mov_instruction();
bool fixup_nomask_control_flow();
void assign_curb_setup();
void assign_urb_setup();
void convert_attr_sources_to_hw_regs(fs_inst *inst);
void assign_vs_urb_setup();
void assign_tcs_urb_setup();
void assign_tes_urb_setup();
void assign_gs_urb_setup();
bool assign_regs(bool allow_spilling, bool spill_all);
void assign_regs_trivial();
void calculate_payload_ranges(unsigned payload_node_count,
int *payload_last_use_ip) const;
bool split_virtual_grfs();
bool compact_virtual_grfs();
void assign_constant_locations();
bool get_pull_locs(const fs_reg &src, unsigned *out_surf_index,
unsigned *out_pull_index);
bool lower_constant_loads();
virtual void invalidate_analysis(brw::analysis_dependency_class c);
#ifndef NDEBUG
void validate();
#else
void validate() {}
#endif
bool opt_algebraic();
bool opt_redundant_halt();
bool opt_cse();
bool opt_cse_local(const brw::fs_live_variables &live, bblock_t *block, int &ip);
bool opt_copy_propagation();
bool opt_bank_conflicts();
bool opt_split_sends();
bool register_coalesce();
bool compute_to_mrf();
bool eliminate_find_live_channel();
bool dead_code_eliminate();
bool remove_duplicate_mrf_writes();
bool remove_extra_rounding_modes();
fs_instruction_scheduler *prepare_scheduler(void *mem_ctx);
void schedule_instructions_pre_ra(fs_instruction_scheduler *sched,
instruction_scheduler_mode mode);
void schedule_instructions_post_ra();
void insert_gfx4_send_dependency_workarounds();
void insert_gfx4_pre_send_dependency_workarounds(bblock_t *block,
fs_inst *inst);
void insert_gfx4_post_send_dependency_workarounds(bblock_t *block,
fs_inst *inst);
void vfail(const char *msg, va_list args);
void fail(const char *msg, ...);
void limit_dispatch_width(unsigned n, const char *msg);
bool lower_uniform_pull_constant_loads();
bool lower_load_payload();
bool lower_pack();
bool lower_regioning();
bool lower_logical_sends();
bool lower_integer_multiplication();
bool lower_minmax();
bool lower_simd_width();
bool lower_barycentrics();
bool lower_derivatives();
bool lower_find_live_channel();
bool lower_scoreboard();
bool lower_sub_sat();
bool opt_combine_constants();
void emit_repclear_shader();
void emit_interpolation_setup_gfx4();
void emit_interpolation_setup_gfx6();
bool opt_peephole_sel();
bool opt_saturate_propagation();
bool opt_cmod_propagation();
bool opt_zero_samples();
void set_tcs_invocation_id();
void emit_alpha_test();
fs_inst *emit_single_fb_write(const brw::fs_builder &bld,
fs_reg color1, fs_reg color2,
fs_reg src0_alpha, unsigned components);
void do_emit_fb_writes(int nr_color_regions, bool replicate_alpha);
void emit_fb_writes();
void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg());
void emit_gs_control_data_bits(const fs_reg &vertex_count);
void emit_gs_thread_end();
bool mark_last_urb_write_with_eot();
void emit_tcs_thread_end();
void emit_urb_fence();
void emit_cs_terminate();
fs_reg interp_reg(const brw::fs_builder &bld, unsigned location,
unsigned channel, unsigned comp);
fs_reg per_primitive_reg(const brw::fs_builder &bld,
int location, unsigned comp);
virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const;
virtual void dump_instructions_to_file(FILE *file) const;
const brw_base_prog_key *const key;
const struct brw_sampler_prog_key_data *key_tex;
struct brw_gs_compile *gs_compile;
struct brw_stage_prog_data *prog_data;
brw_analysis<brw::fs_live_variables, backend_shader> live_analysis;
brw_analysis<brw::register_pressure, fs_visitor> regpressure_analysis;
brw_analysis<brw::performance, fs_visitor> performance_analysis;
/** Number of uniform variable components visited. */
unsigned uniforms;
/** Byte-offset for the next available spot in the scratch space buffer. */
unsigned last_scratch;
/**
* Array mapping UNIFORM register numbers to the push parameter index,
* or -1 if this uniform register isn't being uploaded as a push constant.
*/
int *push_constant_loc;
fs_reg frag_depth;
fs_reg frag_stencil;
fs_reg sample_mask;
fs_reg outputs[VARYING_SLOT_MAX];
fs_reg dual_src_output;
int first_non_payload_grf;
/** Either BRW_MAX_GRF or GFX7_MRF_HACK_START */
unsigned max_grf;
bool failed;
char *fail_msg;
thread_payload *payload_;
thread_payload &payload() {
return *this->payload_;
}
vs_thread_payload &vs_payload() {
assert(stage == MESA_SHADER_VERTEX);
return *static_cast<vs_thread_payload *>(this->payload_);
}
tcs_thread_payload &tcs_payload() {
assert(stage == MESA_SHADER_TESS_CTRL);
return *static_cast<tcs_thread_payload *>(this->payload_);
}
tes_thread_payload &tes_payload() {
assert(stage == MESA_SHADER_TESS_EVAL);
return *static_cast<tes_thread_payload *>(this->payload_);
}
gs_thread_payload &gs_payload() {
assert(stage == MESA_SHADER_GEOMETRY);
return *static_cast<gs_thread_payload *>(this->payload_);
}
fs_thread_payload &fs_payload() {
assert(stage == MESA_SHADER_FRAGMENT);
return *static_cast<fs_thread_payload *>(this->payload_);
}
cs_thread_payload &cs_payload() {
assert(gl_shader_stage_uses_workgroup(stage));
return *static_cast<cs_thread_payload *>(this->payload_);
}
task_mesh_thread_payload &task_mesh_payload() {
assert(stage == MESA_SHADER_TASK || stage == MESA_SHADER_MESH);
return *static_cast<task_mesh_thread_payload *>(this->payload_);
}
bs_thread_payload &bs_payload() {
assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);
return *static_cast<bs_thread_payload *>(this->payload_);
}
bool source_depth_to_render_target;
bool runtime_check_aads_emit;
fs_reg pixel_x;
fs_reg pixel_y;
fs_reg pixel_z;
fs_reg wpos_w;
fs_reg pixel_w;
fs_reg delta_xy[BRW_BARYCENTRIC_MODE_COUNT];
fs_reg final_gs_vertex_count;
fs_reg control_data_bits;
fs_reg invocation_id;
unsigned grf_used;
bool spilled_any_registers;
bool needs_register_pressure;
const unsigned dispatch_width; /**< 8, 16 or 32 */
const unsigned max_polygons;
unsigned max_dispatch_width;
/* The API-selected subgroup size. */
unsigned api_subgroup_size; /**< 0, 8, 16, 32 */
struct shader_stats shader_stats;
void lower_mul_dword_inst(fs_inst *inst, bblock_t *block);
void lower_mul_qword_inst(fs_inst *inst, bblock_t *block);
void lower_mulh_inst(fs_inst *inst, bblock_t *block);
unsigned workgroup_size() const;
void debug_optimizer(const nir_shader *nir,
const char *pass_name,
int iteration, int pass_num) const;
};
/**
* Return the flag register used in fragment shaders to keep track of live
* samples. On Gfx7+ we use f1.0-f1.1 to allow discard jumps in SIMD32
* dispatch mode, while earlier generations are constrained to f0.1, which
* limits the dispatch width to SIMD16 for fragment shaders that use discard.
*/
static inline unsigned
sample_mask_flag_subreg(const fs_visitor &s)
{
assert(s.stage == MESA_SHADER_FRAGMENT);
return s.devinfo->ver >= 7 ? 2 : 1;
}
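/* Illustrative note: the returned value is a linear flag subregister index
 * (nr = idx / 2, subnr = idx % 2 under the usual encoding), so 1 selects
 * f0.1 and 2 selects f1.0, matching the constraints described above.
 */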
/**
* The fragment shader code generator.
*
* Translates FS IR to actual i965 assembly code.
*/
class fs_generator
{
public:
fs_generator(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
struct brw_stage_prog_data *prog_data,
bool runtime_check_aads_emit,
gl_shader_stage stage);
~fs_generator();
void enable_debug(const char *shader_name);
int generate_code(const cfg_t *cfg, int dispatch_width,
struct shader_stats shader_stats,
const brw::performance &perf,
struct brw_compile_stats *stats,
unsigned max_polygons = 0);
void add_const_data(void *data, unsigned size);
void add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt);
const unsigned *get_assembly();
private:
void fire_fb_write(fs_inst *inst,
struct brw_reg payload,
struct brw_reg implied_header,
GLuint nr);
void generate_send(fs_inst *inst,
struct brw_reg dst,
struct brw_reg desc,
struct brw_reg ex_desc,
struct brw_reg payload,
struct brw_reg payload2);
void generate_fb_write(fs_inst *inst, struct brw_reg payload);
void generate_fb_read(fs_inst *inst, struct brw_reg dst,
struct brw_reg payload);
void generate_cs_terminate(fs_inst *inst, struct brw_reg payload);
void generate_barrier(fs_inst *inst, struct brw_reg src);
bool generate_linterp(fs_inst *inst, struct brw_reg dst,
struct brw_reg *src);
void generate_tex(fs_inst *inst, struct brw_reg dst,
struct brw_reg surface_index,
struct brw_reg sampler_index);
void generate_ddx(const fs_inst *inst,
struct brw_reg dst, struct brw_reg src);
void generate_ddy(const fs_inst *inst,
struct brw_reg dst, struct brw_reg src);
void generate_scratch_write(fs_inst *inst, struct brw_reg src);
void generate_scratch_read(fs_inst *inst, struct brw_reg dst);
void generate_scratch_read_gfx7(fs_inst *inst, struct brw_reg dst);
void generate_scratch_header(fs_inst *inst, struct brw_reg dst);
void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
struct brw_reg index,
struct brw_reg offset);
void generate_varying_pull_constant_load_gfx4(fs_inst *inst,
struct brw_reg dst,
struct brw_reg index);
void generate_set_sample_id(fs_inst *inst,
struct brw_reg dst,
struct brw_reg src0,
struct brw_reg src1);
void generate_halt(fs_inst *inst);
void generate_mov_indirect(fs_inst *inst,
struct brw_reg dst,
struct brw_reg reg,
struct brw_reg indirect_byte_offset);
void generate_shuffle(fs_inst *inst,
struct brw_reg dst,
struct brw_reg src,
struct brw_reg idx);
void generate_quad_swizzle(const fs_inst *inst,
struct brw_reg dst, struct brw_reg src,
unsigned swiz);
bool patch_halt_jumps();
const struct brw_compiler *compiler;
const struct brw_compile_params *params;
const struct intel_device_info *devinfo;
struct brw_codegen *p;
struct brw_stage_prog_data * const prog_data;
unsigned dispatch_width; /**< 8, 16 or 32 */
exec_list discard_halt_patches;
bool runtime_check_aads_emit;
bool debug_flag;
const char *shader_name;
gl_shader_stage stage;
void *mem_ctx;
};
namespace brw {
fs_reg
fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
brw_reg_type type = BRW_REGISTER_TYPE_F,
unsigned n = 1);
fs_reg
fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2]);
inline fs_reg
dynamic_msaa_flags(const struct brw_wm_prog_data *wm_prog_data)
{
return fs_reg(UNIFORM, wm_prog_data->msaa_flags_param,
BRW_REGISTER_TYPE_UD);
}
void
check_dynamic_msaa_flag(const fs_builder &bld,
const struct brw_wm_prog_data *wm_prog_data,
enum intel_msaa_flags flag);
bool
lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i);
}
void shuffle_from_32bit_read(const brw::fs_builder &bld,
const fs_reg &dst,
const fs_reg &src,
uint32_t first_component,
uint32_t components);
fs_reg setup_imm_df(const brw::fs_builder &bld,
double v);
fs_reg setup_imm_b(const brw::fs_builder &bld,
int8_t v);
fs_reg setup_imm_ub(const brw::fs_builder &bld,
uint8_t v);
enum brw_barycentric_mode brw_barycentric_mode(nir_intrinsic_instr *intr);
uint32_t brw_fb_write_msg_control(const fs_inst *inst,
const struct brw_wm_prog_data *prog_data);
void brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data);
bool brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width);
fs_reg brw_sample_mask_reg(const brw::fs_builder &bld);
void brw_emit_predicate_on_sample_mask(const brw::fs_builder &bld, fs_inst *inst);
int brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
const brw_stage_prog_data *prog_data);
bool brw_lower_dpas(fs_visitor &v);
void nir_to_brw(fs_visitor *s);
#endif /* BRW_FS_H */

View file

@ -0,0 +1,955 @@
/*
* Copyright © 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/** @file brw_fs_bank_conflicts.cpp
*
* This file contains a GRF bank conflict mitigation pass. The pass is
* intended to be run after register allocation and works by rearranging the
* layout of the GRF space (without altering the semantics of the program) in
* a way that minimizes the number of GRF bank conflicts incurred by ternary
* instructions.
*
* Unfortunately there is close to no information about bank conflicts in the
* hardware spec, but experimentally on Gfx7-Gfx9 ternary instructions seem to
* incur an average bank conflict penalty of one cycle per SIMD8 op whenever
* the second and third source are stored in the same GRF bank (\sa bank_of()
* for the exact bank layout) which cannot be fetched during the same cycle by
* the EU, unless the EU logic manages to optimize out the read cycle of a
* duplicate source register (\sa is_conflict_optimized_out()).
*
* The asymptotic run-time of the algorithm is dominated by the
* shader_conflict_weight_matrix() computation below, which is O(n) on the
* number of instructions in the program, however for small and medium-sized
* programs the run-time is likely to be dominated by
* optimize_reg_permutation() which is O(m^3) on the number of GRF atoms of
* the program (\sa partitioning), which is bounded (since the program uses a
* bounded number of registers post-regalloc) and of the order of 100. For
* that reason optimize_reg_permutation() is vectorized in order to keep the
* cubic term within reasonable bounds for m close to its theoretical maximum.
*/
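/* Worked example (with illustrative register numbers): a ternary
 * instruction such as
 *
 *    mad(8)  r10  r12  r66  r78
 *
 * reads its second and third sources from r66 and r78, which both map to
 * bank 2 under the layout implemented by bank_of() below, so on Gfx7-Gfx9
 * it would pay the one-cycle SIMD8 conflict penalty unless this pass
 * relocates one of them to a register in a different bank.
 */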
#include "brw_fs.h"
#include "brw_cfg.h"
#ifdef __SSE2__
#include <emmintrin.h>
/**
* Thin layer around vector intrinsics so they can be easily replaced with
* e.g. the fall-back scalar path, an implementation with different vector
* width or using different SIMD architectures (AVX-512?!).
*
* This implementation operates on pairs of independent SSE2 integer vectors à
* la SIMD16 for somewhat improved throughput. SSE2 is supported by virtually
* all platforms that care about bank conflicts, so this path should almost
* always be available in practice.
*/
namespace {
/**
* SIMD integer vector data type.
*/
struct vector_type {
__m128i v[2];
};
/**
* Scalar data type matching the representation of a single component of \p
* vector_type.
*/
typedef int16_t scalar_type;
/**
* Maximum integer value representable as a \p scalar_type.
*/
const scalar_type max_scalar = INT16_MAX;
/**
* Number of components of a \p vector_type.
*/
const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type);
/**
* Set the i-th component of vector \p v to \p x.
*/
void
set(vector_type &v, unsigned i, scalar_type x)
{
assert(i < vector_width);
memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x));
}
/**
* Get the i-th component of vector \p v.
*/
scalar_type
get(const vector_type &v, unsigned i)
{
assert(i < vector_width);
scalar_type x;
memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x));
return x;
}
/**
* Add two vectors with saturation.
*/
vector_type
adds(const vector_type &v, const vector_type &w)
{
const vector_type u = {{
_mm_adds_epi16(v.v[0], w.v[0]),
_mm_adds_epi16(v.v[1], w.v[1])
}};
return u;
}
/**
* Subtract two vectors with saturation.
*/
vector_type
subs(const vector_type &v, const vector_type &w)
{
const vector_type u = {{
_mm_subs_epi16(v.v[0], w.v[0]),
_mm_subs_epi16(v.v[1], w.v[1])
}};
return u;
}
/**
* Compute the bitwise conjunction of two vectors.
*/
vector_type
mask(const vector_type &v, const vector_type &w)
{
const vector_type u = {{
_mm_and_si128(v.v[0], w.v[0]),
_mm_and_si128(v.v[1], w.v[1])
}};
return u;
}
/**
* Reduce the components of a vector using saturating addition.
*/
scalar_type
sums(const vector_type &v)
{
const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]);
/* Swap the 64-bit halves (0x4e) and fold. */
const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e));
/* Swap adjacent 32-bit words (0xb1) and fold. */
const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1));
/* Swap adjacent 16-bit lanes of the low quadword and fold, leaving the
 * saturating sum of all lanes in component 0.
 */
const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1));
return _mm_extract_epi16(v1, 0);
}
}
#else
/**
* Thin layer around vector intrinsics so they can be easily replaced with
* e.g. the fall-back scalar path, an implementation with different vector
* width or using different SIMD architectures (AVX-512?!).
*
* This implementation operates on scalar values and doesn't rely on
* any vector extensions. This is mainly intended for debugging and
* to keep this file building on exotic platforms.
*/
namespace {
/**
* SIMD integer vector data type.
*/
typedef int16_t vector_type;
/**
* Scalar data type matching the representation of a single component of \p
* vector_type.
*/
typedef int16_t scalar_type;
/**
* Maximum integer value representable as a \p scalar_type.
*/
const scalar_type max_scalar = INT16_MAX;
/**
* Number of components of a \p vector_type.
*/
const unsigned vector_width = 1;
/**
* Set the i-th component of vector \p v to \p x.
*/
void
set(vector_type &v, unsigned i, scalar_type x)
{
assert(i < vector_width);
v = x;
}
/**
* Get the i-th component of vector \p v.
*/
scalar_type
get(const vector_type &v, unsigned i)
{
assert(i < vector_width);
return v;
}
/**
* Add two vectors with saturation.
*/
vector_type
adds(vector_type v, vector_type w)
{
return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w));
}
/**
* Subtract two vectors with saturation.
*/
vector_type
subs(vector_type v, vector_type w)
{
return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w));
}
/**
* Compute the bitwise conjunction of two vectors.
*/
vector_type
mask(vector_type v, vector_type w)
{
return v & w;
}
/**
* Reduce the components of a vector using saturating addition.
*/
scalar_type
sums(vector_type v)
{
return v;
}
}
#endif
/**
* Swap \p x and \p y.
*/
#define SWAP(x, y) do { \
__typeof(y) _swap_tmp = y; \
y = x; \
x = _swap_tmp; \
} while (0)
namespace {
/**
* Variable-length vector type intended to represent cycle-count costs for
* arbitrary atom-to-bank assignments. It's indexed by a pair of integers
* (i, p), where i is an atom index and p in {0, 1} indicates the parity of
* the conflict (respectively, whether the cost is incurred whenever the
* atoms are assigned the same bank b or opposite-parity banks b and b^1).
* \sa shader_conflict_weight_matrix()
*/
struct weight_vector_type {
weight_vector_type() : v(NULL), size(0) {}
weight_vector_type(unsigned n) : v(alloc(n)), size(n) {}
weight_vector_type(const weight_vector_type &u) :
v(alloc(u.size)), size(u.size)
{
memcpy(v, u.v,
DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type));
}
~weight_vector_type()
{
free(v);
}
weight_vector_type &
operator=(weight_vector_type u)
{
SWAP(v, u.v);
SWAP(size, u.size);
return *this;
}
vector_type *v;
unsigned size;
private:
static vector_type *
alloc(unsigned n)
{
const unsigned align = MAX2(sizeof(void *), __alignof__(vector_type));
const unsigned size = DIV_ROUND_UP(n, vector_width) * sizeof(vector_type);
void *p;
if (posix_memalign(&p, align, size))
return NULL;
memset(p, 0, size);
return reinterpret_cast<vector_type *>(p);
}
};
/**
* Set the (i, p)-th component of weight vector \p v to \p x.
*/
void
set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x)
{
set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
}
/**
* Get the (i, p)-th component of weight vector \p v.
*/
scalar_type
get(const weight_vector_type &v, unsigned i, unsigned p)
{
return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width);
}
/**
* Swap the (i, p)-th and (j, q)-th components of weight vector \p v.
*/
void
swap(weight_vector_type &v,
unsigned i, unsigned p,
unsigned j, unsigned q)
{
const scalar_type tmp = get(v, i, p);
set(v, i, p, get(v, j, q));
set(v, j, q, tmp);
}
}
namespace {
/**
* Object that represents the partitioning of an arbitrary register space
* into indivisible units (referred to as atoms below) that can potentially
* be rearranged independently from other registers. The partitioning is
* inferred from a number of contiguity requirements specified using
* require_contiguous(). This allows efficient look-up of the atom index a
* given register address belongs to, or conversely the range of register
* addresses that belong to a given atom.
*/
struct partitioning {
/**
* Create a (for the moment unrestricted) partitioning of a register
* file of size \p n. The units are arbitrary.
*/
partitioning(unsigned n) :
max_reg(n),
offsets(new unsigned[n + num_terminator_atoms]),
atoms(new unsigned[n + num_terminator_atoms])
{
for (unsigned i = 0; i < n + num_terminator_atoms; i++) {
offsets[i] = i;
atoms[i] = i;
}
}
partitioning(const partitioning &p) :
max_reg(p.max_reg),
offsets(new unsigned[p.num_atoms() + num_terminator_atoms]),
atoms(new unsigned[p.max_reg + num_terminator_atoms])
{
memcpy(offsets, p.offsets,
sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms));
memcpy(atoms, p.atoms,
sizeof(unsigned) * (p.max_reg + num_terminator_atoms));
}
~partitioning()
{
delete[] offsets;
delete[] atoms;
}
partitioning &
operator=(partitioning p)
{
SWAP(max_reg, p.max_reg);
SWAP(offsets, p.offsets);
SWAP(atoms, p.atoms);
return *this;
}
/**
* Require register range [reg, reg + n[ to be considered part of the
* same atom.
*/
void
require_contiguous(unsigned reg, unsigned n)
{
unsigned r = atoms[reg];
/* Renumber atoms[reg...] = { r... } and their offsets[r...] for the
* case that the specified contiguity requirement leads to the fusion
* (yay) of one or more existing atoms.
*/
for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) {
if (offsets[atoms[reg1]] < reg + n) {
atoms[reg1] = r;
} else {
if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]])
r++;
offsets[r] = offsets[atoms[reg1]];
atoms[reg1] = r;
}
}
}
/**
* Get the atom index register address \p reg belongs to.
*/
unsigned
atom_of_reg(unsigned reg) const
{
return atoms[reg];
}
/**
* Get the base register address that belongs to atom \p r.
*/
unsigned
reg_of_atom(unsigned r) const
{
return offsets[r];
}
/**
* Get the size of atom \p r in register address units.
*/
unsigned
size_of_atom(unsigned r) const
{
assert(r < num_atoms());
return reg_of_atom(r + 1) - reg_of_atom(r);
}
/**
* Get the number of atoms the whole register space is partitioned into.
*/
unsigned
num_atoms() const
{
return atoms[max_reg];
}
private:
/**
* Number of trailing atoms inserted for convenience so among other
* things we don't need to special-case the last element in
* size_of_atom().
*/
static const unsigned num_terminator_atoms = 1;
unsigned max_reg;
unsigned *offsets;
unsigned *atoms;
};
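/* Worked example (illustrative): for a partitioning of a 4-register file,
 * require_contiguous(1, 2) fuses registers 1 and 2 into a single atom,
 * leaving three atoms overall ({0}, {1,2} and {3}), so that
 * atom_of_reg(1) == atom_of_reg(2) and size_of_atom(atom_of_reg(1)) == 2.
 */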
/**
* Only GRF sources (whether they have been register-allocated or not) can
* possibly incur bank conflicts.
*/
bool
is_grf(const fs_reg &r)
{
return r.file == VGRF || r.file == FIXED_GRF;
}
/**
* Register offset of \p r in GRF units. Useful because the representation
* of GRFs post-register allocation is somewhat inconsistent and depends on
* whether the register already had a fixed GRF offset prior to register
* allocation or whether it was part of a VGRF allocation.
*/
unsigned
reg_of(const fs_reg &r)
{
assert(is_grf(r));
if (r.file == VGRF)
return r.nr + r.offset / REG_SIZE;
else
return reg_offset(r) / REG_SIZE;
}
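/* E.g. (illustrative): a VGRF with nr == 5 and offset == 40 yields
 * 5 + 40 / REG_SIZE == 6 with the usual 32-byte REG_SIZE.
 */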
/**
* Calculate the finest partitioning of the GRF space compatible with the
* register contiguity requirements derived from all instructions part of
* the program.
*/
partitioning
shader_reg_partitioning(const fs_visitor *v)
{
partitioning p(BRW_MAX_GRF);
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
if (is_grf(inst->dst))
p.require_contiguous(reg_of(inst->dst), regs_written(inst));
for (int i = 0; i < inst->sources; i++) {
if (is_grf(inst->src[i]))
p.require_contiguous(reg_of(inst->src[i]), regs_read(inst, i));
}
}
return p;
}
/**
* Return the set of GRF atoms that should be left untouched at their
* original location to avoid violating hardware or software assumptions.
*/
bool *
shader_reg_constraints(const fs_visitor *v, const partitioning &p)
{
bool *constrained = new bool[p.num_atoms()]();
/* These are read implicitly by some send-message instructions without
* any indication at the IR level. Assume they are unsafe to move
* around.
*/
for (unsigned reg = 0; reg < 2; reg++)
constrained[p.atom_of_reg(reg)] = true;
/* From the Intel Broadwell PRM, vol 07, section "Instruction Set Reference",
* subsection "EUISA Instructions", Send Message (page 990):
*
* "r127 must not be used for return address when there is a src and
* dest overlap in send instruction."
*
* Register allocation ensures that, so don't move r127 around, which
* could break that property.
*/
if (v->devinfo->ver >= 8)
constrained[p.atom_of_reg(127)] = true;
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
/* Assume that anything referenced via fixed GRFs is baked into the
* hardware's fixed-function logic and may be unsafe to move around.
* Also take into account the source GRF restrictions of EOT
* send-message instructions.
*/
if (inst->dst.file == FIXED_GRF)
constrained[p.atom_of_reg(reg_of(inst->dst))] = true;
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == FIXED_GRF ||
(is_grf(inst->src[i]) && inst->eot))
constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
}
/* Preserve the original allocation of VGRFs used by the barycentric
* source of the LINTERP instruction on Gfx6, since pair-aligned
* barycentrics allow the PLN instruction to be used.
*/
if (v->devinfo->has_pln && v->devinfo->ver <= 6 &&
inst->opcode == FS_OPCODE_LINTERP)
constrained[p.atom_of_reg(reg_of(inst->src[0]))] = true;
/* The location of the Gfx7 MRF hack registers is hard-coded in the
* rest of the compiler back-end. Don't attempt to move them around.
*/
if (v->devinfo->ver >= 7) {
assert(inst->dst.file != MRF);
for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
const unsigned reg = GFX7_MRF_HACK_START + inst->base_mrf + i;
constrained[p.atom_of_reg(reg)] = true;
}
}
}
return constrained;
}
/**
* Return whether the hardware will be able to prevent a bank conflict by
* optimizing out the read cycle of a source register. The formula was
* found experimentally.
*/
bool
is_conflict_optimized_out(const intel_device_info *devinfo,
const fs_inst *inst)
{
return devinfo->ver >= 9 &&
((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
reg_of(inst->src[1]) == reg_of(inst->src[2]));
}
/**
* Return a matrix that allows reasonably efficient computation of the
* cycle-count cost of bank conflicts incurred throughout the whole program
* for any given atom-to-bank assignment.
*
* More precisely, if C_r_s_p is the result of this function, the total
* cost of all bank conflicts involving any given atom r can be readily
* recovered as follows:
*
* S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
*
* where d_i_j is the Kronecker delta, and B_r indicates the bank
* assignment of r. \sa delta_conflicts() for a vectorized implementation
* of the expression above.
*
* FINISHME: Teach this about the Gfx10+ bank conflict rules, which are
* somewhat more relaxed than on previous generations. In the
* meantime optimizing based on Gfx9 weights is likely to be more
* helpful than not optimizing at all.
*/
weight_vector_type *
shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
{
weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
for (unsigned r = 0; r < p.num_atoms(); r++)
conflicts[r] = weight_vector_type(2 * p.num_atoms());
/* Crude approximation of the number of times the current basic block
* will be executed at run-time.
*/
unsigned block_scale = 1;
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
if (inst->opcode == BRW_OPCODE_DO) {
block_scale *= 10;
} else if (inst->opcode == BRW_OPCODE_WHILE) {
block_scale /= 10;
} else if (inst->is_3src(v->compiler) &&
is_grf(inst->src[1]) && is_grf(inst->src[2])) {
const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));
/* Estimate of the cycle-count cost of incurring a bank conflict
* for this instruction. This is only true on the average, for a
* sequence of back-to-back ternary instructions, since the EU
* front-end only seems to be able to issue a new instruction at
* an even cycle. The cost of a bank conflict incurred by an
* isolated ternary instruction may be higher.
*/
const unsigned exec_size = inst->dst.component_size(inst->exec_size);
const unsigned cycle_scale = block_scale * DIV_ROUND_UP(exec_size,
REG_SIZE);
/* Neglect same-atom conflicts (since they're either trivial or
* impossible to avoid without splitting the atom), and conflicts
* known to be optimized out by the hardware.
*/
if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
/* Calculate the parity of the sources relative to the start of
* their respective atoms. If their parity is the same (and
* none of the atoms straddle the 2KB mark), the instruction
* will incur a conflict iff both atoms are assigned the same
* bank b. If their parity is opposite, the instruction will
* incur a conflict iff they are assigned opposite banks (b and
* b^1).
*/
const bool p_r = 1 & (reg_of(inst->src[1]) - p.reg_of_atom(r));
const bool p_s = 1 & (reg_of(inst->src[2]) - p.reg_of_atom(s));
const unsigned parity = p_r ^ p_s;
/* Calculate the updated cost of a hypothetical conflict
* between atoms r and s. Note that the weight matrix is
* symmetric with respect to indices r and s by construction.
*/
const scalar_type w = MIN2(unsigned(max_scalar),
get(conflicts[r], s, parity) + cycle_scale);
set(conflicts[r], s, parity, w);
set(conflicts[s], r, parity, w);
}
}
}
return conflicts;
}
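/* Worked example (illustrative): if a single back-to-back SIMD8 ternary
 * instruction on a 32-bit type makes atoms r and s conflict with equal
 * source parity, C_r_s_0 becomes 1, so assigning both atoms the same bank
 * (B_r == B_s) makes the Kronecker delta in S(B) fire and adds one cycle
 * to the estimate, while opposite-parity banks contribute nothing.
 */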
/**
* Return the set of GRF atoms that could potentially lead to bank
* conflicts if laid out unfavorably in the GRF space according to
* the specified \p conflicts matrix (\sa
* shader_conflict_weight_matrix()).
*/
bool *
have_any_conflicts(const partitioning &p,
const weight_vector_type *conflicts)
{
bool *any_conflicts = new bool[p.num_atoms()]();
for (unsigned r = 0; r < p.num_atoms(); r++) {
const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width);
for (unsigned s = 0; s < m; s++)
any_conflicts[r] |= sums(conflicts[r].v[s]);
}
return any_conflicts;
}
/**
* Calculate the difference between two S(B) cost estimates as defined
* above (\sa shader_conflict_weight_matrix()). This represents the
* (partial) cycle-count benefit from moving an atom r from bank p to n.
* The respective bank assignments Bp and Bn are encoded as the \p
* bank_mask_p and \p bank_mask_n bitmasks for efficient computation,
* according to the formula:
*
* bank_mask(B)_s_p = -d_(p^B_r)_(B_s)
*
* Notice the similarity with the delta function in the S(B) expression
* above, and how bank_mask(B) can be precomputed for every possible
* selection of r since bank_mask(B) only depends on it via B_r that may
* only assume one of four different values, so the caller can keep every
* possible bank_mask(B) vector in memory without much hassle (\sa
* bank_characteristics()).
*/
int
delta_conflicts(const weight_vector_type &bank_mask_p,
const weight_vector_type &bank_mask_n,
const weight_vector_type &conflicts)
{
const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width);
vector_type s_p = {}, s_n = {};
for (unsigned r = 0; r < m; r++) {
s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r]));
s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r]));
}
return sums(subs(s_p, s_n));
}
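/* Note (illustrative): the -1/0 encoding produced by bank_characteristics()
 * below turns the Kronecker delta into the bitwise AND above, so s_p and
 * s_n accumulate exactly the conflict weights that fire under the old and
 * new bank assignments, and their difference is the cycle-count benefit
 * of the move.
 */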
/**
* Register atom permutation, represented as the start GRF offset each atom
* is mapped into.
*/
struct permutation {
permutation() : v(NULL), size(0) {}
permutation(unsigned n) :
v(new unsigned[n]()), size(n) {}
permutation(const permutation &p) :
v(new unsigned[p.size]), size(p.size)
{
memcpy(v, p.v, p.size * sizeof(unsigned));
}
~permutation()
{
delete[] v;
}
permutation &
operator=(permutation p)
{
SWAP(v, p.v);
SWAP(size, p.size);
return *this;
}
unsigned *v;
unsigned size;
};
/**
* Return an identity permutation of GRF atoms.
*/
permutation
identity_reg_permutation(const partitioning &p)
{
permutation map(p.num_atoms());
for (unsigned r = 0; r < map.size; r++)
map.v[r] = p.reg_of_atom(r);
return map;
}
/**
* Return the bank index of GRF address \p reg, numbered according to the
* table:
* Even Odd
* Lo 0 1
* Hi 2 3
*/
unsigned
bank_of(unsigned reg)
{
return (reg & 0x40) >> 5 | (reg & 1);
}
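/* E.g. (illustrative): bank_of(0) == 0, bank_of(1) == 1, bank_of(64) == 2
 * and bank_of(65) == 3, since bit 0 selects the Even/Odd column and bit 6
 * (the 2KB boundary) selects the Lo/Hi row of the table above.
 */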
/**
* Return bitmasks suitable for use as bank mask arguments for the
* delta_conflicts() computation. Note that this is just the (negative)
* characteristic function of each bank, if you regard it as a set
* containing all atoms assigned to it according to the \p map array.
*/
weight_vector_type *
bank_characteristics(const permutation &map)
{
weight_vector_type *banks = new weight_vector_type[4];
for (unsigned b = 0; b < 4; b++) {
banks[b] = weight_vector_type(2 * map.size);
for (unsigned j = 0; j < map.size; j++) {
for (unsigned p = 0; p < 2; p++)
set(banks[b], j, p,
(b ^ p) == bank_of(map.v[j]) ? -1 : 0);
}
}
return banks;
}
/**
* Return an improved permutation of GRF atoms based on \p map attempting
* to reduce the total cycle-count cost of bank conflicts greedily.
*
* Note that this doesn't attempt to merge multiple atoms into one, which
* might do a better job in some cases; it simply reorders existing atoms
* in the GRF space without affecting their identity.
*/
permutation
optimize_reg_permutation(const partitioning &p,
const bool *constrained,
const weight_vector_type *conflicts,
permutation map)
{
const bool *any_conflicts = have_any_conflicts(p, conflicts);
weight_vector_type *banks = bank_characteristics(map);
for (unsigned r = 0; r < map.size; r++) {
const unsigned bank_r = bank_of(map.v[r]);
if (!constrained[r]) {
unsigned best_s = r;
int best_benefit = 0;
for (unsigned s = 0; s < map.size; s++) {
const unsigned bank_s = bank_of(map.v[s]);
if (bank_r != bank_s && !constrained[s] &&
p.size_of_atom(r) == p.size_of_atom(s) &&
(any_conflicts[r] || any_conflicts[s])) {
const int benefit =
delta_conflicts(banks[bank_r], banks[bank_s], conflicts[r]) +
delta_conflicts(banks[bank_s], banks[bank_r], conflicts[s]);
if (benefit > best_benefit) {
best_s = s;
best_benefit = benefit;
}
}
}
if (best_s != r) {
for (unsigned b = 0; b < 4; b++) {
for (unsigned p = 0; p < 2; p++)
swap(banks[b], r, p, best_s, p);
}
SWAP(map.v[r], map.v[best_s]);
}
}
}
delete[] banks;
delete[] any_conflicts;
return map;
}
/**
* Apply the GRF atom permutation given by \p map to register \p r and
* return the result.
*/
fs_reg
transform(const partitioning &p, const permutation &map, fs_reg r)
{
if (r.file == VGRF) {
const unsigned reg = reg_of(r);
const unsigned s = p.atom_of_reg(reg);
r.nr = map.v[s] + reg - p.reg_of_atom(s);
r.offset = r.offset % REG_SIZE;
}
return r;
}
}
bool
fs_visitor::opt_bank_conflicts()
{
assert(grf_used || !"Must be called after register allocation");
/* TODO: Re-work this pass for Gfx20+. */
if (devinfo->ver >= 20)
return false;
/* No ternary instructions -- No bank conflicts. */
if (devinfo->ver < 6)
return false;
const partitioning p = shader_reg_partitioning(this);
const bool *constrained = shader_reg_constraints(this, p);
const weight_vector_type *conflicts =
shader_conflict_weight_matrix(this, p);
const permutation map =
optimize_reg_permutation(p, constrained, conflicts,
identity_reg_permutation(p));
foreach_block_and_inst(block, fs_inst, inst, cfg) {
inst->dst = transform(p, map, inst->dst);
for (int i = 0; i < inst->sources; i++)
inst->src[i] = transform(p, map, inst->src[i]);
}
delete[] conflicts;
delete[] constrained;
return true;
}
/**
* Return whether the instruction incurs GRF bank conflict cycles.
*
* Note that this is only accurate after register allocation because otherwise
* we don't know which bank each VGRF is going to end up aligned to.
*/
bool
has_bank_conflict(const struct brw_isa_info *isa, const fs_inst *inst)
{
return is_3src(isa, inst->opcode) &&
is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
!is_conflict_optimized_out(isa->devinfo, inst);
}

View file

@ -0,0 +1,965 @@
/* -*- c++ -*- */
/*
* Copyright © 2010-2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H
#include "brw_ir_fs.h"
#include "brw_shader.h"
#include "brw_eu.h"
#include "brw_fs.h"
namespace brw {
/**
* Toolbox to assemble an FS IR program out of individual instructions.
*
* This object is meant to have an interface consistent with
* brw::vec4_builder. They cannot be fully interchangeable because
* brw::fs_builder generates scalar code while brw::vec4_builder generates
* vector code.
*/
class fs_builder {
public:
/** Type used in this IR to represent a source of an instruction. */
typedef fs_reg src_reg;
/** Type used in this IR to represent the destination of an instruction. */
typedef fs_reg dst_reg;
/** Type used in this IR to represent an instruction. */
typedef fs_inst instruction;
/**
* Construct an fs_builder that inserts instructions into \p shader.
* \p dispatch_width gives the native execution width of the program.
*/
fs_builder(fs_visitor *shader,
unsigned dispatch_width) :
shader(shader), block(NULL), cursor(NULL),
_dispatch_width(dispatch_width),
_group(0),
force_writemask_all(false),
annotation()
{
}
explicit fs_builder(fs_visitor *s) : fs_builder(s, s->dispatch_width) {}
/**
* Construct an fs_builder that inserts instructions into \p shader
* before instruction \p inst in basic block \p block. The default
* execution controls and debug annotation are initialized from the
* instruction passed as argument.
*/
fs_builder(fs_visitor *shader, bblock_t *block, fs_inst *inst) :
shader(shader), block(block), cursor(inst),
_dispatch_width(inst->exec_size),
_group(inst->group),
force_writemask_all(inst->force_writemask_all)
{
annotation.str = inst->annotation;
annotation.ir = inst->ir;
}
/**
* Construct an fs_builder that inserts instructions before \p cursor in
* basic block \p block, inheriting other code generation parameters
* from this.
*/
fs_builder
at(bblock_t *block, exec_node *cursor) const
{
fs_builder bld = *this;
bld.block = block;
bld.cursor = cursor;
return bld;
}
/**
* Construct an fs_builder appending instructions at the end of the
* instruction list of the shader, inheriting other code generation
* parameters from this.
*/
fs_builder
at_end() const
{
return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
}
/**
* Construct a builder specifying the default SIMD width and group of
* channel enable signals, inheriting other code generation parameters
* from this.
*
* \p n gives the default SIMD width, \p i gives the slot group used for
* predication and control flow masking in multiples of \p n channels.
*/
fs_builder
group(unsigned n, unsigned i) const
{
fs_builder bld = *this;
if (n <= dispatch_width() && i < dispatch_width() / n) {
bld._group += i * n;
} else {
/* The requested channel group isn't a subset of the channel group
* of this builder, which means that the resulting instructions
* would use (potentially undefined) channel enable signals not
* specified by the parent builder. That's only valid if the
* instruction doesn't have per-channel semantics, in which case
* we should clear off the default group index in order to prevent
* emitting instructions with a channel group not aligned to their
* own execution size.
*/
assert(force_writemask_all);
bld._group = 0;
}
bld._dispatch_width = n;
return bld;
}
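/* Usage sketch (illustrative): with a SIMD16 builder bld,
 *
 *    bld.group(8, 1).MOV(dst, src);
 *
 * emits a SIMD8 MOV whose channel enables cover slots 8..15 of the
 * parent group, since _group advances by i * n == 8.
 */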
/**
* Alias for group() with width equal to eight.
*/
fs_builder
quarter(unsigned i) const
{
return group(8, i);
}
/**
* Construct a builder with per-channel control flow execution masking
* disabled if \p b is true. If control flow execution masking is
* already disabled this has no effect.
*/
fs_builder
exec_all(bool b = true) const
{
fs_builder bld = *this;
if (b)
bld.force_writemask_all = true;
return bld;
}
/**
* Construct a builder with the given debug annotation info.
*/
fs_builder
annotate(const char *str, const void *ir = NULL) const
{
fs_builder bld = *this;
bld.annotation.str = str;
bld.annotation.ir = ir;
return bld;
}
/**
* Get the SIMD width in use.
*/
unsigned
dispatch_width() const
{
return _dispatch_width;
}
/**
* Get the channel group in use.
*/
unsigned
group() const
{
return _group;
}
/**
* Allocate a virtual register of natural vector size (one for this IR)
* and SIMD width. \p n gives the amount of space to allocate in
* dispatch_width units (which is just enough space for one logical
* component in this IR).
*/
dst_reg
vgrf(enum brw_reg_type type, unsigned n = 1) const
{
const unsigned unit = reg_unit(shader->devinfo);
assert(dispatch_width() <= 32);
if (n > 0)
return dst_reg(VGRF, shader->alloc.allocate(
DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
unit * REG_SIZE) * unit),
type);
else
return retype(null_reg_ud(), type);
}
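/* Sizing sketch (illustrative): for a SIMD16 builder and a 32-bit type,
 * vgrf(BRW_REGISTER_TYPE_F) asks for 4 * 16 == 64 bytes, i.e. two
 * 32-byte GRFs on hardware where reg_unit() == 1.
 */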
/**
* Create a null register of floating type.
*/
dst_reg
null_reg_f() const
{
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
}
dst_reg
null_reg_df() const
{
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
}
/**
* Create a null register of signed integer type.
*/
dst_reg
null_reg_d() const
{
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
}
/**
* Create a null register of unsigned integer type.
*/
dst_reg
null_reg_ud() const
{
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
}
/**
* Insert an instruction into the program.
*/
instruction *
emit(const instruction &inst) const
{
return emit(new(shader->mem_ctx) instruction(inst));
}
/**
* Create and insert a nullary control instruction into the program.
*/
instruction *
emit(enum opcode opcode) const
{
return emit(instruction(opcode, dispatch_width()));
}
/**
* Create and insert a nullary instruction into the program.
*/
instruction *
emit(enum opcode opcode, const dst_reg &dst) const
{
return emit(instruction(opcode, dispatch_width(), dst));
}
/**
* Create and insert a unary instruction into the program.
*/
instruction *
emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
{
switch (opcode) {
case SHADER_OPCODE_RCP:
case SHADER_OPCODE_RSQ:
case SHADER_OPCODE_SQRT:
case SHADER_OPCODE_EXP2:
case SHADER_OPCODE_LOG2:
case SHADER_OPCODE_SIN:
case SHADER_OPCODE_COS:
return emit(instruction(opcode, dispatch_width(), dst,
fix_math_operand(src0)));
default:
return emit(instruction(opcode, dispatch_width(), dst, src0));
}
}
/**
* Create and insert a binary instruction into the program.
*/
instruction *
emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
const src_reg &src1) const
{
switch (opcode) {
case SHADER_OPCODE_POW:
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
return emit(instruction(opcode, dispatch_width(), dst,
fix_math_operand(src0),
fix_math_operand(src1)));
default:
return emit(instruction(opcode, dispatch_width(), dst,
src0, src1));
}
}
/**
* Create and insert a ternary instruction into the program.
*/
instruction *
emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
const src_reg &src1, const src_reg &src2) const
{
switch (opcode) {
case BRW_OPCODE_BFE:
case BRW_OPCODE_BFI2:
case BRW_OPCODE_MAD:
case BRW_OPCODE_LRP:
return emit(instruction(opcode, dispatch_width(), dst,
fix_3src_operand(src0),
fix_3src_operand(src1),
fix_3src_operand(src2)));
default:
return emit(instruction(opcode, dispatch_width(), dst,
src0, src1, src2));
}
}
/**
* Create and insert an instruction with a variable number of sources
* into the program.
*/
instruction *
emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
unsigned n) const
{
/* Use the emit() methods for specific operand counts to ensure that
* opcode-specific operand fixups occur.
*/
if (n == 2) {
return emit(opcode, dst, srcs[0], srcs[1]);
} else if (n == 3) {
return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
} else {
return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
}
}
/**
* Insert a preallocated instruction into the program.
*/
instruction *
emit(instruction *inst) const
{
assert(inst->exec_size <= 32);
assert(inst->exec_size == dispatch_width() ||
force_writemask_all);
inst->group = _group;
inst->force_writemask_all = force_writemask_all;
inst->annotation = annotation.str;
inst->ir = annotation.ir;
if (block)
static_cast<instruction *>(cursor)->insert_before(block, inst);
else
cursor->insert_before(inst);
return inst;
}
/**
* Select \p src0 if the comparison of both sources with the given
* conditional mod evaluates to true, otherwise select \p src1.
*
* Generally useful to get the minimum or maximum of two values.
*/
instruction *
emit_minmax(const dst_reg &dst, const src_reg &src0,
const src_reg &src1, brw_conditional_mod mod) const
{
assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
/* In some cases we can't have bytes as an operand for src1, so use the
* same type for both operands.
*/
return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
fix_unsigned_negate(src1)));
}
/**
* Copy any live channel from \p src to the first channel of the result.
*/
src_reg
emit_uniformize(const src_reg &src) const
{
/* FIXME: We use a vector chan_index and dst to allow constant and
* copy propagation to move the result all the way into the consuming
* instruction (typically a surface index or sampler index for a
* send). This uses 1 or 3 extra hw registers in 16 or 32 wide
* dispatch. Once we teach const/copy propagation about scalars we
* should go back to scalar destinations here.
*/
const fs_builder ubld = exec_all();
const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
const dst_reg dst = vgrf(src.type);
ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
return src_reg(component(dst, 0));
}
src_reg
move_to_vgrf(const src_reg &src, unsigned num_components) const
{
src_reg *const src_comps = new src_reg[num_components];
for (unsigned i = 0; i < num_components; i++)
src_comps[i] = offset(src, dispatch_width(), i);
const dst_reg dst = vgrf(src.type, num_components);
LOAD_PAYLOAD(dst, src_comps, num_components, 0);
delete[] src_comps;
return src_reg(dst);
}
void
emit_scan_step(enum opcode opcode, brw_conditional_mod mod,
const dst_reg &tmp,
unsigned left_offset, unsigned left_stride,
unsigned right_offset, unsigned right_stride) const
{
dst_reg left, right;
left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
if ((tmp.type == BRW_REGISTER_TYPE_Q ||
tmp.type == BRW_REGISTER_TYPE_UQ) &&
!shader->devinfo->has_64bit_int) {
switch (opcode) {
case BRW_OPCODE_MUL:
/* This will get lowered by integer MUL lowering */
set_condmod(mod, emit(opcode, right, left, right));
break;
case BRW_OPCODE_SEL: {
/* In order for the comparisons to work out right, we need our
* comparisons to be strict.
*/
assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
if (mod == BRW_CONDITIONAL_GE)
mod = BRW_CONDITIONAL_G;
/* We treat the bottom 32 bits as unsigned regardless of
* whether or not the integer as a whole is signed.
*/
dst_reg right_low = subscript(right, BRW_REGISTER_TYPE_UD, 0);
dst_reg left_low = subscript(left, BRW_REGISTER_TYPE_UD, 0);
/* The upper bits get the same sign as the 64-bit type */
brw_reg_type type32 = brw_reg_type_from_bit_size(32, tmp.type);
dst_reg right_high = subscript(right, type32, 1);
dst_reg left_high = subscript(left, type32, 1);
/* Build up our comparison:
*
* l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
*/
CMP(null_reg_ud(), retype(left_low, BRW_REGISTER_TYPE_UD),
retype(right_low, BRW_REGISTER_TYPE_UD), mod);
set_predicate(BRW_PREDICATE_NORMAL,
CMP(null_reg_ud(), left_high, right_high,
BRW_CONDITIONAL_EQ));
set_predicate_inv(BRW_PREDICATE_NORMAL, true,
CMP(null_reg_ud(), left_high, right_high, mod));
/* We could use selects here or we could use predicated MOVs
* because the destination and second source (if it were a SEL)
* are the same.
*/
set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low));
set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high));
break;
}
default:
unreachable("Unsupported 64-bit scan op");
}
} else {
set_condmod(mod, emit(opcode, right, left, right));
}
}
void
emit_scan(enum opcode opcode, const dst_reg &tmp,
unsigned cluster_size, brw_conditional_mod mod) const
{
assert(dispatch_width() >= 8);
/* The instruction splitting code isn't advanced enough to split
* these so we need to handle that ourselves.
*/
if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
const unsigned half_width = dispatch_width() / 2;
const fs_builder ubld = exec_all().group(half_width, 0);
dst_reg left = tmp;
dst_reg right = horiz_offset(tmp, half_width);
ubld.emit_scan(opcode, left, cluster_size, mod);
ubld.emit_scan(opcode, right, cluster_size, mod);
if (cluster_size > half_width) {
ubld.emit_scan_step(opcode, mod, tmp,
half_width - 1, 0, half_width, 1);
}
return;
}
if (cluster_size > 1) {
const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
}
if (cluster_size > 2) {
if (type_sz(tmp.type) <= 4) {
const fs_builder ubld =
exec_all().group(dispatch_width() / 4, 0);
ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
} else {
/* For 64-bit types, we have to do things differently because
* the code above would land us with destination strides that
* the hardware can't handle. Fortunately, we'll only be
* 8-wide in that case and it's the same number of
* instructions.
*/
const fs_builder ubld = exec_all().group(2, 0);
for (unsigned i = 0; i < dispatch_width(); i += 4)
ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
}
}
for (unsigned i = 4;
i < MIN2(cluster_size, dispatch_width());
i *= 2) {
const fs_builder ubld = exec_all().group(i, 0);
ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);
if (dispatch_width() > i * 2)
ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);
if (dispatch_width() > i * 4) {
ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
}
}
}
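/* Worked example (illustrative): for a SIMD8 builder, a 32-bit type and
 * cluster_size == 4, the steps above first fold tmp[2k] into tmp[2k+1],
 * then fan tmp[1] and tmp[5] out into elements 2-3 and 6-7 respectively,
 * leaving an inclusive scan within each 4-channel cluster.
 */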
instruction *
emit_undef_for_dst(const instruction *old_inst) const
{
assert(old_inst->dst.file == VGRF);
instruction *inst = emit(SHADER_OPCODE_UNDEF,
retype(old_inst->dst, BRW_REGISTER_TYPE_UD));
inst->size_written = old_inst->size_written;
return inst;
}
/**
* Assorted arithmetic ops.
* @{
*/
#define ALU1(op) \
instruction * \
op(const dst_reg &dst, const src_reg &src0) const \
{ \
return emit(BRW_OPCODE_##op, dst, src0); \
}
#define ALU2(op) \
instruction * \
op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
{ \
return emit(BRW_OPCODE_##op, dst, src0, src1); \
}
#define ALU2_ACC(op) \
instruction * \
op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
{ \
instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \
inst->writes_accumulator = true; \
return inst; \
}
#define ALU3(op) \
instruction * \
op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
const src_reg &src2) const \
{ \
return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \
}
ALU2(ADD)
ALU3(ADD3)
ALU2_ACC(ADDC)
ALU2(AND)
ALU2(ASR)
ALU2(AVG)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(BFREV)
ALU1(CBIT)
ALU1(DIM)
ALU2(DP2)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU1(FBH)
ALU1(FBL)
ALU1(FRC)
ALU3(DP4A)
ALU2(LINE)
ALU1(LZD)
ALU2(MAC)
ALU2_ACC(MACH)
ALU3(MAD)
ALU1(MOV)
ALU2(MUL)
ALU1(NOT)
ALU2(OR)
ALU2(PLN)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDU)
ALU1(RNDZ)
ALU2(ROL)
ALU2(ROR)
ALU2(SAD2)
ALU2_ACC(SADA2)
ALU2(SEL)
ALU2(SHL)
ALU2(SHR)
ALU2_ACC(SUBB)
ALU2(XOR)
#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
instruction *
F32TO16(const dst_reg &dst, const src_reg &src) const
{
assert(dst.type == BRW_REGISTER_TYPE_HF);
assert(src.type == BRW_REGISTER_TYPE_F);
if (shader->devinfo->ver >= 8) {
return MOV(dst, src);
} else {
assert(shader->devinfo->ver == 7);
return emit(BRW_OPCODE_F32TO16,
retype(dst, BRW_REGISTER_TYPE_W), src);
}
}
instruction *
F16TO32(const dst_reg &dst, const src_reg &src) const
{
assert(dst.type == BRW_REGISTER_TYPE_F);
assert(src.type == BRW_REGISTER_TYPE_HF);
if (shader->devinfo->ver >= 8) {
return MOV(dst, src);
} else {
assert(shader->devinfo->ver == 7);
return emit(BRW_OPCODE_F16TO32,
dst, retype(src, BRW_REGISTER_TYPE_W));
}
}
/** @} */
/**
* CMP: Sets the low bit of the destination channels with the result
* of the comparison, while the upper bits are undefined, and updates
* the flag register with the packed 16 bits of the result.
*/
instruction *
CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
brw_conditional_mod condition) const
{
/* Take the instruction:
*
* CMP null<d> src0<f> src1<f>
*
* Original gfx4 does type conversion to the destination type
* before comparison, producing garbage results for floating
* point comparisons.
*
* The destination type doesn't matter on newer generations,
* so we set the type to match src0 so we can compact the
* instruction.
*/
return set_condmod(condition,
emit(BRW_OPCODE_CMP, retype(dst, src0.type),
fix_unsigned_negate(src0),
fix_unsigned_negate(src1)));
}
/**
* CMPN: Behaves like CMP, but produces true if src1 is NaN.
*/
instruction *
CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
brw_conditional_mod condition) const
{
/* Take the instruction:
*
* CMP null<d> src0<f> src1<f>
*
* Original gfx4 does type conversion to the destination type
* before comparison, producing garbage results for floating
* point comparisons.
*
* The destination type doesn't matter on newer generations,
* so we set the type to match src0 so we can compact the
* instruction.
*/
return set_condmod(condition,
emit(BRW_OPCODE_CMPN, retype(dst, src0.type),
fix_unsigned_negate(src0),
fix_unsigned_negate(src1)));
}
/**
* Gfx4 predicated IF.
*/
instruction *
IF(brw_predicate predicate) const
{
return set_predicate(predicate, emit(BRW_OPCODE_IF));
}
/**
* CSEL: dst = src2 <op> 0.0f ? src0 : src1
*/
instruction *
CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
const src_reg &src2, brw_conditional_mod condition) const
{
/* CSEL only operates on floats, so we can't do integer </<=/>=/>
* comparisons. Zero/non-zero (== and !=) comparisons almost work.
* 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
*/
assert(src2.type == BRW_REGISTER_TYPE_F);
return set_condmod(condition,
emit(BRW_OPCODE_CSEL,
retype(dst, BRW_REGISTER_TYPE_F),
retype(src0, BRW_REGISTER_TYPE_F),
retype(src1, BRW_REGISTER_TYPE_F),
src2));
}
/**
* Emit a linear interpolation instruction.
*/
instruction *
LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
const src_reg &a) const
{
if (shader->devinfo->ver >= 6 && shader->devinfo->ver <= 10) {
/* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
* we need to reorder the operands.
*/
return emit(BRW_OPCODE_LRP, dst, a, y, x);
} else {
/* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
const dst_reg y_times_a = vgrf(dst.type);
const dst_reg one_minus_a = vgrf(dst.type);
const dst_reg x_times_one_minus_a = vgrf(dst.type);
MUL(y_times_a, y, a);
ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
}
}
/**
* Collect a number of registers into a contiguous range.
*/
instruction *
LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
unsigned sources, unsigned header_size) const
{
instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
inst->header_size = header_size;
inst->size_written = header_size * REG_SIZE;
for (unsigned i = header_size; i < sources; i++) {
inst->size_written += dispatch_width() * type_sz(src[i].type) *
dst.stride;
}
return inst;
}
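/* A worked example (hypothetical numbers) of the size_written
 * computation in LOAD_PAYLOAD above: in SIMD8 with header_size == 1 and
 * two float payload sources of stride 1, size_written =
 * 1 * REG_SIZE + 2 * (8 * 4 * 1) = 32 + 64 = 96 bytes.
 */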
instruction *
UNDEF(const dst_reg &dst) const
{
assert(dst.file == VGRF);
assert(dst.offset % REG_SIZE == 0);
instruction *inst = emit(SHADER_OPCODE_UNDEF,
retype(dst, BRW_REGISTER_TYPE_UD));
inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;
return inst;
}
instruction *
DPAS(const dst_reg &dst, const src_reg &src0, const src_reg &src1, const src_reg &src2,
unsigned sdepth, unsigned rcount) const
{
assert(_dispatch_width == 8);
assert(sdepth == 8);
assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8);
instruction *inst = emit(BRW_OPCODE_DPAS, dst, src0, src1, src2);
inst->sdepth = sdepth;
inst->rcount = rcount;
if (dst.type == BRW_REGISTER_TYPE_HF) {
inst->size_written = rcount * REG_SIZE / 2;
} else {
inst->size_written = rcount * REG_SIZE;
}
return inst;
}
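/* An illustrative reading of the DPAS size_written branch above: with
 * rcount == 8, a float destination writes 8 full registers, while an HF
 * destination packs the same rows into half as many bytes.
 */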
fs_visitor *shader;
fs_inst *BREAK() { return emit(BRW_OPCODE_BREAK); }
fs_inst *DO() { return emit(BRW_OPCODE_DO); }
fs_inst *ENDIF() { return emit(BRW_OPCODE_ENDIF); }
fs_inst *NOP() { return emit(BRW_OPCODE_NOP); }
fs_inst *WHILE() { return emit(BRW_OPCODE_WHILE); }
fs_inst *CONTINUE() { return emit(BRW_OPCODE_CONTINUE); }
private:
/**
* Workaround for negation of UD registers. See comment in
* fs_generator::generate_code() for more details.
*/
src_reg
fix_unsigned_negate(const src_reg &src) const
{
if (src.type == BRW_REGISTER_TYPE_UD &&
src.negate) {
dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
MOV(temp, src);
return src_reg(temp);
} else {
return src;
}
}
/**
* Workaround for source register modes not supported by the ternary
* instruction encoding.
*/
src_reg
fix_3src_operand(const src_reg &src) const
{
switch (src.file) {
case FIXED_GRF:
/* FINISHME: Could handle scalar region, other stride=1 regions */
if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
src.width != BRW_WIDTH_8 ||
src.hstride != BRW_HORIZONTAL_STRIDE_1)
break;
FALLTHROUGH;
case ATTR:
case VGRF:
case UNIFORM:
case IMM:
return src;
default:
break;
}
dst_reg expanded = vgrf(src.type);
MOV(expanded, src);
return expanded;
}
/**
* Workaround for source register modes not supported by the math
* instruction.
*/
src_reg
fix_math_operand(const src_reg &src) const
{
/* Can't do hstride == 0 args on gfx6 math, so expand it out. We
* might be able to do better by doing execsize = 1 math and then
* expanding that result out, but we would need to be careful with
* masking.
*
* Gfx6 hardware ignores source modifiers (negate and abs) on math
* instructions, so we also move to a temp to set those up.
*
* Gfx7 relaxes most of the above restrictions, but still can't use IMM
* operands to math instructions.
*/
if ((shader->devinfo->ver == 6 &&
(src.file == IMM || src.file == UNIFORM ||
src.abs || src.negate)) ||
(shader->devinfo->ver == 7 && src.file == IMM)) {
const dst_reg tmp = vgrf(src.type);
MOV(tmp, src);
return tmp;
} else {
return src;
}
}
bblock_t *block;
exec_node *cursor;
unsigned _dispatch_width;
unsigned _group;
bool force_writemask_all;
/** Debug annotation info. */
struct {
const char *str;
const void *ir;
} annotation;
};
}
static inline fs_reg
offset(const fs_reg &reg, const brw::fs_builder &bld, unsigned delta)
{
return offset(reg, bld.dispatch_width(), delta);
}
#endif


@ -0,0 +1,568 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_eu.h"
/** @file brw_fs_cmod_propagation.cpp
*
* Implements a pass that propagates the conditional modifier from a CMP x 0.0
* instruction into the instruction that generated x. For instance, in this
* sequence
*
* add(8) g70<1>F g69<8,8,1>F 4096F
* cmp.ge.f0(8) null g70<8,8,1>F 0F
*
* we can do the comparison as part of the ADD instruction directly:
*
* add.ge.f0(8) g70<1>F g69<8,8,1>F 4096F
*
* If there had been a use of the flag register and another CMP using g70
*
* add.ge.f0(8) g70<1>F g69<8,8,1>F 4096F
* (+f0) sel(8) g71<1>F g72<8,8,1>F g73<8,8,1>F
* cmp.ge.f0(8) null g70<8,8,1>F 0F
*
* we can recognize that the CMP is generating the flag value that already
* exists and therefore remove the instruction.
*/
using namespace brw;
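/* An illustrative sketch of the CMP-to-ADD case handled below (register
 * numbers are hypothetical): since CMP computes src0 - src1, the sequence
 *
 *    add(8)       g10<1>F  g2<8,8,1>F  -g3<8,8,1>F
 *    cmp.l.f0(8)  null     g2<8,8,1>F   g3<8,8,1>F
 *
 * computes the same value twice, so the conditional modifier can be
 * folded into the ADD:
 *
 *    add.l.f0(8)  g10<1>F  g2<8,8,1>F  -g3<8,8,1>F
 */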
static bool
cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block,
fs_inst *inst)
{
bool read_flag = false;
const unsigned flags_written = inst->flags_written(devinfo);
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (scan_inst->opcode == BRW_OPCODE_ADD &&
!scan_inst->is_partial_write() &&
scan_inst->exec_size == inst->exec_size) {
bool negate;
/* A CMP is basically a subtraction. The result of the
* subtraction must be the same as the result of the addition.
* This means that one of the operands must be negated: we match
* (a + b) against cmp(a, -b), or (a + -b) against cmp(a, b).
*/
if ((inst->src[0].equals(scan_inst->src[0]) &&
inst->src[1].negative_equals(scan_inst->src[1])) ||
(inst->src[0].equals(scan_inst->src[1]) &&
inst->src[1].negative_equals(scan_inst->src[0]))) {
negate = false;
} else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
inst->src[1].equals(scan_inst->src[1])) ||
(inst->src[0].negative_equals(scan_inst->src[1]) &&
inst->src[1].equals(scan_inst->src[0]))) {
negate = true;
} else {
goto not_match;
}
/* If the scan instruction writes a different flag register than the
* instruction we're trying to propagate from, bail.
*
* FINISHME: The second part of the condition may be too strong.
* Perhaps (scan_inst->flags_written() & flags_written) !=
* flags_written?
*/
if (scan_inst->flags_written(devinfo) != 0 &&
scan_inst->flags_written(devinfo) != flags_written)
goto not_match;
/* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
*
* * Note that the [post condition signal] bits generated at
* the output of a compute are before the .sat.
*
* Paragraph about post_zero does not mention saturation, but
* testing it on actual GPUs shows that conditional modifiers
* are applied after saturation.
*
* * post_zero bit: This bit reflects whether the final
* result is zero after all the clamping, normalizing,
* or format conversion logic.
*
* For signed types we don't care about saturation: it won't
* change the result of conditional modifier.
*
* For floating-point and unsigned types there are two special cases,
* when we can remove inst even if scan_inst is saturated: G
* and LE. Since conditional modifiers are just comparisons
* against zero, saturating positive values to the upper
* limit never changes the result of comparison.
*
* For negative values:
* (sat(x) > 0) == (x > 0) --- false
* (sat(x) <= 0) == (x <= 0) --- true
*/
const enum brw_conditional_mod cond =
negate ? brw_swap_cmod(inst->conditional_mod)
: inst->conditional_mod;
if (scan_inst->saturate &&
(brw_reg_type_is_floating_point(scan_inst->dst.type) ||
brw_reg_type_is_unsigned_integer(scan_inst->dst.type)) &&
(cond != BRW_CONDITIONAL_G &&
cond != BRW_CONDITIONAL_LE))
goto not_match;
/* Otherwise, try propagating the conditional. */
if (scan_inst->can_do_cmod() &&
((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
scan_inst->conditional_mod == cond)) {
scan_inst->conditional_mod = cond;
scan_inst->flag_subreg = inst->flag_subreg;
inst->remove(block, true);
return true;
}
break;
}
not_match:
if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
break;
read_flag = read_flag ||
(scan_inst->flags_read(devinfo) & flags_written) != 0;
}
return false;
}
/**
* Propagate conditional modifiers from NOT instructions
*
* Attempt to convert sequences like
*
* or(8) g78<8,8,1> g76<8,8,1>UD g77<8,8,1>UD
* ...
* not.nz.f0(8) null g78<8,8,1>UD
*
* into
*
* or.z.f0(8) g78<8,8,1> g76<8,8,1>UD g77<8,8,1>UD
*/
static bool
cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block,
fs_inst *inst)
{
const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
bool read_flag = false;
const unsigned flags_written = inst->flags_written(devinfo);
if (cond != BRW_CONDITIONAL_Z && cond != BRW_CONDITIONAL_NZ)
return false;
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->src[0], inst->size_read(0))) {
if (scan_inst->opcode != BRW_OPCODE_OR &&
scan_inst->opcode != BRW_OPCODE_AND)
break;
if (scan_inst->is_partial_write() ||
scan_inst->dst.offset != inst->src[0].offset ||
scan_inst->exec_size != inst->exec_size)
break;
/* If the scan instruction writes a different flag register than the
* instruction we're trying to propagate from, bail.
*
* FINISHME: The second part of the condition may be too strong.
* Perhaps (scan_inst->flags_written() & flags_written) !=
* flags_written?
*/
if (scan_inst->flags_written(devinfo) != 0 &&
scan_inst->flags_written(devinfo) != flags_written)
break;
if (scan_inst->can_do_cmod() &&
((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
scan_inst->conditional_mod == cond)) {
scan_inst->conditional_mod = cond;
scan_inst->flag_subreg = inst->flag_subreg;
inst->remove(block, true);
return true;
}
break;
}
if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
break;
read_flag = read_flag ||
(scan_inst->flags_read(devinfo) & flags_written) != 0;
}
return false;
}
static bool
opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
{
bool progress = false;
UNUSED int ip = block->end_ip + 1;
foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
ip--;
if ((inst->opcode != BRW_OPCODE_AND &&
inst->opcode != BRW_OPCODE_CMP &&
inst->opcode != BRW_OPCODE_MOV &&
inst->opcode != BRW_OPCODE_NOT) ||
inst->predicate != BRW_PREDICATE_NONE ||
!inst->dst.is_null() ||
(inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
inst->src[0].file != UNIFORM))
continue;
/* An ABS source modifier can only be handled when processing a compare
* with a value other than zero.
*/
if (inst->src[0].abs &&
(inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero()))
continue;
/* Only an AND.NZ can be propagated. Many AND.Z instructions are
* generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code).
* Propagating those would require inverting the condition on the CMP.
* This changes both the flag value and the register destination of the
* CMP. That result may be used elsewhere, so we can't change its value
* on a whim.
*/
if (inst->opcode == BRW_OPCODE_AND &&
!(inst->src[1].is_one() &&
inst->conditional_mod == BRW_CONDITIONAL_NZ &&
!inst->src[0].negate))
continue;
/* A CMP with a second source of zero can match with anything. A CMP
* with a second source that is not zero can only match with an ADD
* instruction.
*
* Only apply this optimization to floating-point sources. It can fail for
* integers. For inputs a = 0x80000000, b = 4, int(0x80000000) < 4, but
* int(0x80000000) - 4 overflows and results in 0x7ffffffc. That's not
* less than zero, so the flags get set differently than for (a < b).
*/
if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
if (brw_reg_type_is_floating_point(inst->src[0].type) &&
cmod_propagate_cmp_to_add(devinfo, block, inst))
progress = true;
continue;
}
if (inst->opcode == BRW_OPCODE_NOT) {
progress = cmod_propagate_not(devinfo, block, inst) || progress;
continue;
}
bool read_flag = false;
const unsigned flags_written = inst->flags_written(devinfo);
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->src[0], inst->size_read(0))) {
/* If the scan instruction writes a different flag register than
* the instruction we're trying to propagate from, bail.
*
* FINISHME: The second part of the condition may be too strong.
* Perhaps (scan_inst->flags_written() & flags_written) !=
* flags_written?
*/
if (scan_inst->flags_written(devinfo) != 0 &&
scan_inst->flags_written(devinfo) != flags_written)
break;
if (scan_inst->is_partial_write() ||
scan_inst->dst.offset != inst->src[0].offset ||
scan_inst->exec_size != inst->exec_size)
break;
/* If the write mask is different we can't propagate. */
if (scan_inst->force_writemask_all != inst->force_writemask_all)
break;
/* CMP's result is the same regardless of dest type. */
if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
scan_inst->opcode == BRW_OPCODE_CMP &&
brw_reg_type_is_integer(inst->dst.type)) {
inst->remove(block, true);
progress = true;
break;
}
/* If the AND wasn't handled by the previous case, it isn't safe
* to remove it.
*/
if (inst->opcode == BRW_OPCODE_AND)
break;
if (inst->opcode == BRW_OPCODE_MOV) {
if (brw_reg_type_is_floating_point(scan_inst->dst.type)) {
/* If the destination type of scan_inst is floating-point,
* then:
*
* - The source of the MOV instruction must be the same
* type.
*
* - The destination of the MOV instruction must be float
* point with a size at least as large as the destination
* of inst. Size-reducing f2f conversions could cause
* non-zero values to become zero, etc.
*/
if (scan_inst->dst.type != inst->src[0].type)
break;
if (!brw_reg_type_is_floating_point(inst->dst.type))
break;
if (type_sz(scan_inst->dst.type) > type_sz(inst->dst.type))
break;
} else {
/* If the destination type of scan_inst is integer, then:
*
* - The source of the MOV instruction must be integer with
* the same size.
*
* - If the conditional modifier is Z or NZ, then the
* destination type of inst must either be floating point
* (of any size) or integer with a size at least as large
* as the destination of inst.
*
* - If the conditional modifier is neither Z nor NZ, then the
* destination type of inst must either be floating point
* (of any size) or integer with a size at least as large
* as the destination of inst and the same signedness.
*/
if (!brw_reg_type_is_integer(inst->src[0].type) ||
type_sz(scan_inst->dst.type) != type_sz(inst->src[0].type))
break;
if (brw_reg_type_is_integer(inst->dst.type)) {
if (type_sz(inst->dst.type) < type_sz(scan_inst->dst.type))
break;
if (inst->conditional_mod != BRW_CONDITIONAL_Z &&
inst->conditional_mod != BRW_CONDITIONAL_NZ &&
brw_reg_type_is_unsigned_integer(inst->dst.type) !=
brw_reg_type_is_unsigned_integer(scan_inst->dst.type))
break;
}
}
} else {
/* Not safe to use inequality operators if the types are
* different.
*/
if (scan_inst->dst.type != inst->src[0].type &&
inst->conditional_mod != BRW_CONDITIONAL_Z &&
inst->conditional_mod != BRW_CONDITIONAL_NZ)
break;
/* Comparisons operate differently for ints and floats */
if (scan_inst->dst.type != inst->dst.type) {
/* Comparison result may be altered if the bit-size changes
* since that affects range, denorms, etc.
*/
if (type_sz(scan_inst->dst.type) != type_sz(inst->dst.type))
break;
if (brw_reg_type_is_floating_point(scan_inst->dst.type) !=
brw_reg_type_is_floating_point(inst->dst.type))
break;
}
}
/* Given the following:
* - CMP writes to flag register the result of
* applying cmod to the `src0 - src1`.
* After that it stores the same value to dst.
* Other instructions first store their result to
* dst, and then store cmod(dst) to the flag
* register.
* - inst is either CMP or MOV
* - inst->dst is null
* - inst->src[0] overlaps with scan_inst->dst
* - inst->src[1] is zero
* - scan_inst wrote to a flag register
*
* There can be three possible paths:
*
* - scan_inst is CMP:
*
* Considering that src0 is either 0x0 (false),
* or 0xffffffff (true), and src1 is 0x0:
*
* - If inst's cmod is NZ, we can always remove
* scan_inst: NZ is invariant for false and true. This
* holds even if src0 is NaN: .nz is the only cmod,
* that returns true for NaN.
*
* - .g is invariant if src0 has a UD type
*
* - .l is invariant if src0 has a D type
*
* - scan_inst and inst have the same cmod:
*
* If scan_inst is anything other than CMP, it already
* wrote the appropriate value to the flag register.
*
* - else:
*
* We can change cmod of scan_inst to that of inst,
* and remove inst. It is valid as long as we make
* sure that no instruction uses the flag register
* between scan_inst and inst.
*/
if (!inst->src[0].negate &&
scan_inst->flags_written(devinfo)) {
if (scan_inst->opcode == BRW_OPCODE_CMP) {
if ((inst->conditional_mod == BRW_CONDITIONAL_NZ) ||
(inst->conditional_mod == BRW_CONDITIONAL_G &&
inst->src[0].type == BRW_REGISTER_TYPE_UD) ||
(inst->conditional_mod == BRW_CONDITIONAL_L &&
inst->src[0].type == BRW_REGISTER_TYPE_D)) {
inst->remove(block, true);
progress = true;
break;
}
} else if (scan_inst->conditional_mod == inst->conditional_mod) {
/* On Gfx4 and Gfx5 sel.cond will dirty the flags, but the
* flags value is not based on the result stored in the
* destination. On all other platforms sel.cond will not
* write the flags, so execution will not get to this point.
*/
if (scan_inst->opcode == BRW_OPCODE_SEL) {
assert(devinfo->ver <= 5);
} else {
inst->remove(block, true);
progress = true;
}
break;
} else if (!read_flag && scan_inst->can_do_cmod()) {
scan_inst->conditional_mod = inst->conditional_mod;
scan_inst->flag_subreg = inst->flag_subreg;
inst->remove(block, true);
progress = true;
break;
}
}
/* The conditional mod of the CMP/CMPN instructions behaves
* specially because the flag output is not calculated from the
* result of the instruction, but the other way around, which
* means that even if the condmod to propagate and the condmod
* from the CMP instruction are the same they will in general give
* different results because they are evaluated based on different
* inputs.
*/
if (scan_inst->opcode == BRW_OPCODE_CMP ||
scan_inst->opcode == BRW_OPCODE_CMPN)
break;
/* From the Sky Lake PRM, Vol 2a, "Multiply":
*
* "When multiplying integer data types, if one of the sources
* is a DW, the resulting full precision data is stored in
* the accumulator. However, if the destination data type is
* either W or DW, the low bits of the result are written to
* the destination register and the remaining high bits are
* discarded. This results in undefined Overflow and Sign
* flags. Therefore, conditional modifiers and saturation
* (.sat) cannot be used in this case."
*
* We just disallow cmod propagation on all integer multiplies.
*/
if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
scan_inst->opcode == BRW_OPCODE_MUL)
break;
enum brw_conditional_mod cond =
inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
: inst->conditional_mod;
/* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
*
* * Note that the [post condition signal] bits generated at
* the output of a compute are before the .sat.
*
* Paragraph about post_zero does not mention saturation, but
* testing it on actual GPUs shows that conditional modifiers are
* applied after saturation.
*
* * post_zero bit: This bit reflects whether the final
* result is zero after all the clamping, normalizing,
* or format conversion logic.
*
* For this reason, no additional restrictions are necessary on
* instructions with saturate.
*/
/* Otherwise, try propagating the conditional. */
if (scan_inst->can_do_cmod() &&
((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
scan_inst->conditional_mod == cond)) {
scan_inst->conditional_mod = cond;
scan_inst->flag_subreg = inst->flag_subreg;
inst->remove(block, true);
progress = true;
}
break;
}
if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
break;
read_flag = read_flag ||
(scan_inst->flags_read(devinfo) & flags_written) != 0;
}
}
/* There is progress if and only if instructions were removed. */
assert(progress == (block->end_ip_delta != 0));
return progress;
}
bool
fs_visitor::opt_cmod_propagation()
{
bool progress = false;
foreach_block_reverse(block, cfg) {
progress = opt_cmod_propagation_local(devinfo, block) || progress;
}
if (progress) {
cfg->adjust_block_ips();
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}
return progress;
}

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -0,0 +1,396 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_cfg.h"
/** @file brw_fs_cse.cpp
*
* Support for local common subexpression elimination.
*
* See Muchnick's Advanced Compiler Design and Implementation, section
* 13.1 (p378).
*/
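/* For example (an illustrative sequence, not from a real shader), in
 *
 *    mul(8)  g10<1>F  g2<8,8,1>F  g3<8,8,1>F
 *    ...
 *    mul(8)  g20<1>F  g2<8,8,1>F  g3<8,8,1>F
 *
 * the second MUL recomputes an available expression, so it is replaced
 * by a MOV from a temporary holding the first result.
 */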
using namespace brw;
namespace {
struct aeb_entry : public exec_node {
/** The instruction that generates the expression value. */
fs_inst *generator;
/** The temporary where the value is stored. */
fs_reg tmp;
};
}
static bool
is_expression(const fs_visitor *v, const fs_inst *const inst)
{
switch (inst->opcode) {
case BRW_OPCODE_MOV:
case BRW_OPCODE_SEL:
case BRW_OPCODE_NOT:
case BRW_OPCODE_AND:
case BRW_OPCODE_OR:
case BRW_OPCODE_XOR:
case BRW_OPCODE_SHR:
case BRW_OPCODE_SHL:
case BRW_OPCODE_ASR:
case BRW_OPCODE_CMP:
case BRW_OPCODE_CMPN:
case BRW_OPCODE_ADD:
case BRW_OPCODE_MUL:
case SHADER_OPCODE_MULH:
case BRW_OPCODE_FRC:
case BRW_OPCODE_RNDU:
case BRW_OPCODE_RNDD:
case BRW_OPCODE_RNDE:
case BRW_OPCODE_RNDZ:
case BRW_OPCODE_LINE:
case BRW_OPCODE_PLN:
case BRW_OPCODE_MAD:
case BRW_OPCODE_LRP:
case FS_OPCODE_FB_READ_LOGICAL:
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
case FS_OPCODE_LINTERP:
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
case FS_OPCODE_LOAD_LIVE_CHANNELS:
case SHADER_OPCODE_BROADCAST:
case SHADER_OPCODE_MOV_INDIRECT:
case SHADER_OPCODE_TEX_LOGICAL:
case SHADER_OPCODE_TXD_LOGICAL:
case SHADER_OPCODE_TXF_LOGICAL:
case SHADER_OPCODE_TXL_LOGICAL:
case SHADER_OPCODE_TXS_LOGICAL:
case FS_OPCODE_TXB_LOGICAL:
case SHADER_OPCODE_TXF_CMS_LOGICAL:
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
case SHADER_OPCODE_TXF_UMS_LOGICAL:
case SHADER_OPCODE_TXF_MCS_LOGICAL:
case SHADER_OPCODE_LOD_LOGICAL:
case SHADER_OPCODE_TG4_LOGICAL:
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
case FS_OPCODE_PACK:
return true;
case SHADER_OPCODE_RCP:
case SHADER_OPCODE_RSQ:
case SHADER_OPCODE_SQRT:
case SHADER_OPCODE_EXP2:
case SHADER_OPCODE_LOG2:
case SHADER_OPCODE_POW:
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
case SHADER_OPCODE_SIN:
case SHADER_OPCODE_COS:
return inst->mlen < 2;
case SHADER_OPCODE_LOAD_PAYLOAD:
return !is_coalescing_payload(v->alloc, inst);
default:
return inst->is_send_from_grf() && !inst->has_side_effects() &&
!inst->is_volatile();
}
}
static bool
operands_match(const fs_inst *a, const fs_inst *b, bool *negate)
{
fs_reg *xs = a->src;
fs_reg *ys = b->src;
if (a->opcode == BRW_OPCODE_MAD) {
return xs[0].equals(ys[0]) &&
((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) ||
(xs[2].equals(ys[1]) && xs[1].equals(ys[2])));
} else if (a->opcode == BRW_OPCODE_MUL && a->dst.type == BRW_REGISTER_TYPE_F) {
bool xs0_negate = xs[0].negate;
bool xs1_negate = xs[1].file == IMM ? xs[1].f < 0.0f
: xs[1].negate;
bool ys0_negate = ys[0].negate;
bool ys1_negate = ys[1].file == IMM ? ys[1].f < 0.0f
: ys[1].negate;
float xs1_imm = xs[1].f;
float ys1_imm = ys[1].f;
xs[0].negate = false;
xs[1].negate = false;
ys[0].negate = false;
ys[1].negate = false;
xs[1].f = fabsf(xs[1].f);
ys[1].f = fabsf(ys[1].f);
bool ret = (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
(xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
xs[0].negate = xs0_negate;
xs[1].negate = xs[1].file == IMM ? false : xs1_negate;
ys[0].negate = ys0_negate;
ys[1].negate = ys[1].file == IMM ? false : ys1_negate;
xs[1].f = xs1_imm;
ys[1].f = ys1_imm;
*negate = (xs0_negate != xs1_negate) != (ys0_negate != ys1_negate);
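/* Illustrative values: a = mul(dst, -x, y) against b = mul(dst, x, -y)
 * leaves *negate false, since each product negates exactly one operand
 * and the signs cancel; a = mul(dst, -x, y) against b = mul(dst, x, y)
 * sets *negate, so b's reuse of a's result must be negated.
 */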
if (*negate && (a->saturate || b->saturate))
return false;
return ret;
} else if (!a->is_commutative()) {
bool match = true;
for (int i = 0; i < a->sources; i++) {
if (!xs[i].equals(ys[i])) {
match = false;
break;
}
}
return match;
} else {
return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
(xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
}
}
static bool
instructions_match(fs_inst *a, fs_inst *b, bool *negate)
{
return a->opcode == b->opcode &&
a->force_writemask_all == b->force_writemask_all &&
a->exec_size == b->exec_size &&
a->group == b->group &&
a->saturate == b->saturate &&
a->predicate == b->predicate &&
a->predicate_inverse == b->predicate_inverse &&
a->conditional_mod == b->conditional_mod &&
a->flag_subreg == b->flag_subreg &&
a->dst.type == b->dst.type &&
a->offset == b->offset &&
a->mlen == b->mlen &&
a->ex_mlen == b->ex_mlen &&
a->sfid == b->sfid &&
a->desc == b->desc &&
a->size_written == b->size_written &&
a->base_mrf == b->base_mrf &&
a->check_tdr == b->check_tdr &&
a->send_has_side_effects == b->send_has_side_effects &&
a->eot == b->eot &&
a->header_size == b->header_size &&
a->shadow_compare == b->shadow_compare &&
a->pi_noperspective == b->pi_noperspective &&
a->target == b->target &&
a->sources == b->sources &&
operands_match(a, b, negate);
}
static void
create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
{
unsigned written = regs_written(inst);
unsigned dst_width =
DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
fs_inst *copy;
if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
assert(src.file == VGRF);
fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg,
inst->sources);
for (int i = 0; i < inst->header_size; i++) {
payload[i] = src;
src.offset += REG_SIZE;
}
for (int i = inst->header_size; i < inst->sources; i++) {
src.type = inst->src[i].type;
payload[i] = src;
src = offset(src, bld, 1);
}
copy = bld.LOAD_PAYLOAD(inst->dst, payload, inst->sources,
inst->header_size);
} else if (written != dst_width) {
assert(src.file == VGRF);
assert(written % dst_width == 0);
const int sources = written / dst_width;
fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources);
for (int i = 0; i < sources; i++) {
payload[i] = src;
src = offset(src, bld, 1);
}
copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, 0);
} else {
copy = bld.MOV(inst->dst, src);
copy->group = inst->group;
copy->force_writemask_all = inst->force_writemask_all;
copy->src[0].negate = negate;
}
assert(regs_written(copy) == written);
}
bool
fs_visitor::opt_cse_local(const fs_live_variables &live, bblock_t *block, int &ip)
{
bool progress = false;
exec_list aeb;
void *cse_ctx = ralloc_context(NULL);
foreach_inst_in_block(fs_inst, inst, block) {
/* Only consider full writes of CSE-able expressions to safe destinations. */
if (is_expression(this, inst) && !inst->is_partial_write() &&
((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
inst->dst.is_null()))
{
bool found = false;
bool negate = false;
foreach_in_list_use_after(aeb_entry, entry, &aeb) {
/* Match current instruction's expression against those in AEB. */
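/* A generator with a null destination (e.g. a CMP emitted only for its
 * flag result) cannot supply a value, so it may only match an
 * instruction whose destination is also null.
 */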
if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) &&
instructions_match(inst, entry->generator, &negate)) {
found = true;
progress = true;
break;
}
}
if (!found) {
if (inst->opcode != BRW_OPCODE_MOV ||
(inst->opcode == BRW_OPCODE_MOV &&
inst->src[0].file == IMM &&
inst->src[0].type == BRW_REGISTER_TYPE_VF)) {
/* Our first sighting of this expression. Create an entry. */
aeb_entry *entry = ralloc(cse_ctx, aeb_entry);
entry->tmp = reg_undef;
entry->generator = inst;
aeb.push_tail(entry);
}
} else {
/* This is at least our second sighting of this expression.
* If we don't have a temporary already, make one.
*/
bool no_existing_temp = entry->tmp.file == BAD_FILE;
if (no_existing_temp && !entry->generator->dst.is_null()) {
const fs_builder ibld = fs_builder(this, block, entry->generator)
.at(block, entry->generator->next);
int written = regs_written(entry->generator);
entry->tmp = fs_reg(VGRF, alloc.allocate(written),
entry->generator->dst.type);
create_copy_instr(ibld, entry->generator, entry->tmp, false);
entry->generator->dst = entry->tmp;
}
/* dest <- temp */
if (!inst->dst.is_null()) {
assert(inst->size_written == entry->generator->size_written);
assert(inst->dst.type == entry->tmp.type);
const fs_builder ibld(this, block, inst);
create_copy_instr(ibld, inst, entry->tmp, negate);
}
/* Set our iterator so that next time through the loop inst->next
* will get the instruction in the basic block after the one we've
* removed.
*/
fs_inst *prev = (fs_inst *)inst->prev;
inst->remove(block);
inst = prev;
}
}
/* Discard jumps aren't represented in the CFG unfortunately, so we need
* to make sure that they behave as a CSE barrier, since we lack global
* dataflow information. This is particularly likely to cause problems
* with instructions dependent on the current execution mask like
* SHADER_OPCODE_FIND_LIVE_CHANNEL.
*/
if (inst->opcode == BRW_OPCODE_HALT ||
inst->opcode == SHADER_OPCODE_HALT_TARGET)
aeb.make_empty();
foreach_in_list_safe(aeb_entry, entry, &aeb) {
/* If we just wrote the flag register, kill all AEB entries that read
* it or that write a different value to it.
*/
if (inst->flags_written(devinfo)) {
bool negate; /* dummy */
if (entry->generator->flags_read(devinfo) ||
(entry->generator->flags_written(devinfo) &&
!instructions_match(inst, entry->generator, &negate))) {
entry->remove();
ralloc_free(entry);
continue;
}
}
for (int i = 0; i < entry->generator->sources; i++) {
fs_reg *src_reg = &entry->generator->src[i];
/* Kill all AEB entries that use the destination we just
* overwrote.
*/
if (regions_overlap(inst->dst, inst->size_written,
entry->generator->src[i],
entry->generator->size_read(i))) {
entry->remove();
ralloc_free(entry);
break;
}
/* Kill any AEB entries using registers that don't get reused any
* more -- a sure sign they'll fail operands_match().
*/
if (src_reg->file == VGRF && live.vgrf_end[src_reg->nr] < ip) {
entry->remove();
ralloc_free(entry);
break;
}
}
}
ip++;
}
ralloc_free(cse_ctx);
return progress;
}
bool
fs_visitor::opt_cse()
{
const fs_live_variables &live = live_analysis.require();
bool progress = false;
int ip = 0;
foreach_block (block, cfg) {
progress = opt_cse_local(live, block, ip) || progress;
}
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
return progress;
}


@ -0,0 +1,152 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_cfg.h"
/** @file brw_fs_dead_code_eliminate.cpp
*
* Dataflow-aware dead code elimination.
*
* Walks the instruction list from the bottom, removing instructions whose
* results are not used in later blocks and have not been read in the
* tail end of this block.
*/
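/* For instance (hypothetical registers), if g10 is not in the liveout
 * set of the block and nothing below reads it,
 *
 *    add(8)  g10<1>F  g2<8,8,1>F  g3<8,8,1>F
 *
 * produces a dead result and can be removed, provided it has no side
 * effects and writes no live flags.
 */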
using namespace brw;
/**
* Is it safe to eliminate the instruction?
*/
static bool
can_eliminate(const intel_device_info *devinfo, const fs_inst *inst,
BITSET_WORD *flag_live)
{
return !inst->is_control_flow() &&
!inst->has_side_effects() &&
!(flag_live[0] & inst->flags_written(devinfo)) &&
!inst->writes_accumulator;
}
/**
* Is it safe to omit the write, making the destination ARF null?
*/
static bool
can_omit_write(const fs_inst *inst)
{
switch (inst->opcode) {
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
return true;
default:
/* We can eliminate the destination write for ordinary instructions,
* but not most SENDs.
*/
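/* Opcode values below 128 are assumed here to be the ordinary hardware
 * opcodes mentioned above, with virtual opcodes numbered higher; a
 * hardware instruction carrying no message payload can safely drop its
 * destination write.
 */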
if (inst->opcode < 128 && inst->mlen == 0)
return true;
/* It might not be safe for other virtual opcodes. */
return false;
}
}
bool
fs_visitor::dead_code_eliminate()
{
bool progress = false;
const fs_live_variables &live_vars = live_analysis.require();
int num_vars = live_vars.num_vars;
BITSET_WORD *live = rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(num_vars));
BITSET_WORD *flag_live = rzalloc_array(NULL, BITSET_WORD, 1);
foreach_block_reverse_safe(block, cfg) {
memcpy(live, live_vars.block_data[block->num].liveout,
sizeof(BITSET_WORD) * BITSET_WORDS(num_vars));
memcpy(flag_live, live_vars.block_data[block->num].flag_liveout,
sizeof(BITSET_WORD));
foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
if (inst->dst.file == VGRF) {
const unsigned var = live_vars.var_from_reg(inst->dst);
bool result_live = false;
for (unsigned i = 0; i < regs_written(inst); i++)
result_live |= BITSET_TEST(live, var + i);
if (!result_live &&
(can_omit_write(inst) || can_eliminate(devinfo, inst, flag_live))) {
inst->dst = fs_reg(spread(retype(brw_null_reg(), inst->dst.type),
inst->dst.stride));
progress = true;
}
}
if (inst->dst.is_null() && can_eliminate(devinfo, inst, flag_live)) {
inst->opcode = BRW_OPCODE_NOP;
progress = true;
}
if (inst->dst.file == VGRF) {
if (!inst->is_partial_write()) {
const unsigned var = live_vars.var_from_reg(inst->dst);
for (unsigned i = 0; i < regs_written(inst); i++) {
BITSET_CLEAR(live, var + i);
}
}
}
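/* A write that is predicated or narrower than SIMD8 only partially
 * defines the flag bits, so it is not treated as killing flag liveness;
 * this mirrors the matching condition in
 * fs_live_variables::setup_def_use().
 */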
if (!inst->predicate && inst->exec_size >= 8)
flag_live[0] &= ~inst->flags_written(devinfo);
if (inst->opcode == BRW_OPCODE_NOP) {
inst->remove(block, true);
continue;
}
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF) {
int var = live_vars.var_from_reg(inst->src[i]);
for (unsigned j = 0; j < regs_read(inst, i); j++) {
BITSET_SET(live, var + j);
}
}
}
flag_live[0] |= inst->flags_read(devinfo);
}
}
cfg->adjust_block_ips();
ralloc_free(live);
ralloc_free(flag_live);
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
return progress;
}

File diff suppressed because it is too large


@ -0,0 +1,371 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#include "brw_fs.h"
#include "brw_fs_live_variables.h"
using namespace brw;
#define MAX_INSTRUCTION (1 << 30)
/** @file brw_fs_live_variables.cpp
*
* Support for calculating liveness information about virtual GRFs.
*
* This produces a live interval for each whole virtual GRF. We could
* choose to expose per-component live intervals for VGRFs of size > 1,
* but we currently do not. It is easier for the consumers of this
* information to work with whole VGRFs.
*
* However, we internally track use/def information at the per-GRF level for
* greater accuracy. Large VGRFs may be accessed piecemeal over many
* (possibly non-adjacent) instructions. In this case, examining a single
* instruction is insufficient to decide whether a whole VGRF is ultimately
* used or defined. Tracking individual components allows us to easily
* assemble this information.
*
* See Muchnick's Advanced Compiler Design and Implementation, section
* 14.1 (p444).
*/
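/* A sketch of why this matters: for a VGRF two registers long, written
 * by two instructions that each fully cover one register, no single
 * instruction defines the whole VGRF, yet the combined per-register
 * def[] bits still show that the block screens off earlier definitions.
 */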
void
fs_live_variables::setup_one_read(struct block_data *bd,
int ip, const fs_reg &reg)
{
int var = var_from_reg(reg);
assert(var < num_vars);
start[var] = MIN2(start[var], ip);
end[var] = MAX2(end[var], ip);
/* The use[] bitset marks when the block makes use of a variable (VGRF
* channel) without having completely defined that variable within the
* block.
*/
if (!BITSET_TEST(bd->def, var))
BITSET_SET(bd->use, var);
}
void
fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst,
int ip, const fs_reg &reg)
{
int var = var_from_reg(reg);
assert(var < num_vars);
start[var] = MIN2(start[var], ip);
end[var] = MAX2(end[var], ip);
/* The def[] bitset marks when an initialization in a block completely
* screens off previous updates of that variable (VGRF channel).
*/
if (inst->dst.file == VGRF) {
if (!inst->is_partial_write() && !BITSET_TEST(bd->use, var))
BITSET_SET(bd->def, var);
BITSET_SET(bd->defout, var);
}
}
/**
* Sets up the use[] and def[] bitsets.
*
* The basic-block-level live variable analysis needs to know which
* variables get used before they're completely defined, and which
* variables are completely defined before they're used.
*
* These are tracked at the per-component level, rather than whole VGRFs.
*/
void
fs_live_variables::setup_def_use()
{
int ip = 0;
foreach_block (block, cfg) {
assert(ip == block->start_ip);
if (block->num > 0)
assert(cfg->blocks[block->num - 1]->end_ip == ip - 1);
struct block_data *bd = &block_data[block->num];
foreach_inst_in_block(fs_inst, inst, block) {
/* Set use[] for this instruction */
for (unsigned int i = 0; i < inst->sources; i++) {
fs_reg reg = inst->src[i];
if (reg.file != VGRF)
continue;
for (unsigned j = 0; j < regs_read(inst, i); j++) {
setup_one_read(bd, ip, reg);
reg.offset += REG_SIZE;
}
}
bd->flag_use[0] |= inst->flags_read(devinfo) & ~bd->flag_def[0];
/* Set def[] for this instruction */
if (inst->dst.file == VGRF) {
fs_reg reg = inst->dst;
for (unsigned j = 0; j < regs_written(inst); j++) {
setup_one_write(bd, inst, ip, reg);
reg.offset += REG_SIZE;
}
}
if (!inst->predicate && inst->exec_size >= 8)
bd->flag_def[0] |= inst->flags_written(devinfo) & ~bd->flag_use[0];
ip++;
}
}
}
/**
* The algorithm incrementally sets bits in liveout and livein,
* propagating it through control flow. It will eventually terminate
* because it only ever adds bits, and stops when no bits are added in
* a pass.
*/
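/* In dataflow terms, the fixed-point loops below compute (sketch):
 *
 *    liveout(B) = union of livein(C) over children C of B
 *    livein(B)  = use(B) | (liveout(B) & ~def(B))
 *
 * with both sets masked by defout/defin to screen off uses that no
 * definition can reach.
 */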
void
fs_live_variables::compute_live_variables()
{
bool cont = true;
/* Propagate defin and defout down the CFG to calculate the union of live
* variables potentially defined along any possible control flow path.
*/
do {
cont = false;
foreach_block (block, cfg) {
const struct block_data *bd = &block_data[block->num];
foreach_list_typed(bblock_link, child_link, link, &block->children) {
struct block_data *child_bd = &block_data[child_link->block->num];
for (int i = 0; i < bitset_words; i++) {
const BITSET_WORD new_def = bd->defout[i] & ~child_bd->defin[i];
child_bd->defin[i] |= new_def;
child_bd->defout[i] |= new_def;
cont |= new_def;
}
}
}
} while (cont);
do {
cont = false;
foreach_block_reverse (block, cfg) {
struct block_data *bd = &block_data[block->num];
/* Update liveout */
foreach_list_typed(bblock_link, child_link, link, &block->children) {
struct block_data *child_bd = &block_data[child_link->block->num];
for (int i = 0; i < bitset_words; i++) {
BITSET_WORD new_liveout = (child_bd->livein[i] &
~bd->liveout[i]);
new_liveout &= bd->defout[i]; /* Screen off uses with no reaching def */
if (new_liveout)
bd->liveout[i] |= new_liveout;
}
BITSET_WORD new_liveout = (child_bd->flag_livein[0] &
~bd->flag_liveout[0]);
if (new_liveout)
bd->flag_liveout[0] |= new_liveout;
}
/* Update livein */
for (int i = 0; i < bitset_words; i++) {
BITSET_WORD new_livein = (bd->use[i] |
(bd->liveout[i] &
~bd->def[i]));
new_livein &= bd->defin[i]; /* Screen off uses with no reaching def */
if (new_livein & ~bd->livein[i]) {
bd->livein[i] |= new_livein;
cont = true;
}
}
BITSET_WORD new_livein = (bd->flag_use[0] |
(bd->flag_liveout[0] &
~bd->flag_def[0]));
if (new_livein & ~bd->flag_livein[0]) {
bd->flag_livein[0] |= new_livein;
cont = true;
}
}
} while (cont);
}
/**
* Extend the start/end ranges for each variable to account for the
* new information calculated from control flow.
*/
void
fs_live_variables::compute_start_end()
{
foreach_block (block, cfg) {
struct block_data *bd = &block_data[block->num];
unsigned i;
BITSET_FOREACH_SET(i, bd->livein, (unsigned)num_vars) {
start[i] = MIN2(start[i], block->start_ip);
end[i] = MAX2(end[i], block->start_ip);
}
BITSET_FOREACH_SET(i, bd->liveout, (unsigned)num_vars) {
start[i] = MIN2(start[i], block->end_ip);
end[i] = MAX2(end[i], block->end_ip);
}
}
}
fs_live_variables::fs_live_variables(const backend_shader *s)
: devinfo(s->devinfo), cfg(s->cfg)
{
mem_ctx = ralloc_context(NULL);
linear_ctx *lin_ctx = linear_context(mem_ctx);
num_vgrfs = s->alloc.count;
num_vars = 0;
var_from_vgrf = linear_zalloc_array(lin_ctx, int, num_vgrfs);
for (int i = 0; i < num_vgrfs; i++) {
var_from_vgrf[i] = num_vars;
num_vars += s->alloc.sizes[i];
}
vgrf_from_var = linear_zalloc_array(lin_ctx, int, num_vars);
for (int i = 0; i < num_vgrfs; i++) {
for (unsigned j = 0; j < s->alloc.sizes[i]; j++) {
vgrf_from_var[var_from_vgrf[i] + j] = i;
}
}
start = ralloc_array(mem_ctx, int, num_vars);
end = linear_zalloc_array(lin_ctx, int, num_vars);
for (int i = 0; i < num_vars; i++) {
start[i] = MAX_INSTRUCTION;
end[i] = -1;
}
vgrf_start = ralloc_array(mem_ctx, int, num_vgrfs);
vgrf_end = ralloc_array(mem_ctx, int, num_vgrfs);
for (int i = 0; i < num_vgrfs; i++) {
vgrf_start[i] = MAX_INSTRUCTION;
vgrf_end[i] = -1;
}
block_data = linear_zalloc_array(lin_ctx, struct block_data, cfg->num_blocks);
bitset_words = BITSET_WORDS(num_vars);
for (int i = 0; i < cfg->num_blocks; i++) {
block_data[i].def = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
block_data[i].use = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
block_data[i].livein = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
block_data[i].liveout = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
block_data[i].defin = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
block_data[i].defout = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
block_data[i].flag_def[0] = 0;
block_data[i].flag_use[0] = 0;
block_data[i].flag_livein[0] = 0;
block_data[i].flag_liveout[0] = 0;
}
setup_def_use();
compute_live_variables();
compute_start_end();
/* Merge the per-component live ranges to whole VGRF live ranges. */
for (int i = 0; i < num_vars; i++) {
const unsigned vgrf = vgrf_from_var[i];
vgrf_start[vgrf] = MIN2(vgrf_start[vgrf], start[i]);
vgrf_end[vgrf] = MAX2(vgrf_end[vgrf], end[i]);
}
}
fs_live_variables::~fs_live_variables()
{
ralloc_free(mem_ctx);
}
static bool
check_register_live_range(const fs_live_variables *live, int ip,
const fs_reg &reg, unsigned n)
{
const unsigned var = live->var_from_reg(reg);
if (var + n > unsigned(live->num_vars) ||
live->vgrf_start[reg.nr] > ip || live->vgrf_end[reg.nr] < ip)
return false;
for (unsigned j = 0; j < n; j++) {
if (live->start[var + j] > ip || live->end[var + j] < ip)
return false;
}
return true;
}
bool
fs_live_variables::validate(const backend_shader *s) const
{
int ip = 0;
foreach_block_and_inst(block, fs_inst, inst, s->cfg) {
for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF &&
!check_register_live_range(this, ip,
inst->src[i], regs_read(inst, i)))
return false;
}
if (inst->dst.file == VGRF &&
!check_register_live_range(this, ip, inst->dst, regs_written(inst)))
return false;
ip++;
}
return true;
}
bool
fs_live_variables::vars_interfere(int a, int b) const
{
return !(end[b] <= start[a] ||
end[a] <= start[b]);
}
bool
fs_live_variables::vgrfs_interfere(int a, int b) const
{
return !(vgrf_end[a] <= vgrf_start[b] ||
vgrf_end[b] <= vgrf_start[a]);
}


@ -0,0 +1,148 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#ifndef BRW_FS_LIVE_VARIABLES_H
#define BRW_FS_LIVE_VARIABLES_H
#include "brw_ir_analysis.h"
#include "brw_ir_fs.h"
#include "util/bitset.h"
struct cfg_t;
struct backend_shader;
namespace brw {
class fs_live_variables {
public:
struct block_data {
/**
* Which variables are defined before being used in the block.
*
* Note that for our purposes, "defined" means unconditionally, completely
* defined.
*/
BITSET_WORD *def;
/**
* Which variables are used before being defined in the block.
*/
BITSET_WORD *use;
/** Which defs reach the entry point of the block. */
BITSET_WORD *livein;
/** Which defs reach the exit point of the block. */
BITSET_WORD *liveout;
/**
* Variables such that the entry point of the block may be reached from any
* of their definitions.
*/
BITSET_WORD *defin;
/**
* Variables such that the exit point of the block may be reached from any
* of their definitions.
*/
BITSET_WORD *defout;
BITSET_WORD flag_def[1];
BITSET_WORD flag_use[1];
BITSET_WORD flag_livein[1];
BITSET_WORD flag_liveout[1];
};
fs_live_variables(const backend_shader *s);
~fs_live_variables();
bool validate(const backend_shader *s) const;
analysis_dependency_class
dependency_class() const
{
return (DEPENDENCY_INSTRUCTION_IDENTITY |
DEPENDENCY_INSTRUCTION_DATA_FLOW |
DEPENDENCY_VARIABLES);
}
bool vars_interfere(int a, int b) const;
bool vgrfs_interfere(int a, int b) const;
int var_from_reg(const fs_reg &reg) const
{
return var_from_vgrf[reg.nr] + reg.offset / REG_SIZE;
}
/** Map from virtual GRF number to index in block_data arrays. */
int *var_from_vgrf;
/**
* Map from any index in block_data to the virtual GRF containing it.
*
* For alloc.sizes of [1, 2, 3], vgrf_from_var would contain
* [0, 1, 1, 2, 2, 2].
*/
int *vgrf_from_var;
int num_vars;
int num_vgrfs;
int bitset_words;
/** @{
* Final computed live ranges for each var (each component of each virtual
* GRF).
*/
int *start;
int *end;
/** @} */
/** @{
* Final computed live ranges for each VGRF.
*/
int *vgrf_start;
int *vgrf_end;
/** @} */
/** Per-basic-block information on live variables */
struct block_data *block_data;
protected:
void setup_def_use();
void setup_one_read(struct block_data *bd, int ip, const fs_reg &reg);
void setup_one_write(struct block_data *bd, fs_inst *inst, int ip,
const fs_reg &reg);
void compute_live_variables();
void compute_start_end();
const struct intel_device_info *devinfo;
const cfg_t *cfg;
void *mem_ctx;
};
} /* namespace brw */
#endif /* BRW_FS_LIVE_VARIABLES_H */


@ -0,0 +1,306 @@
/*
* Copyright 2023 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "brw_fs.h"
#include "brw_fs_builder.h"
using namespace brw;
static void
f16_using_mac(const fs_builder &bld, fs_inst *inst)
{
/* We only intend to support configurations where the destination and
* accumulator have the same type.
*/
if (!inst->src[0].is_null())
assert(inst->dst.type == inst->src[0].type);
assert(inst->src[1].type == BRW_REGISTER_TYPE_HF);
assert(inst->src[2].type == BRW_REGISTER_TYPE_HF);
const brw_reg_type src0_type = inst->dst.type;
const brw_reg_type src1_type = BRW_REGISTER_TYPE_HF;
const brw_reg_type src2_type = BRW_REGISTER_TYPE_HF;
const fs_reg dest = inst->dst;
fs_reg src0 = inst->src[0];
const fs_reg src1 = retype(inst->src[1], src1_type);
const fs_reg src2 = retype(inst->src[2], src2_type);
const unsigned dest_stride =
dest.type == BRW_REGISTER_TYPE_HF ? REG_SIZE / 2 : REG_SIZE;
for (unsigned r = 0; r < inst->rcount; r++) {
fs_reg temp = bld.vgrf(BRW_REGISTER_TYPE_HF, 1);
for (unsigned subword = 0; subword < 2; subword++) {
for (unsigned s = 0; s < inst->sdepth; s++) {
/* The first multiply of the dot-product operation has to
* explicitly write the accumulator register. The successive MAC
* instructions will implicitly read *and* write the
* accumulator. Those MAC instructions can also optionally
* explicitly write some other register.
*
* FINISHME: The accumulator can actually hold 16 HF values. On
* Gfx12 there are two accumulators. It should be possible to do
* this in SIMD16 or even SIMD32. I was unable to get this to work
* properly.
*/
if (s == 0 && subword == 0) {
const unsigned acc_width = 8;
fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD),
inst->group % acc_width);
if (bld.shader->devinfo->verx10 >= 125) {
acc = subscript(acc, BRW_REGISTER_TYPE_HF, subword);
} else {
acc = retype(acc, BRW_REGISTER_TYPE_HF);
}
bld.MUL(acc,
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_REGISTER_TYPE_UD),
BRW_REGISTER_TYPE_HF, subword),
component(retype(byte_offset(src2, r * REG_SIZE),
BRW_REGISTER_TYPE_HF),
s * 2 + subword))
->writes_accumulator = true;
} else {
fs_reg result;
/* As mentioned above, the MAC had an optional, explicit
* destination register. Various optimization passes are not
* clever enough to understand the intricacies of this
* instruction, so only write the result register on the final
* MAC in the sequence.
*/
if ((s + 1) == inst->sdepth && subword == 1)
result = temp;
else
result = retype(bld.null_reg_ud(), BRW_REGISTER_TYPE_HF);
bld.MAC(result,
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_REGISTER_TYPE_UD),
BRW_REGISTER_TYPE_HF, subword),
component(retype(byte_offset(src2, r * REG_SIZE),
BRW_REGISTER_TYPE_HF),
s * 2 + subword))
->writes_accumulator = true;
}
}
}
if (!src0.is_null()) {
if (src0_type != BRW_REGISTER_TYPE_HF) {
fs_reg temp2 = bld.vgrf(src0_type, 1);
bld.MOV(temp2, temp);
bld.ADD(byte_offset(dest, r * dest_stride),
temp2,
byte_offset(src0, r * dest_stride));
} else {
bld.ADD(byte_offset(dest, r * dest_stride),
temp,
byte_offset(src0, r * dest_stride));
}
} else {
bld.MOV(byte_offset(dest, r * dest_stride), temp);
}
}
}
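/* DP4A computes a four-element dot product of packed bytes plus an
 * accumulator in a single instruction, roughly
 * dst = src0 + sum_{i<4}(src1.b[i] * src2.b[i]); the expansion below
 * chains it once per systolic-depth step (a sketch of the intent, not
 * additional hardware documentation).
 */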
static void
int8_using_dp4a(const fs_builder &bld, fs_inst *inst)
{
/* We only intend to support configurations where the destination and
* accumulator have the same type.
*/
if (!inst->src[0].is_null())
assert(inst->dst.type == inst->src[0].type);
assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
inst->src[1].type == BRW_REGISTER_TYPE_UB);
assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
inst->src[2].type == BRW_REGISTER_TYPE_UB);
const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
fs_reg dest = inst->dst;
fs_reg src0 = inst->src[0];
const fs_reg src1 = retype(inst->src[1], src1_type);
const fs_reg src2 = retype(inst->src[2], src2_type);
const unsigned dest_stride = REG_SIZE;
for (unsigned r = 0; r < inst->rcount; r++) {
if (!src0.is_null()) {
bld.MOV(dest, src0);
src0 = byte_offset(src0, dest_stride);
} else {
bld.MOV(dest, retype(brw_imm_d(0), dest.type));
}
for (unsigned s = 0; s < inst->sdepth; s++) {
bld.DP4A(dest,
dest,
byte_offset(src1, s * REG_SIZE),
component(byte_offset(src2, r * REG_SIZE), s))
->saturate = inst->saturate;
}
dest = byte_offset(dest, dest_stride);
}
}
static void
int8_using_mul_add(const fs_builder &bld, fs_inst *inst)
{
/* We only intend to support configurations where the destination and
* accumulator have the same type.
*/
if (!inst->src[0].is_null())
assert(inst->dst.type == inst->src[0].type);
assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
inst->src[1].type == BRW_REGISTER_TYPE_UB);
assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
inst->src[2].type == BRW_REGISTER_TYPE_UB);
const brw_reg_type src0_type = inst->dst.type;
const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
fs_reg dest = inst->dst;
fs_reg src0 = inst->src[0];
const fs_reg src1 = retype(inst->src[1], src1_type);
const fs_reg src2 = retype(inst->src[2], src2_type);
const unsigned dest_stride = REG_SIZE;
for (unsigned r = 0; r < inst->rcount; r++) {
if (!src0.is_null()) {
bld.MOV(dest, src0);
src0 = byte_offset(src0, dest_stride);
} else {
bld.MOV(dest, retype(brw_imm_d(0), dest.type));
}
for (unsigned s = 0; s < inst->sdepth; s++) {
fs_reg temp1 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
fs_reg temp2 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
fs_reg temp3 = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
const brw_reg_type temp_type =
(inst->src[1].type == BRW_REGISTER_TYPE_B ||
inst->src[2].type == BRW_REGISTER_TYPE_B)
? BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW;
/* Expand 8 dwords of packed bytes into 16 dwords of packed
* words.
*
* FINISHME: Gfx9 should not need this workaround. Gfx11
* may be able to use integer MAD. Both platforms may be
* able to use MAC.
*/
bld.group(32, 0).MOV(retype(temp3, temp_type),
retype(byte_offset(src2, r * REG_SIZE),
inst->src[2].type));
bld.MUL(subscript(temp1, temp_type, 0),
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_REGISTER_TYPE_UD),
inst->src[1].type, 0),
subscript(component(retype(temp3,
BRW_REGISTER_TYPE_UD),
s * 2),
temp_type, 0));
bld.MUL(subscript(temp1, temp_type, 1),
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_REGISTER_TYPE_UD),
inst->src[1].type, 1),
subscript(component(retype(temp3,
BRW_REGISTER_TYPE_UD),
s * 2),
temp_type, 1));
bld.MUL(subscript(temp2, temp_type, 0),
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_REGISTER_TYPE_UD),
inst->src[1].type, 2),
subscript(component(retype(temp3,
BRW_REGISTER_TYPE_UD),
s * 2 + 1),
temp_type, 0));
bld.MUL(subscript(temp2, temp_type, 1),
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_REGISTER_TYPE_UD),
inst->src[1].type, 3),
subscript(component(retype(temp3,
BRW_REGISTER_TYPE_UD),
s * 2 + 1),
temp_type, 1));
bld.ADD(subscript(temp1, src0_type, 0),
subscript(temp1, temp_type, 0),
subscript(temp1, temp_type, 1));
bld.ADD(subscript(temp2, src0_type, 0),
subscript(temp2, temp_type, 0),
subscript(temp2, temp_type, 1));
bld.ADD(retype(temp1, src0_type),
retype(temp1, src0_type),
retype(temp2, src0_type));
bld.ADD(dest, dest, retype(temp1, src0_type))
->saturate = inst->saturate;
}
dest = byte_offset(dest, dest_stride);
}
}
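/* Lower DPAS instructions into sequences the EU can execute directly:
 * floating-point DPAS via MAC, integer DPAS via DP4A on Gfx12+, and via
 * MUL/ADD on everything older.
 */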
bool
brw_lower_dpas(fs_visitor &v)
{
bool progress = false;
foreach_block_and_inst_safe(block, fs_inst, inst, v.cfg) {
if (inst->opcode != BRW_OPCODE_DPAS)
continue;
const fs_builder bld = fs_builder(&v, block, inst).group(8, 0).exec_all();
if (brw_reg_type_is_floating_point(inst->dst.type)) {
f16_using_mac(bld, inst);
} else {
if (v.devinfo->ver >= 12) {
int8_using_dp4a(bld, inst);
} else {
int8_using_mul_add(bld, inst);
}
}
inst->remove(block);
progress = true;
}
if (progress)
v.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
return progress;
}


@@ -0,0 +1,92 @@
/*
* Copyright © 2015 Connor Abbott
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "util/half_float.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_fs_builder.h"
using namespace brw;
bool
fs_visitor::lower_pack()
{
bool progress = false;
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
if (inst->opcode != FS_OPCODE_PACK &&
inst->opcode != FS_OPCODE_PACK_HALF_2x16_SPLIT)
continue;
assert(inst->dst.file == VGRF);
assert(inst->saturate == false);
fs_reg dst = inst->dst;
const fs_builder ibld(this, block, inst);
/* The lowering generates 2 instructions for what was previously 1. This
* can trick the IR to believe we're doing partial writes, but the
* register is actually fully written. Mark it as undef to help the IR
* reduce the liveness of the register.
*/
if (!inst->is_partial_write())
ibld.emit_undef_for_dst(inst);
switch (inst->opcode) {
case FS_OPCODE_PACK:
for (unsigned i = 0; i < inst->sources; i++)
ibld.MOV(subscript(dst, inst->src[i].type, i), inst->src[i]);
break;
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
assert(dst.type == BRW_REGISTER_TYPE_UD);
for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].file == IMM) {
const uint32_t half = _mesa_float_to_half(inst->src[i].f);
ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, i),
brw_imm_uw(half));
} else if (i == 1 && devinfo->ver < 9) {
/* Pre-Skylake requires DWord aligned destinations */
fs_reg tmp = ibld.vgrf(BRW_REGISTER_TYPE_UD);
ibld.F32TO16(subscript(tmp, BRW_REGISTER_TYPE_HF, 0),
inst->src[i]);
ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, 1),
subscript(tmp, BRW_REGISTER_TYPE_UW, 0));
} else {
ibld.F32TO16(subscript(dst, BRW_REGISTER_TYPE_HF, i),
inst->src[i]);
}
}
break;
default:
unreachable("skipped above");
}
inst->remove(block);
progress = true;
}
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
return progress;
}


@@ -0,0 +1,677 @@
/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_fs_builder.h"
using namespace brw;
namespace {
/* From the SKL PRM Vol 2a, "Move":
*
* "A mov with the same source and destination type, no source modifier,
* and no saturation is a raw move. A packed byte destination region (B
* or UB type with HorzStride == 1 and ExecSize > 1) can only be written
* using raw move."
*/
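/* For instance, mov(16) g10<1>UB g20<8,8,1>UB qualifies as a raw byte
 * move, while the same MOV from a UW source does not, since the implicit
 * type conversion makes it more than a raw copy.
 */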
bool
is_byte_raw_mov(const fs_inst *inst)
{
return type_sz(inst->dst.type) == 1 &&
inst->opcode == BRW_OPCODE_MOV &&
inst->src[0].type == inst->dst.type &&
!inst->saturate &&
!inst->src[0].negate &&
!inst->src[0].abs;
}
/*
* Return an acceptable byte stride for the destination of an instruction
* that requires it to have some particular alignment.
*/
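/* E.g. a MOV from a D source into a B destination is a narrowing
 * conversion, so the B destination must use a byte stride of 4 (the
 * execution type size) rather than a packed stride of 1.
 */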
unsigned
required_dst_byte_stride(const fs_inst *inst)
{
if (inst->dst.is_accumulator()) {
/* If the destination is an accumulator, insist that we leave the
* stride alone. We cannot "fix" accumulator destinations by writing
* to a temporary and emitting a MOV into the original destination.
* For multiply instructions (our one use of the accumulator), the
* MUL writes the full 66 bits of the accumulator whereas the MOV we
* would emit only writes 33 bits and leaves the top 33 bits
* undefined.
*
* It's safe to just require the original stride here because the
* lowering pass will detect the mismatch in has_invalid_src_region
* and fix the sources of the multiply instead of the destination.
*/
return inst->dst.stride * type_sz(inst->dst.type);
} else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
!is_byte_raw_mov(inst)) {
return get_exec_type_size(inst);
} else {
/* Calculate the maximum byte stride and the minimum/maximum type
* size across all source and destination operands we are required to
* lower.
*/
unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
unsigned min_size = type_sz(inst->dst.type);
unsigned max_size = type_sz(inst->dst.type);
for (unsigned i = 0; i < inst->sources; i++) {
if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
const unsigned size = type_sz(inst->src[i].type);
max_stride = MAX2(max_stride, inst->src[i].stride * size);
min_size = MIN2(min_size, size);
max_size = MAX2(max_size, size);
}
}
/* All operands involved in lowering need to fit in the calculated
* stride.
*/
assert(max_size <= 4 * min_size);
/* Attempt to use the largest byte stride among all present operands,
* but never exceed a stride of 4 since that would lead to illegal
* destination regions during lowering.
*/
return MIN2(max_stride, 4 * min_size);
}
}
/*
* Return an acceptable byte sub-register offset for the destination of an
* instruction that requires it to be aligned to the sub-register offset of
* the sources.
*/
unsigned
required_dst_byte_offset(const intel_device_info *devinfo, const fs_inst *inst)
{
for (unsigned i = 0; i < inst->sources; i++) {
if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
return 0;
}
return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
}
/*
* Return the closest legal execution type for an instruction on
* the specified platform.
*/
brw_reg_type
required_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
{
const brw_reg_type t = get_exec_type(inst);
const bool has_64bit = brw_reg_type_is_floating_point(t) ?
devinfo->has_64bit_float : devinfo->has_64bit_int;
switch (inst->opcode) {
case SHADER_OPCODE_SHUFFLE:
/* IVB has an issue (which we found empirically) where it reads
* two address register components per channel for indirectly
* addressed 64-bit sources.
*
* From the Cherryview PRM Vol 7. "Register Region Restrictions":
*
* "When source or destination datatype is 64b or operation is
* integer DWord multiply, indirect addressing must not be
* used."
*
* Work around both of the above and handle platforms that
* don't support 64-bit types at all.
*/
if ((!devinfo->has_64bit_int ||
devinfo->platform == INTEL_PLATFORM_CHV ||
intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
return BRW_REGISTER_TYPE_UD;
else if (has_dst_aligned_region_restriction(devinfo, inst))
return brw_int_type(type_sz(t), false);
else
return t;
case SHADER_OPCODE_SEL_EXEC:
if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
type_sz(t) > 4)
return BRW_REGISTER_TYPE_UD;
else
return t;
case SHADER_OPCODE_QUAD_SWIZZLE:
if (has_dst_aligned_region_restriction(devinfo, inst))
return brw_int_type(type_sz(t), false);
else
return t;
case SHADER_OPCODE_CLUSTER_BROADCAST:
/* From the Cherryview PRM Vol 7. "Register Region Restrictions":
*
* "When source or destination datatype is 64b or operation is
* integer DWord multiply, indirect addressing must not be
* used."
*
* For MTL (verx10 == 125), float64 is supported, but int64 is not.
* Therefore we need to lower cluster broadcast using 32-bit int ops.
*
* For gfx12.5+ platforms that support int64, the register regions
* used by cluster broadcast aren't supported by the 64-bit pipeline.
*
* Work around the above and handle platforms that don't
* support 64-bit types at all.
*/
if ((!has_64bit || devinfo->verx10 >= 125 ||
devinfo->platform == INTEL_PLATFORM_CHV ||
intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
return BRW_REGISTER_TYPE_UD;
else
return brw_int_type(type_sz(t), false);
case SHADER_OPCODE_BROADCAST:
case SHADER_OPCODE_MOV_INDIRECT:
if (((devinfo->verx10 == 70 ||
devinfo->platform == INTEL_PLATFORM_CHV ||
intel_device_info_is_9lp(devinfo) ||
devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
(devinfo->verx10 >= 125 &&
brw_reg_type_is_floating_point(inst->src[0].type)))
return brw_int_type(type_sz(t), false);
else
return t;
default:
return t;
}
}
/*
* Return the stride between channels of the specified register in
* byte units, or ~0u if the region cannot be represented with a
* single one-dimensional stride.
*/
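/* E.g. a FIXED_GRF region like <8;8,1>:F advances 4 bytes per channel,
 * while <4;4,0>:F (zero horizontal stride with width > 1) has no single
 * equivalent stride and yields ~0u.
 */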
unsigned
byte_stride(const fs_reg &reg)
{
switch (reg.file) {
case BAD_FILE:
case UNIFORM:
case IMM:
case VGRF:
case MRF:
case ATTR:
return reg.stride * type_sz(reg.type);
case ARF:
case FIXED_GRF:
if (reg.is_null()) {
return 0;
} else {
const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
const unsigned width = 1 << reg.width;
if (width == 1) {
return vstride * type_sz(reg.type);
} else if (hstride * width == vstride) {
return hstride * type_sz(reg.type);
} else {
return ~0u;
}
}
default:
unreachable("Invalid register file");
}
}
/*
* Return whether the instruction has an unsupported channel bit layout
* specified for the i-th source region.
*/
bool
has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
unsigned i)
{
if (is_send(inst) || inst->is_math() || inst->is_control_source(i) ||
inst->opcode == BRW_OPCODE_DPAS) {
return false;
}
/* Empirical testing shows that Broadwell has a bug affecting half-float
* MAD instructions when any of its sources has a non-zero offset, such
* as:
*
* mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
*
* We used to generate code like this for SIMD8 executions where we
* used to pack components Y and W of a vector at offset 16B of a SIMD
* register. The problem doesn't occur if the stride of the source is 0.
*/
if (devinfo->ver == 8 &&
inst->opcode == BRW_OPCODE_MAD &&
inst->src[i].type == BRW_REGISTER_TYPE_HF &&
reg_offset(inst->src[i]) % REG_SIZE > 0 &&
inst->src[i].stride != 0) {
return true;
}
const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
return has_dst_aligned_region_restriction(devinfo, inst) &&
!is_uniform(inst->src[i]) &&
(byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
src_byte_offset != dst_byte_offset);
}
/*
* Return whether the instruction has an unsupported channel bit layout
* specified for the destination region.
*/
bool
has_invalid_dst_region(const intel_device_info *devinfo,
const fs_inst *inst)
{
if (is_send(inst) || inst->is_math()) {
return false;
} else {
const brw_reg_type exec_type = get_exec_type(inst);
const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
type_sz(inst->dst.type) < type_sz(exec_type);
return (has_dst_aligned_region_restriction(devinfo, inst) &&
(required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
(is_narrowing_conversion &&
required_dst_byte_stride(inst) != byte_stride(inst->dst));
}
}
/**
* Return a non-zero value if the execution type of the instruction is
* unsupported. The destination and sources matching the returned mask
* will be bit-cast to an integer type of appropriate size, lowering any
* source or destination modifiers into separate MOV instructions.
*/
unsigned
has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
{
if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
switch (inst->opcode) {
case SHADER_OPCODE_SHUFFLE:
case SHADER_OPCODE_QUAD_SWIZZLE:
case SHADER_OPCODE_CLUSTER_BROADCAST:
case SHADER_OPCODE_BROADCAST:
case SHADER_OPCODE_MOV_INDIRECT:
return 0x1;
case SHADER_OPCODE_SEL_EXEC:
return 0x3;
default:
unreachable("Unknown invalid execution type source mask.");
}
} else {
return 0;
}
}
/*
* Return whether the instruction has unsupported source modifiers
* specified for the i-th source region.
*/
bool
has_invalid_src_modifiers(const intel_device_info *devinfo,
const fs_inst *inst, unsigned i)
{
return (!inst->can_do_source_mods(devinfo) &&
(inst->src[i].negate || inst->src[i].abs)) ||
((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
(inst->src[i].negate || inst->src[i].abs ||
inst->src[i].type != get_exec_type(inst)));
}
/*
* Return whether the instruction has an unsupported type conversion
* specified for the destination.
*/
bool
has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
{
switch (inst->opcode) {
case BRW_OPCODE_MOV:
return false;
case BRW_OPCODE_SEL:
return inst->dst.type != get_exec_type(inst);
default:
/* FIXME: We assume the opcodes not explicitly mentioned before just
* work fine with arbitrary conversions, unless they need to be
* bit-cast.
*/
return has_invalid_exec_type(devinfo, inst) &&
inst->dst.type != get_exec_type(inst);
}
}
/**
* Return whether the instruction has unsupported destination modifiers.
*/
bool
has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
{
return (has_invalid_exec_type(devinfo, inst) &&
(inst->saturate || inst->conditional_mod)) ||
has_invalid_conversion(devinfo, inst);
}
/**
* Return whether the instruction has non-standard semantics for the
* conditional mod which don't cause the flag register to be updated with
* the comparison result.
*/
bool
has_inconsistent_cmod(const fs_inst *inst)
{
return inst->opcode == BRW_OPCODE_SEL ||
inst->opcode == BRW_OPCODE_CSEL ||
inst->opcode == BRW_OPCODE_IF ||
inst->opcode == BRW_OPCODE_WHILE;
}
bool
lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
}
namespace brw {
/**
* Remove any modifiers from the \p i-th source region of the instruction,
* including negate, abs and any implicit type conversion to the execution
* type. Instead any source modifiers will be implemented as a separate
* MOV instruction prior to the original instruction.
*/
bool
lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
{
assert(inst->components_read(i) == 1);
assert(v->devinfo->has_integer_dword_mul ||
inst->opcode != BRW_OPCODE_MUL ||
brw_reg_type_is_floating_point(get_exec_type(inst)) ||
MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
type_sz(inst->src[i].type) == get_exec_type_size(inst));
const fs_builder ibld(v, block, inst);
const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
inst->src[i] = tmp;
return true;
}
}
namespace {
/**
* Remove any modifiers from the destination region of the instruction,
* including saturate, conditional mod and any implicit type conversion
* from the execution type. Instead any destination modifiers will be
* implemented as a separate MOV instruction after the original
* instruction.
*/
bool
lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
{
const fs_builder ibld(v, block, inst);
const brw_reg_type type = get_exec_type(inst);
/* Not strictly necessary, but if possible use a temporary with the same
* channel alignment as the current destination in order to avoid
* violating the restrictions enforced later on by lower_src_region()
* and lower_dst_region(), which would introduce additional copy
* instructions into the program unnecessarily.
*/
const unsigned stride =
type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
fs_reg tmp = ibld.vgrf(type, stride);
ibld.UNDEF(tmp);
tmp = horiz_stride(tmp, stride);
/* Emit a MOV taking care of all the destination modifiers. */
fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
mov->saturate = inst->saturate;
if (!has_inconsistent_cmod(inst))
mov->conditional_mod = inst->conditional_mod;
if (inst->opcode != BRW_OPCODE_SEL) {
mov->predicate = inst->predicate;
mov->predicate_inverse = inst->predicate_inverse;
}
mov->flag_subreg = inst->flag_subreg;
lower_instruction(v, block, mov);
/* Point the original instruction at the temporary, and clean up any
* destination modifiers.
*/
assert(inst->size_written == inst->dst.component_size(inst->exec_size));
inst->dst = tmp;
inst->size_written = inst->dst.component_size(inst->exec_size);
inst->saturate = false;
if (!has_inconsistent_cmod(inst))
inst->conditional_mod = BRW_CONDITIONAL_NONE;
assert(!inst->flags_written(v->devinfo) || !mov->predicate);
return true;
}
/**
* Remove any non-trivial shuffling of data from the \p i-th source region
* of the instruction. Instead implement the region as a series of integer
* copies into a temporary with the same channel layout as the destination.
*/
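/* E.g. if a W-typed source with stride 1 feeds a D-typed destination with
 * stride 1, the source is first copied into a W temporary with stride 2 so
 * that both regions advance 4 bytes per channel.
 */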
bool
lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
{
assert(inst->components_read(i) == 1);
const fs_builder ibld(v, block, inst);
const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
type_sz(inst->src[i].type);
assert(stride > 0);
fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
ibld.UNDEF(tmp);
tmp = horiz_stride(tmp, stride);
/* Emit a series of 32-bit integer copies with any source modifiers
* cleaned up (because their semantics are dependent on the type).
*/
const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
false);
const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
fs_reg raw_src = inst->src[i];
raw_src.negate = false;
raw_src.abs = false;
for (unsigned j = 0; j < n; j++)
ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));
/* Point the original instruction at the temporary, making sure to keep
* any source modifiers in the instruction.
*/
fs_reg lower_src = tmp;
lower_src.negate = inst->src[i].negate;
lower_src.abs = inst->src[i].abs;
inst->src[i] = lower_src;
return true;
}
/**
* Remove any non-trivial shuffling of data from the destination region of
* the instruction. Instead implement the region as a series of integer
* copies from a temporary with a channel layout compatible with the
* sources.
*/
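/* E.g. a conversion writing a packed B destination is redirected to a
 * stride-4 B temporary, and a raw byte-wise copy then writes the result
 * back into the original packed region.
 */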
bool
lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
{
/* We cannot replace the result of an integer multiply which writes the
* accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
* value whereas the MOV will act on only 32 or 33 bits of the
* accumulator.
*/
assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
brw_reg_type_is_floating_point(inst->dst.type));
const fs_builder ibld(v, block, inst);
const unsigned stride = required_dst_byte_stride(inst) /
type_sz(inst->dst.type);
assert(stride > 0);
fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
ibld.UNDEF(tmp);
tmp = horiz_stride(tmp, stride);
/* Emit a series of 32-bit integer copies from the temporary into the
* original destination.
*/
const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
false);
const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
/* Note that in general we cannot simply predicate the copies on the
* same flag register as the original instruction, since it may have
* been overwritten by the instruction itself. Instead initialize
* the temporary with the previous contents of the destination
* register.
*/
for (unsigned j = 0; j < n; j++)
ibld.MOV(subscript(tmp, raw_type, j),
subscript(inst->dst, raw_type, j));
}
for (unsigned j = 0; j < n; j++)
ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
subscript(tmp, raw_type, j));
/* Point the original instruction at the temporary, making sure to keep
* any destination modifiers in the instruction.
*/
assert(inst->size_written == inst->dst.component_size(inst->exec_size));
inst->dst = tmp;
inst->size_written = inst->dst.component_size(inst->exec_size);
return true;
}
/**
* Change sources and destination of the instruction to an
* appropriate legal type, splitting the instruction into multiple
* ones of smaller execution type if necessary, to be used in cases
* where the execution type of an instruction is unsupported.
*/
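/* E.g. a 64-bit SEL_EXEC on a platform without 64-bit integer support is
 * split into two UD instructions, one operating on the low dwords and one
 * on the high dwords of each channel.
 */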
bool
lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
{
assert(inst->dst.type == get_exec_type(inst));
const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
const brw_reg_type raw_type = required_exec_type(v->devinfo, inst);
const unsigned n = get_exec_type_size(inst) / type_sz(raw_type);
const fs_builder ibld(v, block, inst);
fs_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
ibld.UNDEF(tmp);
tmp = horiz_stride(tmp, inst->dst.stride);
for (unsigned j = 0; j < n; j++) {
fs_inst sub_inst = *inst;
for (unsigned i = 0; i < inst->sources; i++) {
if (mask & (1u << i)) {
assert(inst->src[i].type == inst->dst.type);
sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
}
}
sub_inst.dst = subscript(tmp, raw_type, j);
assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
ibld.emit(sub_inst);
fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
subscript(tmp, raw_type, j));
if (inst->opcode != BRW_OPCODE_SEL) {
mov->predicate = inst->predicate;
mov->predicate_inverse = inst->predicate_inverse;
}
lower_instruction(v, block, mov);
}
inst->remove(block);
return true;
}
/**
* Legalize the source and destination regioning controls of the specified
* instruction.
*/
bool
lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
{
const intel_device_info *devinfo = v->devinfo;
bool progress = false;
if (has_invalid_dst_modifiers(devinfo, inst))
progress |= lower_dst_modifiers(v, block, inst);
if (has_invalid_dst_region(devinfo, inst))
progress |= lower_dst_region(v, block, inst);
for (unsigned i = 0; i < inst->sources; i++) {
if (has_invalid_src_modifiers(devinfo, inst, i))
progress |= lower_src_modifiers(v, block, inst, i);
if (has_invalid_src_region(devinfo, inst, i))
progress |= lower_src_region(v, block, inst, i);
}
if (has_invalid_exec_type(devinfo, inst))
progress |= lower_exec_type(v, block, inst);
return progress;
}
}
bool
fs_visitor::lower_regioning()
{
bool progress = false;
foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
progress |= lower_instruction(this, block, inst);
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
return progress;
}

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,349 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/** @file brw_fs_register_coalesce.cpp
*
* Implements register coalescing: Checks if the two registers involved in a
* raw move don't interfere, in which case they can both be stored in the same
* place and the MOV removed.
*
* To do this, all uses of the source of the MOV in the shader are replaced
* with the destination of the MOV. For example:
*
* add vgrf3:F, vgrf1:F, vgrf2:F
* mov vgrf4:F, vgrf3:F
* mul vgrf5:F, vgrf5:F, vgrf4:F
*
* becomes
*
* add vgrf4:F, vgrf1:F, vgrf2:F
* mul vgrf5:F, vgrf5:F, vgrf4:F
*/
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_fs_live_variables.h"
using namespace brw;
static bool
is_nop_mov(const fs_inst *inst)
{
if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
fs_reg dst = inst->dst;
for (int i = 0; i < inst->sources; i++) {
if (!dst.equals(inst->src[i])) {
return false;
}
dst.offset += (i < inst->header_size ? REG_SIZE :
inst->exec_size * dst.stride *
type_sz(inst->src[i].type));
}
return true;
} else if (inst->opcode == BRW_OPCODE_MOV) {
return inst->dst.equals(inst->src[0]);
}
return false;
}
static bool
is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst)
{
if ((inst->opcode != BRW_OPCODE_MOV &&
inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) ||
inst->is_partial_write() ||
inst->saturate ||
inst->src[0].file != VGRF ||
inst->src[0].negate ||
inst->src[0].abs ||
!inst->src[0].is_contiguous() ||
inst->dst.file != VGRF ||
inst->dst.type != inst->src[0].type) {
return false;
}
if (v->alloc.sizes[inst->src[0].nr] >
v->alloc.sizes[inst->dst.nr])
return false;
if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
if (!is_coalescing_payload(v->alloc, inst)) {
return false;
}
}
return true;
}
static bool
can_coalesce_vars(const fs_live_variables &live, const cfg_t *cfg,
const bblock_t *block, const fs_inst *inst,
int dst_var, int src_var)
{
if (!live.vars_interfere(src_var, dst_var))
return true;
int dst_start = live.start[dst_var];
int dst_end = live.end[dst_var];
int src_start = live.start[src_var];
int src_end = live.end[src_var];
/* Variables interfere and one live range isn't a subset of the other. */
if ((dst_end > src_end && src_start < dst_start) ||
(src_end > dst_end && dst_start < src_start))
return false;
/* Check for a write to either register in the intersection of their live
* ranges.
*/
int start_ip = MAX2(dst_start, src_start);
int end_ip = MIN2(dst_end, src_end);
foreach_block(scan_block, cfg) {
if (scan_block->end_ip < start_ip)
continue;
int scan_ip = scan_block->start_ip - 1;
bool seen_src_write = false;
bool seen_copy = false;
foreach_inst_in_block(fs_inst, scan_inst, scan_block) {
scan_ip++;
/* Ignore anything before the intersection of the live ranges */
if (scan_ip < start_ip)
continue;
/* Ignore the copying instruction itself */
if (scan_inst == inst) {
seen_copy = true;
continue;
}
if (scan_ip > end_ip)
return true; /* registers do not interfere */
if (seen_src_write && !seen_copy) {
/* In order to satisfy the guarantee of register coalescing, we
* must ensure that the two registers always have the same value
* during the intersection of their live ranges. One way to do
* this is to simply ensure that neither is ever written apart
* from the one copy which syncs up the two registers. However,
* this can be overly conservative and only works in the case
* where the destination live range is entirely contained in the
* source live range.
*
* To handle the other case where the source is contained in the
* destination, we allow writes to the source register as long as
* they happen before the copy, in the same block as the copy, and
* the destination is never read between the first such write and the
* copy. This effectively moves the write from the copy up.
*/
for (int j = 0; j < scan_inst->sources; j++) {
if (regions_overlap(scan_inst->src[j], scan_inst->size_read(j),
inst->dst, inst->size_written))
return false; /* registers interfere */
}
}
/* The MOV being coalesced had better be the only instruction which
* writes to the coalesce destination in the intersection.
*/
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->dst, inst->size_written))
return false; /* registers interfere */
/* See the big comment above */
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->src[0], inst->size_read(0))) {
if (seen_copy || scan_block != block ||
(scan_inst->force_writemask_all && !inst->force_writemask_all))
return false;
seen_src_write = true;
}
}
}
return true;
}
bool
fs_visitor::register_coalesce()
{
bool progress = false;
fs_live_variables &live = live_analysis.require();
int src_size = 0;
int channels_remaining = 0;
unsigned src_reg = ~0u, dst_reg = ~0u;
int *dst_reg_offset = new int[MAX_VGRF_SIZE(devinfo)];
fs_inst **mov = new fs_inst *[MAX_VGRF_SIZE(devinfo)];
int *dst_var = new int[MAX_VGRF_SIZE(devinfo)];
int *src_var = new int[MAX_VGRF_SIZE(devinfo)];
foreach_block_and_inst(block, fs_inst, inst, cfg) {
if (!is_coalesce_candidate(this, inst))
continue;
if (is_nop_mov(inst)) {
inst->opcode = BRW_OPCODE_NOP;
progress = true;
continue;
}
if (src_reg != inst->src[0].nr) {
src_reg = inst->src[0].nr;
src_size = alloc.sizes[inst->src[0].nr];
assert(src_size <= MAX_VGRF_SIZE(devinfo));
channels_remaining = src_size;
memset(mov, 0, sizeof(*mov) * MAX_VGRF_SIZE(devinfo));
dst_reg = inst->dst.nr;
}
if (dst_reg != inst->dst.nr)
continue;
if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
for (int i = 0; i < src_size; i++) {
dst_reg_offset[i] = i;
}
mov[0] = inst;
channels_remaining -= regs_written(inst);
} else {
const int offset = inst->src[0].offset / REG_SIZE;
if (mov[offset]) {
/* This is the second time that this offset in the register has
* been set. This means, in particular, that inst->dst was
* live before this instruction and that the live ranges of
* inst->dst and inst->src[0] overlap and we can't coalesce the
* two variables. Let's ensure that doesn't happen.
*/
channels_remaining = -1;
continue;
}
for (unsigned i = 0; i < MAX2(inst->size_written / REG_SIZE, 1); i++)
dst_reg_offset[offset + i] = inst->dst.offset / REG_SIZE + i;
mov[offset] = inst;
channels_remaining -= regs_written(inst);
}
if (channels_remaining)
continue;
bool can_coalesce = true;
for (int i = 0; i < src_size; i++) {
if (dst_reg_offset[i] != dst_reg_offset[0] + i) {
/* Registers are out-of-order. */
can_coalesce = false;
src_reg = ~0u;
break;
}
dst_var[i] = live.var_from_vgrf[dst_reg] + dst_reg_offset[i];
src_var[i] = live.var_from_vgrf[src_reg] + i;
if (!can_coalesce_vars(live, cfg, block, inst, dst_var[i], src_var[i])) {
can_coalesce = false;
src_reg = ~0u;
break;
}
}
if (!can_coalesce)
continue;
progress = true;
for (int i = 0; i < src_size; i++) {
if (!mov[i])
continue;
if (mov[i]->conditional_mod == BRW_CONDITIONAL_NONE) {
mov[i]->opcode = BRW_OPCODE_NOP;
mov[i]->dst = reg_undef;
for (int j = 0; j < mov[i]->sources; j++) {
mov[i]->src[j] = reg_undef;
}
} else {
/* If we have a conditional modifier, rewrite the MOV to be a
* MOV.cmod from the coalesced register. Hopefully, cmod
* propagation will clean this up and move it to the instruction
* that writes the register. If not, this keeps things correct
* while still letting us coalesce.
*/
assert(mov[i]->opcode == BRW_OPCODE_MOV);
assert(mov[i]->sources == 1);
mov[i]->src[0] = mov[i]->dst;
mov[i]->dst = retype(brw_null_reg(), mov[i]->dst.type);
}
}
foreach_block_and_inst(block, fs_inst, scan_inst, cfg) {
if (scan_inst->dst.file == VGRF &&
scan_inst->dst.nr == src_reg) {
scan_inst->dst.nr = dst_reg;
scan_inst->dst.offset = scan_inst->dst.offset % REG_SIZE +
dst_reg_offset[scan_inst->dst.offset / REG_SIZE] * REG_SIZE;
}
for (int j = 0; j < scan_inst->sources; j++) {
if (scan_inst->src[j].file == VGRF &&
scan_inst->src[j].nr == src_reg) {
scan_inst->src[j].nr = dst_reg;
scan_inst->src[j].offset = scan_inst->src[j].offset % REG_SIZE +
dst_reg_offset[scan_inst->src[j].offset / REG_SIZE] * REG_SIZE;
}
}
}
for (int i = 0; i < src_size; i++) {
live.start[dst_var[i]] = MIN2(live.start[dst_var[i]],
live.start[src_var[i]]);
live.end[dst_var[i]] = MAX2(live.end[dst_var[i]],
live.end[src_var[i]]);
}
src_reg = ~0u;
}
if (progress) {
foreach_block_and_inst_safe (block, backend_instruction, inst, cfg) {
if (inst->opcode == BRW_OPCODE_NOP) {
inst->remove(block, true);
}
}
cfg->adjust_block_ips();
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}
delete[] src_var;
delete[] dst_var;
delete[] mov;
delete[] dst_reg_offset;
return progress;
}


@@ -0,0 +1,165 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_cfg.h"
using namespace brw;
/** @file brw_fs_saturate_propagation.cpp
*
* Implements a pass that propagates the SAT modifier from a MOV.SAT into the
* instruction that produced the source of the MOV.SAT, thereby allowing the
* MOV's src and dst to be coalesced and the MOV removed.
*
* For instance,
*
* ADD tmp, src0, src1
* MOV.SAT dst, tmp
*
* would be transformed into
*
* ADD.SAT tmp, src0, src1
* MOV dst, tmp
*/
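/* The pass can also fold a negation on the MOV's source into the producing
 * instruction when it knows how to negate it, e.g.
 *
 *    MUL tmp, src0, src1
 *    MOV.SAT dst, -tmp
 *
 * becomes
 *
 *    MUL.SAT tmp, -src0, src1
 *    MOV dst, tmp
 */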
static bool
opt_saturate_propagation_local(const fs_live_variables &live, bblock_t *block)
{
bool progress = false;
int ip = block->end_ip + 1;
foreach_inst_in_block_reverse(fs_inst, inst, block) {
ip--;
if (inst->opcode != BRW_OPCODE_MOV ||
!inst->saturate ||
inst->dst.file != VGRF ||
inst->dst.type != inst->src[0].type ||
inst->src[0].file != VGRF ||
inst->src[0].abs)
continue;
int src_var = live.var_from_reg(inst->src[0]);
int src_end_ip = live.end[src_var];
bool interfered = false;
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (scan_inst->exec_size == inst->exec_size &&
regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->src[0], inst->size_read(0))) {
if (scan_inst->is_partial_write() ||
(scan_inst->dst.type != inst->dst.type &&
!scan_inst->can_change_types()))
break;
if (scan_inst->saturate) {
inst->saturate = false;
progress = true;
} else if (src_end_ip == ip || inst->dst.equals(inst->src[0])) {
if (scan_inst->can_do_saturate()) {
if (scan_inst->dst.type != inst->dst.type) {
scan_inst->dst.type = inst->dst.type;
for (int i = 0; i < scan_inst->sources; i++) {
scan_inst->src[i].type = inst->dst.type;
}
}
if (inst->src[0].negate) {
if (scan_inst->opcode == BRW_OPCODE_MUL) {
scan_inst->src[0].negate = !scan_inst->src[0].negate;
inst->src[0].negate = false;
} else if (scan_inst->opcode == BRW_OPCODE_MAD) {
for (int i = 0; i < 2; i++) {
if (scan_inst->src[i].file == IMM) {
brw_negate_immediate(scan_inst->src[i].type,
&scan_inst->src[i].as_brw_reg());
} else {
scan_inst->src[i].negate = !scan_inst->src[i].negate;
}
}
inst->src[0].negate = false;
} else if (scan_inst->opcode == BRW_OPCODE_ADD) {
if (scan_inst->src[1].file == IMM) {
if (!brw_negate_immediate(scan_inst->src[1].type,
&scan_inst->src[1].as_brw_reg())) {
break;
}
} else {
scan_inst->src[1].negate = !scan_inst->src[1].negate;
}
scan_inst->src[0].negate = !scan_inst->src[0].negate;
inst->src[0].negate = false;
} else {
break;
}
}
scan_inst->saturate = true;
inst->saturate = false;
progress = true;
}
}
break;
}
for (int i = 0; i < scan_inst->sources; i++) {
if (scan_inst->src[i].file == VGRF &&
scan_inst->src[i].nr == inst->src[0].nr &&
regions_overlap(
scan_inst->src[i], scan_inst->size_read(i),
inst->src[0], inst->size_read(0))) {
if (scan_inst->opcode != BRW_OPCODE_MOV ||
!scan_inst->saturate ||
scan_inst->src[0].abs ||
scan_inst->src[0].negate ||
scan_inst->src[0].abs != inst->src[0].abs ||
scan_inst->src[0].negate != inst->src[0].negate) {
interfered = true;
break;
}
}
}
if (interfered)
break;
}
}
return progress;
}
bool
fs_visitor::opt_saturate_propagation()
{
const fs_live_variables &live = live_analysis.require();
bool progress = false;
foreach_block (block, cfg) {
progress = opt_saturate_propagation_local(live, block) || progress;
}
/* Live intervals are still valid. */
return progress;
}

File diff suppressed because it is too large


@@ -0,0 +1,229 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_cfg.h"
/** @file brw_fs_sel_peephole.cpp
*
* This file contains the opt_peephole_sel() optimization pass that replaces
* MOV instructions to the same destination in the "then" and "else" bodies of
* an if statement with SEL instructions.
*/
/* Four MOVs seems to be pretty typical, so I picked the next power of two in
* the hopes that it would handle almost anything possible in a single
* pass.
*/
#define MAX_MOVS 8 /**< The maximum number of MOVs to attempt to match. */
using namespace brw;
/**
* Scans forwards from an IF counting consecutive MOV instructions in the
* "then" and "else" blocks of the if statement.
*
* A pointer to the bblock_t following the IF is passed as the <then_block>
* argument. The function stores pointers to the MOV instructions in the
* <then_mov> and <else_mov> arrays.
*
* \return the minimum number of MOVs found in the two branches or zero if
* an error occurred.
*
* E.g.:
* IF ...
* then_mov[0] = MOV g4, ...
* then_mov[1] = MOV g5, ...
* then_mov[2] = MOV g6, ...
* ELSE ...
* else_mov[0] = MOV g4, ...
* else_mov[1] = MOV g5, ...
* else_mov[2] = MOV g7, ...
* ENDIF
* returns 3.
*/
static int
count_movs_from_if(const intel_device_info *devinfo,
fs_inst *then_mov[MAX_MOVS], fs_inst *else_mov[MAX_MOVS],
bblock_t *then_block, bblock_t *else_block)
{
int then_movs = 0;
foreach_inst_in_block(fs_inst, inst, then_block) {
if (then_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV ||
inst->flags_written(devinfo))
break;
then_mov[then_movs] = inst;
then_movs++;
}
int else_movs = 0;
foreach_inst_in_block(fs_inst, inst, else_block) {
if (else_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV ||
inst->flags_written(devinfo))
break;
else_mov[else_movs] = inst;
else_movs++;
}
return MIN2(then_movs, else_movs);
}
/**
* Try to replace IF/MOV+/ELSE/MOV+/ENDIF with SEL.
*
* Many GLSL shaders contain the following pattern:
*
* x = condition ? foo : bar
*
* or
*
* if (...) a.xyzw = foo.xyzw;
* else a.xyzw = bar.xyzw;
*
* The compiler emits an ir_if tree for this, since each subexpression might be
* a complex tree that could have side-effects or short-circuit logic.
*
* However, the common case is to simply select one of two constants or
* variable values---which is exactly what SEL is for. In this case, the
* assembly looks like:
*
* (+f0) IF
* MOV dst src0
* ...
* ELSE
* MOV dst src1
* ...
* ENDIF
*
* where each pair of MOVs to a common destination can be easily translated
* into
*
* (+f0) SEL dst src0 src1
*
* If src0 is an immediate value, we promote it to a temporary GRF.
*/
bool
fs_visitor::opt_peephole_sel()
{
bool progress = false;
foreach_block (block, cfg) {
/* IF instructions, by definition, can only be found at the ends of
* basic blocks.
*/
fs_inst *if_inst = (fs_inst *)block->end();
if (if_inst->opcode != BRW_OPCODE_IF)
continue;
fs_inst *else_mov[MAX_MOVS] = { NULL };
fs_inst *then_mov[MAX_MOVS] = { NULL };
bblock_t *then_block = block->next();
bblock_t *else_block = NULL;
foreach_list_typed(bblock_link, child, link, &block->children) {
if (child->block != then_block) {
if (child->block->prev()->end()->opcode == BRW_OPCODE_ELSE) {
else_block = child->block;
}
break;
}
}
if (else_block == NULL)
continue;
int movs = count_movs_from_if(devinfo, then_mov, else_mov, then_block, else_block);
if (movs == 0)
continue;
/* Generate SEL instructions for pairs of MOVs to a common destination. */
for (int i = 0; i < movs; i++) {
if (!then_mov[i] || !else_mov[i])
break;
/* Check that the MOVs are the right form. */
if (!then_mov[i]->dst.equals(else_mov[i]->dst) ||
then_mov[i]->exec_size != else_mov[i]->exec_size ||
then_mov[i]->group != else_mov[i]->group ||
then_mov[i]->force_writemask_all != else_mov[i]->force_writemask_all ||
then_mov[i]->is_partial_write() ||
else_mov[i]->is_partial_write() ||
then_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE ||
else_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE) {
movs = i;
break;
}
/* Check that source types for mov operations match. */
if (then_mov[i]->src[0].type != else_mov[i]->src[0].type) {
movs = i;
break;
}
}
if (movs == 0)
continue;
for (int i = 0; i < movs; i++) {
const fs_builder ibld = fs_builder(this, then_block, then_mov[i])
.at(block, if_inst);
if (then_mov[i]->src[0].equals(else_mov[i]->src[0])) {
ibld.MOV(then_mov[i]->dst, then_mov[i]->src[0]);
} else {
/* Only the last source register can be a constant, so if the MOV
* in the "then" clause uses a constant, we need to put it in a
* temporary.
*/
fs_reg src0(then_mov[i]->src[0]);
if (src0.file == IMM) {
src0 = ibld.vgrf(then_mov[i]->src[0].type);
ibld.MOV(src0, then_mov[i]->src[0]);
}
/* 64-bit immediates can't be placed in src1. */
fs_reg src1(else_mov[i]->src[0]);
if (src1.file == IMM && type_sz(src1.type) == 8) {
src1 = ibld.vgrf(else_mov[i]->src[0].type);
ibld.MOV(src1, else_mov[i]->src[0]);
}
set_predicate_inv(if_inst->predicate, if_inst->predicate_inverse,
ibld.SEL(then_mov[i]->dst, src0, src1));
}
then_mov[i]->remove(then_block);
else_mov[i]->remove(else_block);
}
progress = true;
}
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
return progress;
}


@@ -0,0 +1,605 @@
/*
* Copyright © 2006-2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_fs.h"
#include "brw_fs_builder.h"
using namespace brw;
vs_thread_payload::vs_thread_payload(const fs_visitor &v)
{
unsigned r = 0;
/* R0: Thread header. */
r += reg_unit(v.devinfo);
/* R1: URB handles. */
urb_handles = brw_ud8_grf(r, 0);
r += reg_unit(v.devinfo);
num_regs = r;
}
tcs_thread_payload::tcs_thread_payload(const fs_visitor &v)
{
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(v.prog_data);
struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) v.key;
if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH) {
patch_urb_output = brw_ud1_grf(0, 0);
primitive_id = brw_vec1_grf(0, 1);
/* r1-r4 contain the ICP handles. */
icp_handle_start = brw_ud8_grf(1, 0);
num_regs = 5;
} else {
assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
assert(tcs_key->input_vertices <= BRW_MAX_TCS_INPUT_VERTICES);
unsigned r = 0;
r += reg_unit(v.devinfo);
patch_urb_output = brw_ud8_grf(r, 0);
r += reg_unit(v.devinfo);
if (tcs_prog_data->include_primitive_id) {
primitive_id = brw_vec8_grf(r, 0);
r += reg_unit(v.devinfo);
}
/* ICP handles occupy the next 1-32 registers. */
icp_handle_start = brw_ud8_grf(r, 0);
r += brw_tcs_prog_key_input_vertices(tcs_key) * reg_unit(v.devinfo);
num_regs = r;
}
}
tes_thread_payload::tes_thread_payload(const fs_visitor &v)
{
unsigned r = 0;
/* R0: Thread Header. */
patch_urb_input = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
primitive_id = brw_vec1_grf(0, 1);
r += reg_unit(v.devinfo);
/* R1-3: gl_TessCoord.xyz. */
for (unsigned i = 0; i < 3; i++) {
coords[i] = brw_vec8_grf(r, 0);
r += reg_unit(v.devinfo);
}
/* R4: URB output handles. */
urb_output = brw_ud8_grf(r, 0);
r += reg_unit(v.devinfo);
num_regs = r;
}
gs_thread_payload::gs_thread_payload(fs_visitor &v)
{
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(v.prog_data);
const fs_builder bld = fs_builder(&v).at_end();
/* R0: thread header. */
unsigned r = reg_unit(v.devinfo);
/* R1: output URB handles. */
urb_handles = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.AND(urb_handles, brw_ud8_grf(r, 0),
v.devinfo->ver >= 20 ? brw_imm_ud(0xFFFFFF) : brw_imm_ud(0xFFFF));
/* R1: Instance ID stored in bits 31:27 */
instance_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.SHR(instance_id, brw_ud8_grf(r, 0), brw_imm_ud(27u));
r += reg_unit(v.devinfo);
if (gs_prog_data->include_primitive_id) {
primitive_id = brw_ud8_grf(r, 0);
r += reg_unit(v.devinfo);
}
/* Always enable VUE handles so we can safely use pull model if needed.
*
* The push model for a GS uses a ton of register space even for trivial
* scenarios with just a few inputs, so just make things easier and a bit
* safer by always having pull model available.
*/
gs_prog_data->base.include_vue_handles = true;
/* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
icp_handle_start = brw_ud8_grf(r, 0);
r += v.nir->info.gs.vertices_in * reg_unit(v.devinfo);
num_regs = r;
/* Use a maximum of 24 registers for push-model inputs. */
const unsigned max_push_components = 24;
/* If pushing our inputs would take too many registers, reduce the URB read
* length (which is in HWords, or 8 registers), and resort to pulling.
*
* Note that the GS reads <URB Read Length> HWords for every vertex - so we
* have to multiply by VerticesIn to obtain the total storage requirement.
*/
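/* E.g. with 3 incoming vertices and a read length of 2 HWords,
 * 8 * 2 * 3 = 48 > 24, so the read length is reduced to
 * ROUND_DOWN_TO(24 / 3, 8) / 8 = 1 HWord per vertex.
 */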
if (8 * vue_prog_data->urb_read_length * v.nir->info.gs.vertices_in >
max_push_components) {
vue_prog_data->urb_read_length =
ROUND_DOWN_TO(max_push_components / v.nir->info.gs.vertices_in, 8) / 8;
}
}
static inline void
setup_fs_payload_gfx20(fs_thread_payload &payload,
const fs_visitor &v,
bool &source_depth_to_render_target)
{
struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
const unsigned payload_width = 16;
assert(v.dispatch_width % payload_width == 0);
assert(v.devinfo->ver >= 20);
for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
/* R0-1: PS thread payload header, masks and pixel X/Y coordinates. */
payload.num_regs++;
payload.subspan_coord_reg[j] = payload.num_regs++;
}
for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
/* R2-13: Barycentric interpolation coordinates. These appear
* in the same order that they appear in the brw_barycentric_mode
* enum. Each set of coordinates occupies 2 64B registers per
* SIMD16 half. Coordinates only appear if they were enabled
* using the "Barycentric Interpolation Mode" bits in WM_STATE.
*/
for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
if (prog_data->barycentric_interp_modes & (1 << i)) {
payload.barycentric_coord_reg[i][j] = payload.num_regs;
payload.num_regs += payload_width / 4;
}
}
/* R14: Interpolated depth if "Pixel Shader Uses Source Depth" is set. */
if (prog_data->uses_src_depth) {
payload.source_depth_reg[j] = payload.num_regs;
payload.num_regs += payload_width / 8;
}
/* R15: Interpolated W if "Pixel Shader Uses Source W" is set. */
if (prog_data->uses_src_w) {
payload.source_w_reg[j] = payload.num_regs;
payload.num_regs += payload_width / 8;
}
/* R16: MSAA input coverage mask if "Pixel Shader Uses Input
* Coverage Mask" is set.
*/
if (prog_data->uses_sample_mask) {
payload.sample_mask_in_reg[j] = payload.num_regs;
payload.num_regs += payload_width / 8;
}
/* R19: MSAA position XY offsets if "Position XY Offset Select"
* is either POSOFFSET_CENTROID or POSOFFSET_SAMPLE. Note that
* this is delivered as a single SIMD32 vector, inconsistently
* with most other PS payload fields.
*/
if (prog_data->uses_pos_offset && j == 0) {
for (unsigned k = 0; k < 2; k++) {
payload.sample_pos_reg[k] = payload.num_regs;
payload.num_regs++;
}
}
}
if (prog_data->uses_depth_w_coefficients) {
assert(v.max_polygons == 1);
payload.depth_w_coef_reg = payload.num_regs;
payload.num_regs += 2;
}
if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
source_depth_to_render_target = true;
}
}
static inline void
setup_fs_payload_gfx6(fs_thread_payload &payload,
const fs_visitor &v,
bool &source_depth_to_render_target)
{
struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
const unsigned payload_width = MIN2(16, v.dispatch_width);
assert(v.dispatch_width % payload_width == 0);
assert(v.devinfo->ver >= 6 && v.devinfo->ver < 20);
payload.num_regs = 0;
/* R0: PS thread payload header. */
payload.num_regs++;
for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
/* R1: masks, pixel X/Y coordinates. */
payload.subspan_coord_reg[j] = payload.num_regs++;
}
for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
/* R3-26: barycentric interpolation coordinates. These appear in the
* same order that they appear in the brw_barycentric_mode enum. Each
* set of coordinates occupies 2 registers if dispatch width == 8 and 4
* registers if dispatch width == 16. Coordinates only appear if they
* were enabled using the "Barycentric Interpolation Mode" bits in
* WM_STATE.
*/
for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
if (prog_data->barycentric_interp_modes & (1 << i)) {
payload.barycentric_coord_reg[i][j] = payload.num_regs;
payload.num_regs += payload_width / 4;
}
}
/* R27-28: interpolated depth if uses source depth */
if (prog_data->uses_src_depth) {
payload.source_depth_reg[j] = payload.num_regs;
payload.num_regs += payload_width / 8;
}
/* R29-30: interpolated W set if GFX6_WM_USES_SOURCE_W. */
if (prog_data->uses_src_w) {
payload.source_w_reg[j] = payload.num_regs;
payload.num_regs += payload_width / 8;
}
/* R31: MSAA position offsets. */
if (prog_data->uses_pos_offset) {
payload.sample_pos_reg[j] = payload.num_regs;
payload.num_regs++;
}
/* R32-33: MSAA input coverage mask */
if (prog_data->uses_sample_mask) {
assert(v.devinfo->ver >= 7);
payload.sample_mask_in_reg[j] = payload.num_regs;
payload.num_regs += payload_width / 8;
}
}
/* R66: Source Depth and/or W Attribute Vertex Deltas */
if (prog_data->uses_depth_w_coefficients) {
assert(v.max_polygons == 1);
payload.depth_w_coef_reg = payload.num_regs;
payload.num_regs++;
}
if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
source_depth_to_render_target = true;
}
}
#undef P /* prompted depth */
#undef C /* computed */
#undef N /* non-promoted? */
#define P 0
#define C 1
#define N 2
static const struct {
GLuint mode:2;
GLuint sd_present:1;
GLuint sd_to_rt:1;
GLuint dd_present:1;
GLuint ds_present:1;
} wm_iz_table[BRW_WM_IZ_BIT_MAX] =
{
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 1 },
{ N, 0, 1, 0, 1 },
{ N, 0, 1, 0, 1 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 1 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ P, 0, 0, 0, 0 },
{ C, 0, 0, 0, 1 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 0, 1 },
{ P, 0, 0, 0, 0 },
{ C, 1, 1, 0, 1 },
{ C, 0, 1, 0, 1 },
{ C, 0, 1, 0, 1 },
{ P, 0, 0, 0, 0 },
{ C, 1, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ P, 0, 0, 0, 0 },
{ C, 1, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 }
};
/**
* \param line_aa BRW_NEVER, BRW_ALWAYS or BRW_SOMETIMES
* \param lookup bitmask of BRW_WM_IZ_* flags
*/
static inline void
setup_fs_payload_gfx4(fs_thread_payload &payload,
const fs_visitor &v,
bool &source_depth_to_render_target,
bool &runtime_check_aads_emit)
{
assert(v.dispatch_width <= 16);
struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
brw_wm_prog_key *key = (brw_wm_prog_key *) v.key;
GLuint reg = 1;
bool kill_stats_promoted_workaround = false;
int lookup = key->iz_lookup;
assert(lookup < BRW_WM_IZ_BIT_MAX);
/* Crazy workaround in the windowizer, which we need to track in
* our register allocation and render target writes. See the "If
* statistics are enabled..." paragraph of 11.5.3.2: Early Depth
* Test Cases [Pre-DevGT] of the 3D Pipeline - Windower B-Spec.
*/
if (key->stats_wm &&
(lookup & BRW_WM_IZ_PS_KILL_ALPHATEST_BIT) &&
wm_iz_table[lookup].mode == P) {
kill_stats_promoted_workaround = true;
}
payload.subspan_coord_reg[0] = reg++;
if (wm_iz_table[lookup].sd_present || prog_data->uses_src_depth ||
kill_stats_promoted_workaround) {
payload.source_depth_reg[0] = reg;
reg += 2;
}
if (wm_iz_table[lookup].sd_to_rt || kill_stats_promoted_workaround)
source_depth_to_render_target = true;
if (wm_iz_table[lookup].ds_present || key->line_aa != BRW_NEVER) {
payload.aa_dest_stencil_reg[0] = reg;
runtime_check_aads_emit =
!wm_iz_table[lookup].ds_present && key->line_aa == BRW_SOMETIMES;
reg++;
}
if (wm_iz_table[lookup].dd_present) {
payload.dest_depth_reg[0] = reg;
reg += 2;
}
payload.num_regs = reg;
}
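/* Worked example (illustrative): a lookup whose wm_iz_table row is
* { C, 0, 1, 1, 0 } selects computed depth, forwards source depth to the
* render target (sd_to_rt) and reserves a destination depth register
* (dd_present), but allocates no source depth or stencil payload registers
* unless prog_data->uses_src_depth asks for them.
*/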
#undef P /* promoted depth */
#undef C /* computed */
#undef N /* non-promoted? */
fs_thread_payload::fs_thread_payload(const fs_visitor &v,
bool &source_depth_to_render_target,
bool &runtime_check_aads_emit)
: subspan_coord_reg(),
source_depth_reg(),
source_w_reg(),
aa_dest_stencil_reg(),
dest_depth_reg(),
sample_pos_reg(),
sample_mask_in_reg(),
depth_w_coef_reg(),
barycentric_coord_reg()
{
if (v.devinfo->ver >= 20)
setup_fs_payload_gfx20(*this, v, source_depth_to_render_target);
else if (v.devinfo->ver >= 6)
setup_fs_payload_gfx6(*this, v, source_depth_to_render_target);
else
setup_fs_payload_gfx4(*this, v, source_depth_to_render_target,
runtime_check_aads_emit);
}
cs_thread_payload::cs_thread_payload(const fs_visitor &v)
{
struct brw_cs_prog_data *prog_data = brw_cs_prog_data(v.prog_data);
unsigned r = reg_unit(v.devinfo);
/* See nir_setup_uniforms for subgroup_id in earlier versions. */
if (v.devinfo->verx10 >= 125) {
subgroup_id_ = brw_ud1_grf(0, 2);
for (int i = 0; i < 3; i++) {
if (prog_data->generate_local_id & (1 << i)) {
local_invocation_id[i] = brw_uw8_grf(r, 0);
r += reg_unit(v.devinfo);
if (v.devinfo->ver < 20 && v.dispatch_width == 32)
r += reg_unit(v.devinfo);
} else {
local_invocation_id[i] = brw_imm_uw(0);
}
}
/* TODO: Fill out uses_btd_stack_ids automatically */
if (prog_data->uses_btd_stack_ids)
r += reg_unit(v.devinfo);
}
num_regs = r;
}
void
cs_thread_payload::load_subgroup_id(const fs_builder &bld,
fs_reg &dest) const
{
auto devinfo = bld.shader->devinfo;
dest = retype(dest, BRW_REGISTER_TYPE_UD);
if (subgroup_id_.file != BAD_FILE) {
assert(devinfo->verx10 >= 125);
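/* The subgroup index lives in the low byte of r0.2; INTEL_MASK(7, 0) masks off the rest. */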
bld.AND(dest, subgroup_id_, brw_imm_ud(INTEL_MASK(7, 0)));
} else {
assert(devinfo->verx10 < 125);
assert(gl_shader_stage_is_compute(bld.shader->stage));
int index = brw_get_subgroup_id_param_index(devinfo,
bld.shader->stage_prog_data);
bld.MOV(dest, fs_reg(UNIFORM, index, BRW_REGISTER_TYPE_UD));
}
}
task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
: cs_thread_payload(v)
{
/* Task and Mesh Shader Payloads (SIMD8 and SIMD16)
*
* R0: Header
* R1: Local_ID.X[0-7 or 0-15]
* R2: Inline Parameter
*
* Task and Mesh Shader Payloads (SIMD32)
*
* R0: Header
* R1: Local_ID.X[0-15]
* R2: Local_ID.X[16-31]
* R3: Inline Parameter
*
* Local_ID.X values are 16 bits.
*
* Inline parameter is optional but always present since we use it to pass
* the address to descriptors.
*/
const fs_builder bld = fs_builder(&v).at_end();
unsigned r = 0;
assert(subgroup_id_.file != BAD_FILE);
extended_parameter_0 = retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);
if (v.devinfo->ver >= 20) {
urb_output = brw_ud1_grf(1, 0);
} else {
urb_output = bld.vgrf(BRW_REGISTER_TYPE_UD);
/* In both mesh and task shader payloads, the lower 16 bits of g0.6 are
* an offset within the Slice's Local URB, which says where the shader is
* supposed to output its data.
*/
bld.AND(urb_output, brw_ud1_grf(0, 6), brw_imm_ud(0xFFFF));
}
if (v.stage == MESA_SHADER_MESH) {
/* g0.7 is Task Shader URB Entry Offset, which contains both an offset
* within the Slice's Local URB (bits 0:15) and a slice selector
* (bits 16:24). The slice selector can be non-zero when the mesh shader
* is spawned on a slice other than the one where the task shader ran.
* Bit 24 says that a Slice ID is present, and bits 16:23 are the Slice ID.
*/
task_urb_input = brw_ud1_grf(0, 7);
}
r += reg_unit(v.devinfo);
local_index = brw_uw8_grf(r, 0);
r += reg_unit(v.devinfo);
if (v.devinfo->ver < 20 && v.dispatch_width == 32)
r += reg_unit(v.devinfo);
inline_parameter = brw_ud1_grf(r, 0);
r += reg_unit(v.devinfo);
num_regs = r;
}
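/* Worked example (illustrative), pre-Xe2 where reg_unit() == 1: in SIMD16
* the header takes r0 and Local_ID.X takes r1, so the inline parameter
* lands in r2 and num_regs == 3. In SIMD32 the second Local_ID.X register
* pushes the inline parameter to r3 and num_regs == 4, matching the layout
* comment above.
*/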
bs_thread_payload::bs_thread_payload(const fs_visitor &v)
{
unsigned r = 0;
/* R0: Thread header. */
r += reg_unit(v.devinfo);
/* R1: Stack IDs. */
r += reg_unit(v.devinfo);
/* R2: Inline Parameter. Used for argument addresses. */
global_arg_ptr = brw_ud1_grf(r, 0);
local_arg_ptr = brw_ud1_grf(r, 2);
r += reg_unit(v.devinfo);
num_regs = r;
}
void
bs_thread_payload::load_shader_type(const fs_builder &bld, fs_reg &dest) const
{
fs_reg ud_dest = retype(dest, BRW_REGISTER_TYPE_UD);
bld.MOV(ud_dest, retype(brw_vec1_grf(0, 3), ud_dest.type));
bld.AND(ud_dest, ud_dest, brw_imm_ud(0xf));
}


@ -0,0 +1,199 @@
/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/** @file brw_fs_validate.cpp
*
* Implements a pass that validates various invariants of the IR. The current
* pass only validates that GRF uses are sane. More can be added later.
*/
#include "brw_fs.h"
#include "brw_cfg.h"
#define fsv_assert(assertion) \
{ \
if (!(assertion)) { \
fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", \
_mesa_shader_stage_to_abbrev(stage)); \
dump_instruction(inst, stderr); \
fprintf(stderr, "%s:%d: '%s' failed\n", __FILE__, __LINE__, #assertion); \
abort(); \
} \
}
#define fsv_assert_eq(first, second) \
{ \
unsigned f = (first); \
unsigned s = (second); \
if (f != s) { \
fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", \
_mesa_shader_stage_to_abbrev(stage)); \
dump_instruction(inst, stderr); \
fprintf(stderr, "%s:%d: A == B failed\n", __FILE__, __LINE__); \
fprintf(stderr, " A = %s = %u\n", #first, f); \
fprintf(stderr, " B = %s = %u\n", #second, s); \
abort(); \
} \
}
#define fsv_assert_ne(first, second) \
{ \
unsigned f = (first); \
unsigned s = (second); \
if (f == s) { \
fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", \
_mesa_shader_stage_to_abbrev(stage)); \
dump_instruction(inst, stderr); \
fprintf(stderr, "%s:%d: A != B failed\n", __FILE__, __LINE__); \
fprintf(stderr, " A = %s = %u\n", #first, f); \
fprintf(stderr, " B = %s = %u\n", #second, s); \
abort(); \
} \
}
#define fsv_assert_lte(first, second) \
{ \
unsigned f = (first); \
unsigned s = (second); \
if (f > s) { \
fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", \
_mesa_shader_stage_to_abbrev(stage)); \
dump_instruction(inst, stderr); \
fprintf(stderr, "%s:%d: A <= B failed\n", __FILE__, __LINE__); \
fprintf(stderr, " A = %s = %u\n", #first, f); \
fprintf(stderr, " B = %s = %u\n", #second, s); \
abort(); \
} \
}
#ifndef NDEBUG
void
fs_visitor::validate()
{
cfg->validate(_mesa_shader_stage_to_abbrev(stage));
foreach_block_and_inst (block, fs_inst, inst, cfg) {
switch (inst->opcode) {
case SHADER_OPCODE_SEND:
fsv_assert(is_uniform(inst->src[0]) && is_uniform(inst->src[1]));
break;
case BRW_OPCODE_MOV:
fsv_assert(inst->sources == 1);
break;
default:
break;
}
if (inst->is_3src(compiler)) {
const unsigned integer_sources =
brw_reg_type_is_integer(inst->src[0].type) +
brw_reg_type_is_integer(inst->src[1].type) +
brw_reg_type_is_integer(inst->src[2].type);
const unsigned float_sources =
brw_reg_type_is_floating_point(inst->src[0].type) +
brw_reg_type_is_floating_point(inst->src[1].type) +
brw_reg_type_is_floating_point(inst->src[2].type);
fsv_assert((integer_sources == 3 && float_sources == 0) ||
(integer_sources == 0 && float_sources == 3));
if (devinfo->ver >= 10) {
for (unsigned i = 0; i < 3; i++) {
if (inst->src[i].file == BRW_IMMEDIATE_VALUE)
continue;
switch (inst->src[i].vstride) {
case BRW_VERTICAL_STRIDE_0:
case BRW_VERTICAL_STRIDE_4:
case BRW_VERTICAL_STRIDE_8:
case BRW_VERTICAL_STRIDE_16:
break;
case BRW_VERTICAL_STRIDE_1:
fsv_assert_lte(12, devinfo->ver);
break;
case BRW_VERTICAL_STRIDE_2:
fsv_assert_lte(devinfo->ver, 11);
break;
default:
fsv_assert(!"invalid vstride");
break;
}
}
} else if (grf_used != 0) {
/* Only perform the pre-Gfx10 checks after register allocation has
* occurred.
*
* Many passes (e.g., constant copy propagation) will generate
* invalid 3-source instructions with the expectation that later
* passes (e.g., combine constants) will fix them.
*/
for (unsigned i = 0; i < 3; i++) {
fsv_assert_ne(inst->src[i].file, BRW_IMMEDIATE_VALUE);
/* A stride of 1 (the usual case) or 0, with a special
* "repctrl" bit, is allowed. The repctrl bit doesn't work for
* 64-bit datatypes, so if the source type is 64-bit then only
* a stride of 1 is allowed. From the Broadwell PRM, Volume 7
* "3D Media GPGPU", page 944:
*
* This is applicable to 32b datatypes and 16b datatype. 64b
* datatypes cannot use the replicate control.
*/
fsv_assert_lte(inst->src[i].vstride, 1);
if (type_sz(inst->src[i].type) > 4)
fsv_assert_eq(inst->src[i].vstride, 1);
}
}
}
if (inst->dst.file == VGRF) {
fsv_assert_lte(inst->dst.offset / REG_SIZE + regs_written(inst),
alloc.sizes[inst->dst.nr]);
}
for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF) {
fsv_assert_lte(inst->src[i].offset / REG_SIZE + regs_read(inst, i),
alloc.sizes[inst->src[i].nr]);
}
}
/* Accumulator Registers, bspec 47251:
*
* "When destination is accumulator with offset 0, destination
* horizontal stride must be 1."
*/
if (intel_needs_workaround(devinfo, 14014617373) &&
inst->dst.is_accumulator() &&
inst->dst.offset == 0) {
fsv_assert_eq(inst->dst.stride, 1);
}
}
}
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -0,0 +1,108 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_compiler.h"
#include "compiler/nir/nir.h"
static char const *get_qual_name(int mode)
{
switch (mode) {
case INTERP_MODE_NONE: return "none";
case INTERP_MODE_FLAT: return "flat";
case INTERP_MODE_SMOOTH: return "smooth";
case INTERP_MODE_NOPERSPECTIVE: return "nopersp";
default: return "???";
}
}
static void
gfx4_frag_prog_set_interp_modes(struct brw_wm_prog_data *prog_data,
const struct intel_vue_map *vue_map,
unsigned location, unsigned slot_count,
enum glsl_interp_mode interp)
{
for (unsigned k = 0; k < slot_count; k++) {
int slot = vue_map->varying_to_slot[location + k];
if (slot != -1 && prog_data->interp_mode[slot] == INTERP_MODE_NONE) {
prog_data->interp_mode[slot] = interp;
if (prog_data->interp_mode[slot] == INTERP_MODE_FLAT) {
prog_data->contains_flat_varying = true;
} else if (prog_data->interp_mode[slot] == INTERP_MODE_NOPERSPECTIVE) {
prog_data->contains_noperspective_varying = true;
}
}
}
}
/* Set up interpolation modes for every element in the VUE */
void
brw_setup_vue_interpolation(const struct intel_vue_map *vue_map, nir_shader *nir,
struct brw_wm_prog_data *prog_data)
{
/* Initialise interp_mode. INTERP_MODE_NONE == 0 */
memset(prog_data->interp_mode, 0, sizeof(prog_data->interp_mode));
if (!vue_map)
return;
/* HPOS always wants noperspective. Setting it up here allows
* us to avoid special handling in the SF program.
*/
int pos_slot = vue_map->varying_to_slot[VARYING_SLOT_POS];
if (pos_slot != -1) {
prog_data->interp_mode[pos_slot] = INTERP_MODE_NOPERSPECTIVE;
prog_data->contains_noperspective_varying = true;
}
nir_foreach_shader_in_variable(var, nir) {
unsigned location = var->data.location;
unsigned slot_count = glsl_count_attribute_slots(var->type, false);
gfx4_frag_prog_set_interp_modes(prog_data, vue_map, location, slot_count,
var->data.interpolation);
if (location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1) {
location = location + VARYING_SLOT_BFC0 - VARYING_SLOT_COL0;
gfx4_frag_prog_set_interp_modes(prog_data, vue_map, location,
slot_count, var->data.interpolation);
}
}
const bool debug = false;
if (debug) {
fprintf(stderr, "VUE map:\n");
for (int i = 0; i < vue_map->num_slots; i++) {
int varying = vue_map->slot_to_varying[i];
if (varying == -1) {
fprintf(stderr, "%d: --\n", i);
continue;
}
fprintf(stderr, "%d: %d %s ofs %d\n",
i, varying,
get_qual_name(prog_data->interp_mode[i]),
brw_vue_slot_to_offset(i));
}
}
}


@ -0,0 +1,216 @@
/* -*- c++ -*- */
/*
* Copyright © 2010-2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_IR_H
#define BRW_IR_H
#include <assert.h>
#include "brw_reg.h"
#include "compiler/glsl/list.h"
#define MAX_SAMPLER_MESSAGE_SIZE 11
/* The sampler can return a vec5 when sampling with sparse residency. In
* SIMD32, each component takes up 4 GRFs, so we need to allow up to size-20
* VGRFs to hold the result.
*/
#define MAX_VGRF_SIZE(devinfo) ((devinfo)->ver >= 20 ? 40 : 20)
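/* Sanity sketch (illustrative) for the pre-Xe2 value: one SIMD32 component
* is 32 lanes * 4 bytes = 4 32-byte GRFs, so a vec5 result needs 5 * 4
* registers.
*/
static_assert(5 * (32 * 4 / 32) == 20, "SIMD32 vec5 result fits in 20 GRFs");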
#ifdef __cplusplus
struct backend_reg : private brw_reg
{
backend_reg() {}
backend_reg(const struct brw_reg &reg) : brw_reg(reg), offset(0) {}
const brw_reg &as_brw_reg() const
{
assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM);
assert(offset == 0);
return static_cast<const brw_reg &>(*this);
}
brw_reg &as_brw_reg()
{
assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM);
assert(offset == 0);
return static_cast<brw_reg &>(*this);
}
bool equals(const backend_reg &r) const;
bool negative_equals(const backend_reg &r) const;
bool is_zero() const;
bool is_one() const;
bool is_negative_one() const;
bool is_null() const;
bool is_accumulator() const;
/** Offset from the start of the (virtual) register in bytes. */
uint16_t offset;
using brw_reg::type;
using brw_reg::file;
using brw_reg::negate;
using brw_reg::abs;
using brw_reg::address_mode;
using brw_reg::subnr;
using brw_reg::nr;
using brw_reg::swizzle;
using brw_reg::writemask;
using brw_reg::indirect_offset;
using brw_reg::vstride;
using brw_reg::width;
using brw_reg::hstride;
using brw_reg::df;
using brw_reg::f;
using brw_reg::d;
using brw_reg::ud;
using brw_reg::d64;
using brw_reg::u64;
};
struct bblock_t;
struct backend_instruction : public exec_node {
bool is_3src(const struct brw_compiler *compiler) const;
bool is_math() const;
bool is_control_flow_begin() const;
bool is_control_flow_end() const;
bool is_control_flow() const;
bool is_commutative() const;
bool can_do_source_mods() const;
bool can_do_saturate() const;
bool can_do_cmod() const;
bool reads_accumulator_implicitly() const;
bool writes_accumulator_implicitly(const struct intel_device_info *devinfo) const;
/**
* Instructions that use indirect addressing have additional register
* regioning restrictions.
*/
bool uses_indirect_addressing() const;
void remove(bblock_t *block, bool defer_later_block_ip_updates = false);
void insert_after(bblock_t *block, backend_instruction *inst);
void insert_before(bblock_t *block, backend_instruction *inst);
/**
* True if the instruction has side effects other than writing to
* its destination registers. You are expected not to reorder or
* optimize these out unless you know what you are doing.
*/
bool has_side_effects() const;
/**
* True if the instruction might be affected by side effects of other
* instructions.
*/
bool is_volatile() const;
#else
struct backend_instruction {
struct exec_node link;
#endif
/** @{
* Annotation for the generated IR. One of the two can be set.
*/
const void *ir;
const char *annotation;
/** @} */
/**
* Execution size of the instruction. This is used by the generator to
* generate the correct binary for the given instruction. Current valid
* values are 1, 4, 8, 16, 32.
*/
uint8_t exec_size;
/**
* Channel group from the hardware execution and predication mask that
* should be applied to the instruction. The subset of channel enable
* signals (calculated from the EU control flow and predication state)
* given by [group, group + exec_size) will be used to mask GRF writes and
* any other side effects of the instruction.
*/
uint8_t group;
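/* E.g. (illustrative): exec_size == 8 with group == 8 in a SIMD16 program
* applies the instruction to channels [8, 16) only.
*/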
uint32_t offset; /**< spill/unspill offset or texture offset bitfield */
uint8_t mlen; /**< SEND message length */
uint8_t ex_mlen; /**< SENDS extended message length */
int8_t base_mrf; /**< First MRF in the SEND message, if mlen is nonzero. */
uint8_t target; /**< MRT target. */
uint8_t sfid; /**< SFID for SEND instructions */
uint32_t desc; /**< SEND[S] message descriptor immediate */
uint32_t ex_desc; /**< SEND[S] extended message descriptor immediate */
unsigned size_written; /**< Data written to the destination register in bytes. */
enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */
enum brw_conditional_mod conditional_mod; /**< BRW_CONDITIONAL_* */
enum brw_predicate predicate;
bool predicate_inverse:1;
bool writes_accumulator:1; /**< instruction implicitly writes accumulator */
bool force_writemask_all:1;
bool no_dd_clear:1;
bool no_dd_check:1;
bool saturate:1;
bool shadow_compare:1;
bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */
bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */
bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */
bool send_ex_desc_scratch:1; /**< Only valid for SHADER_OPCODE_SEND, use
* the scratch surface offset to build
* extended descriptor
*/
bool send_ex_bso:1; /**< Only for SHADER_OPCODE_SEND, use extended bindless
* surface offset (26bits instead of 20bits)
*/
bool predicate_trivial:1; /**< The predication mask applied to this
* instruction is guaranteed to be uniform and
* a superset of the execution mask of the
* present block, no currently enabled channels
* will be disabled by the predicate.
*/
bool eot:1;
/* Chooses which flag subregister (f0.0 to f3.1) is used for conditional
* mod and predication.
*/
unsigned flag_subreg:3;
/**
* Systolic depth used by DPAS instruction.
*/
unsigned sdepth:4;
/**
* Repeat count used by DPAS instruction.
*/
unsigned rcount:4;
/** The number of hardware registers used for a message header. */
uint8_t header_size;
};
#endif


@ -0,0 +1,92 @@
/* -*- c++ -*- */
/*
* Copyright © 2010-2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_IR_ALLOCATOR_H
#define BRW_IR_ALLOCATOR_H
#include "util/compiler.h"
#include "util/glheader.h"
#include "util/macros.h"
#include "util/rounding.h"
#include "util/u_math.h"
namespace brw {
/**
* Simple allocator used to keep track of virtual GRFs.
*/
class simple_allocator {
public:
simple_allocator() :
sizes(NULL), offsets(NULL), count(0), total_size(0), capacity(0)
{
}
~simple_allocator()
{
free(offsets);
free(sizes);
}
unsigned
allocate(unsigned size)
{
assert(size > 0);
if (capacity <= count) {
capacity = MAX2(16, capacity * 2);
sizes = (unsigned *)realloc(sizes, capacity * sizeof(unsigned));
offsets = (unsigned *)realloc(offsets, capacity * sizeof(unsigned));
}
sizes[count] = size;
offsets[count] = total_size;
total_size += size;
return count++;
}
/**
* Array of sizes for each allocation. The allocation unit is up to the
* back-end, but it's expected to be one scalar value in the FS back-end
* and one vec4 in the VEC4 back-end.
*/
unsigned *sizes;
/**
* Array of offsets from the start of the VGRF space in allocation
* units.
*/
unsigned *offsets;
/** Total number of VGRFs allocated. */
unsigned count;
/** Cumulative size in allocation units. */
unsigned total_size;
private:
unsigned capacity;
};
}
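/* Usage sketch (illustrative):
*
*    brw::simple_allocator alloc;
*    unsigned a = alloc.allocate(2);   // a == 0, offsets[a] == 0
*    unsigned b = alloc.allocate(1);   // b == 1, offsets[b] == 2
*    unsigned c = alloc.allocate(4);   // c == 2, offsets[c] == 3
*
* leaving count == 3 and total_size == 7 allocation units.
*/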
#endif


@ -0,0 +1,192 @@
/* -*- c++ -*- */
/*
* Copyright © 2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_IR_ANALYSIS_H
#define BRW_IR_ANALYSIS_H
namespace brw {
/**
* Bitset of state categories that can influence the result of IR analysis
* passes.
*/
enum analysis_dependency_class {
/**
* The analysis doesn't depend on the IR, its result is effectively a
* constant during the compilation.
*/
DEPENDENCY_NOTHING = 0,
/**
* The analysis depends on the set of instructions in the program and
* their naming. Note that because instructions are named sequentially
* by IP this implies a dependency on the control flow edges between
* instructions. This will be signaled whenever instructions are
* inserted, removed or reordered in the program.
*/
DEPENDENCY_INSTRUCTION_IDENTITY = 0x1,
/**
* The analysis is sensitive to the detailed semantics of instructions
* in the program, where "detailed" means any change in the instruction
* data structures other than the linked-list pointers (which are
* already covered by DEPENDENCY_INSTRUCTION_IDENTITY). E.g. changing
* the negate or abs flags of an instruction source would signal this
* flag alone because it would preserve all other instruction dependency
* classes.
*/
DEPENDENCY_INSTRUCTION_DETAIL = 0x2,
/**
* The analysis depends on the set of data flow edges between
* instructions. This will be signaled whenever the dataflow relation
* between instructions has potentially changed, e.g. when the VGRF
* index of an instruction source or destination changes (in which case
* it will appear in combination with DEPENDENCY_INSTRUCTION_DETAIL), or
* when data-dependent instructions are reordered (in which case it will
* appear in combination with DEPENDENCY_INSTRUCTION_IDENTITY).
*/
DEPENDENCY_INSTRUCTION_DATA_FLOW = 0x4,
/**
* The analysis depends on all instruction dependency classes. These
* will typically be signaled simultaneously when inserting or removing
* instructions in the program (or if you're feeling too lazy to read
* through your optimization pass to figure out which of the instruction
* dependency classes above it invalidates).
*/
DEPENDENCY_INSTRUCTIONS = 0x7,
/**
* The analysis depends on the set of VGRFs in the program and their
* naming. This will be signaled when VGRFs are allocated or released.
*/
DEPENDENCY_VARIABLES = 0x8,
/**
* The analysis depends on the set of basic blocks in the program, their
* control flow edges and naming.
*/
DEPENDENCY_BLOCKS = 0x10,
/**
* The analysis depends on the program being literally the same (good
* luck...), any change in the input invalidates previous analysis
* computations.
*/
DEPENDENCY_EVERYTHING = ~0
};
inline analysis_dependency_class
operator|(analysis_dependency_class x, analysis_dependency_class y)
{
return static_cast<analysis_dependency_class>(
static_cast<unsigned>(x) | static_cast<unsigned>(y));
}
}
/**
* Instantiate a program analysis class \p L which can calculate an object of
* type \p T as result. \p C is a closure that encapsulates whatever
* information is required as argument to run the analysis pass. The purpose
* of this class is to make sure that:
*
* - The analysis pass is executed lazily whenever it's needed and multiple
* executions are optimized out as long as the cached result remains marked
* up-to-date.
*
* - There is no way to access the cached analysis result without first
* calling L::require(), which makes sure that the analysis pass is rerun
* if necessary.
*
* - The cached result doesn't become inconsistent with the program for as
* long as it remains marked up-to-date. (This is only enforced in debug
* builds for performance reasons)
*
* The requirements on \p T are the following:
*
* - Constructible with a single argument, as in 'x = T(c)' for \p c of type
* \p C.
*
* - 'x.dependency_class()' on const \p x returns a bitset of
* brw::analysis_dependency_class specifying the set of IR objects that are
* required to remain invariant for the cached analysis result to be
* considered valid.
*
* - 'x.validate(c)' on const \p x returns a boolean result specifying
* whether the analysis result \p x is consistent with the input IR. This
* is currently only used for validation in debug builds.
*/
template<class T, class C>
class brw_analysis {
public:
/**
* Construct a program analysis. \p c is an arbitrary object
* passed as argument to the constructor of the analysis result
* object of type \p T.
*/
brw_analysis(const C *c) : c(c), p(NULL) {}
/**
* Destroy a program analysis.
*/
~brw_analysis()
{
delete p;
}
/**
* Obtain the result of a program analysis. This gives a
* guaranteed up-to-date result, the analysis pass will be
* rerun implicitly if it has become stale.
*/
T &
require()
{
if (p)
assert(p->validate(c));
else
p = new T(c);
return *p;
}
const T &
require() const
{
return const_cast<brw_analysis<T, C> *>(this)->require();
}
/**
* Report that dependencies of the analysis pass may have changed
* since the last calculation and the cached analysis result may
* have to be discarded.
*/
void
invalidate(brw::analysis_dependency_class c)
{
if (p && (c & p->dependency_class())) {
delete p;
p = NULL;
}
}
private:
const C *c;
T *p;
};
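/* Usage sketch (hypothetical names, shown only for illustration): the
* minimal shape of an analysis result type T usable with brw_analysis<T, C>.
*/
struct example_ir;   /* hypothetical IR container standing in for C */
struct example_count_analysis {
example_count_analysis(const example_ir *) {}
brw::analysis_dependency_class
dependency_class() const
{
return brw::DEPENDENCY_INSTRUCTIONS | brw::DEPENDENCY_BLOCKS;
}
bool validate(const example_ir *) const { return true; }
};
/* A pass holding 'brw_analysis<example_count_analysis, example_ir> counts'
* would call counts.require() to get an up-to-date result and
* counts.invalidate(brw::DEPENDENCY_INSTRUCTIONS) after rewriting the IR.
*/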
#endif


@ -0,0 +1,737 @@
/* -*- c++ -*- */
/*
* Copyright © 2010-2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_IR_FS_H
#define BRW_IR_FS_H
#include "brw_shader.h"
class fs_inst;
class fs_reg : public backend_reg {
public:
DECLARE_RALLOC_CXX_OPERATORS(fs_reg)
void init();
fs_reg();
fs_reg(struct ::brw_reg reg);
fs_reg(enum brw_reg_file file, unsigned nr);
fs_reg(enum brw_reg_file file, unsigned nr, enum brw_reg_type type);
bool equals(const fs_reg &r) const;
bool negative_equals(const fs_reg &r) const;
bool is_contiguous() const;
/**
* Return the size in bytes of a single logical component of the
* register assuming the given execution width.
*/
unsigned component_size(unsigned width) const;
/** Register region horizontal stride */
uint8_t stride;
};
static inline fs_reg
negate(fs_reg reg)
{
assert(reg.file != IMM);
reg.negate = !reg.negate;
return reg;
}
static inline fs_reg
retype(fs_reg reg, enum brw_reg_type type)
{
reg.type = type;
return reg;
}
static inline fs_reg
byte_offset(fs_reg reg, unsigned delta)
{
switch (reg.file) {
case BAD_FILE:
break;
case VGRF:
case ATTR:
case UNIFORM:
reg.offset += delta;
break;
case MRF: {
const unsigned suboffset = reg.offset + delta;
reg.nr += suboffset / REG_SIZE;
reg.offset = suboffset % REG_SIZE;
break;
}
case ARF:
case FIXED_GRF: {
const unsigned suboffset = reg.subnr + delta;
reg.nr += suboffset / REG_SIZE;
reg.subnr = suboffset % REG_SIZE;
break;
}
case IMM:
default:
assert(delta == 0);
}
return reg;
}
static inline fs_reg
horiz_offset(const fs_reg &reg, unsigned delta)
{
switch (reg.file) {
case BAD_FILE:
case UNIFORM:
case IMM:
/* These only have a single component that is implicitly splatted. A
* horizontal offset should be a harmless no-op.
* XXX - Handle vector immediates correctly.
*/
return reg;
case VGRF:
case MRF:
case ATTR:
return byte_offset(reg, delta * reg.stride * type_sz(reg.type));
case ARF:
case FIXED_GRF:
if (reg.is_null()) {
return reg;
} else {
const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
const unsigned width = 1 << reg.width;
if (delta % width == 0) {
return byte_offset(reg, delta / width * vstride * type_sz(reg.type));
} else {
assert(vstride == hstride * width);
return byte_offset(reg, delta * hstride * type_sz(reg.type));
}
}
}
unreachable("Invalid register file");
}
static inline fs_reg
offset(fs_reg reg, unsigned width, unsigned delta)
{
switch (reg.file) {
case BAD_FILE:
break;
case ARF:
case FIXED_GRF:
case MRF:
case VGRF:
case ATTR:
case UNIFORM:
return byte_offset(reg, delta * reg.component_size(width));
case IMM:
assert(delta == 0);
}
return reg;
}
/**
* Get the scalar channel of \p reg given by \p idx and replicate it to all
* channels of the result.
*/
static inline fs_reg
component(fs_reg reg, unsigned idx)
{
reg = horiz_offset(reg, idx);
reg.stride = 0;
if (reg.file == ARF || reg.file == FIXED_GRF) {
reg.vstride = BRW_VERTICAL_STRIDE_0;
reg.width = BRW_WIDTH_1;
reg.hstride = BRW_HORIZONTAL_STRIDE_0;
}
return reg;
}
/**
* Return an integer identifying the discrete address space a register is
* contained in. A register is by definition fully contained in the single
* reg_space it belongs to, so two registers with different reg_space ids are
* guaranteed not to overlap. Most register files are a single reg_space of
* its own, only the VGRF and ATTR files are composed of multiple discrete
* address spaces, one for each allocation and input attribute respectively.
*/
static inline uint32_t
reg_space(const fs_reg &r)
{
return r.file << 16 | (r.file == VGRF || r.file == ATTR ? r.nr : 0);
}
/**
* Return the base offset in bytes of a register relative to the start of its
* reg_space().
*/
static inline unsigned
reg_offset(const fs_reg &r)
{
return (r.file == VGRF || r.file == IMM || r.file == ATTR ? 0 : r.nr) *
(r.file == UNIFORM ? 4 : REG_SIZE) + r.offset +
(r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
}
/**
* Return the amount of padding in bytes left unused between individual
* components of register \p r due to a (horizontal) stride value greater than
* one, or zero if components are tightly packed in the register file.
*/
static inline unsigned
reg_padding(const fs_reg &r)
{
const unsigned stride = ((r.file != ARF && r.file != FIXED_GRF) ? r.stride :
r.hstride == 0 ? 0 :
1 << (r.hstride - 1));
return (MAX2(1, stride) - 1) * type_sz(r.type);
}
/* Do not call this directly. Call regions_overlap() instead. */
static inline bool
regions_overlap_MRF(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
{
if (r.nr & BRW_MRF_COMPR4) {
fs_reg t = r;
t.nr &= ~BRW_MRF_COMPR4;
/* COMPR4 regions are translated by the hardware during decompression
* into two separate half-regions 4 MRFs apart from each other.
*
* Note: swapping s and t in this parameter list eliminates one possible
* level of recursion (since the s in the called versions of
* regions_overlap_MRF can't be COMPR4), and that makes the compiled
* code a lot smaller.
*/
return regions_overlap_MRF(s, ds, t, dr / 2) ||
regions_overlap_MRF(s, ds, byte_offset(t, 4 * REG_SIZE), dr / 2);
} else if (s.nr & BRW_MRF_COMPR4) {
return regions_overlap_MRF(s, ds, r, dr);
}
return !((r.nr * REG_SIZE + r.offset + dr) <= (s.nr * REG_SIZE + s.offset) ||
(s.nr * REG_SIZE + s.offset + ds) <= (r.nr * REG_SIZE + r.offset));
}
/**
* Return whether the register region starting at \p r and spanning \p dr
* bytes could potentially overlap the register region starting at \p s and
* spanning \p ds bytes.
*/
static inline bool
regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
{
if (r.file != s.file)
return false;
if (r.file == VGRF) {
return r.nr == s.nr &&
!(r.offset + dr <= s.offset || s.offset + ds <= r.offset);
} else if (r.file != MRF) {
return !(reg_offset(r) + dr <= reg_offset(s) ||
reg_offset(s) + ds <= reg_offset(r));
} else {
return regions_overlap_MRF(r, dr, s, ds);
}
}
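/* E.g. (illustrative): within the same VGRF, byte ranges [0, 32) and
* [16, 48) overlap, while [0, 32) and [32, 64) do not.
*/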
/**
* Check that the register region given by r [r.offset, r.offset + dr[
* is fully contained inside the register region given by s
* [s.offset, s.offset + ds[.
*/
static inline bool
region_contained_in(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
{
return reg_space(r) == reg_space(s) &&
reg_offset(r) >= reg_offset(s) &&
reg_offset(r) + dr <= reg_offset(s) + ds;
}
/**
* Return whether the given register region is n-periodic, i.e. whether the
* original region remains invariant after shifting it by \p n scalar
* channels.
*/
static inline bool
is_periodic(const fs_reg &reg, unsigned n)
{
if (reg.file == BAD_FILE || reg.is_null()) {
return true;
} else if (reg.file == IMM) {
const unsigned period = (reg.type == BRW_REGISTER_TYPE_UV ||
reg.type == BRW_REGISTER_TYPE_V ? 8 :
reg.type == BRW_REGISTER_TYPE_VF ? 4 :
1);
return n % period == 0;
} else if (reg.file == ARF || reg.file == FIXED_GRF) {
const unsigned period = (reg.hstride == 0 && reg.vstride == 0 ? 1 :
reg.vstride == 0 ? 1 << reg.width :
~0);
return n % period == 0;
} else {
return reg.stride == 0;
}
}
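/* E.g. (illustrative): a stride-0 VGRF region is periodic for every n,
* while a packed vector-float immediate (4 floats) is only 4-periodic.
*/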
static inline bool
is_uniform(const fs_reg &reg)
{
return is_periodic(reg, 1);
}
/**
* Get the specified 8-component quarter of a register.
*/
static inline fs_reg
quarter(const fs_reg &reg, unsigned idx)
{
assert(idx < 4);
return horiz_offset(reg, 8 * idx);
}
/**
* Reinterpret each channel of register \p reg as a vector of values of the
* given smaller type and take the i-th subcomponent from each.
*/
static inline fs_reg
subscript(fs_reg reg, brw_reg_type type, unsigned i)
{
assert((i + 1) * type_sz(type) <= type_sz(reg.type));
if (reg.file == ARF || reg.file == FIXED_GRF) {
/* The stride is encoded inconsistently for fixed GRF and ARF registers
* as the log2 of the actual vertical and horizontal strides.
*/
const int delta = util_logbase2(type_sz(reg.type)) -
util_logbase2(type_sz(type));
reg.hstride += (reg.hstride ? delta : 0);
reg.vstride += (reg.vstride ? delta : 0);
} else if (reg.file == IMM) {
unsigned bit_size = type_sz(type) * 8;
reg.u64 >>= i * bit_size;
reg.u64 &= BITFIELD64_MASK(bit_size);
if (bit_size <= 16)
reg.u64 |= reg.u64 << 16;
return retype(reg, type);
} else {
reg.stride *= type_sz(reg.type) / type_sz(type);
}
return byte_offset(retype(reg, type), i * type_sz(type));
}
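/* E.g. (illustrative): for a D-typed VGRF region, subscript(reg,
* BRW_REGISTER_TYPE_W, 1) reads the high 16-bit word of every 32-bit
* channel by doubling the stride and adding a 2-byte offset.
*/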
static inline fs_reg
horiz_stride(fs_reg reg, unsigned s)
{
reg.stride *= s;
return reg;
}
static const fs_reg reg_undef;
class fs_inst : public backend_instruction {
fs_inst &operator=(const fs_inst &);
void init(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
const fs_reg *src, unsigned sources);
public:
DECLARE_RALLOC_CXX_OPERATORS(fs_inst)
fs_inst();
fs_inst(enum opcode opcode, uint8_t exec_size);
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst);
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
const fs_reg &src0);
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
const fs_reg &src0, const fs_reg &src1);
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
const fs_reg &src0, const fs_reg &src1, const fs_reg &src2);
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
const fs_reg src[], unsigned sources);
fs_inst(const fs_inst &that);
~fs_inst();
void resize_sources(uint8_t num_sources);
bool is_send_from_grf() const;
bool is_payload(unsigned arg) const;
bool is_partial_write() const;
unsigned components_read(unsigned i) const;
unsigned size_read(int arg) const;
bool can_do_source_mods(const struct intel_device_info *devinfo) const;
bool can_do_cmod();
bool can_change_types() const;
bool has_source_and_destination_hazard() const;
unsigned implied_mrf_writes() const;
/**
* Return whether \p arg is a control source of a virtual instruction which
* shouldn't contribute to the execution type and usual regioning
* restriction calculations of arithmetic instructions.
*/
bool is_control_source(unsigned arg) const;
/**
* Return the subset of flag registers read by the instruction as a bitset
* with byte granularity.
*/
unsigned flags_read(const intel_device_info *devinfo) const;
/**
* Return the subset of flag registers updated by the instruction (either
* partially or fully) as a bitset with byte granularity.
*/
unsigned flags_written(const intel_device_info *devinfo) const;
/**
* Return true if this instruction is a sampler message gathering residency
* data.
*/
bool has_sampler_residency() const;
fs_reg dst;
fs_reg *src;
uint8_t sources; /**< Number of fs_reg sources. */
bool last_rt:1;
bool pi_noperspective:1; /**< Pixel interpolator noperspective flag */
bool keep_payload_trailing_zeros;
tgl_swsb sched; /**< Scheduling info. */
};
/**
* Make the execution of \p inst dependent on the evaluation of a possibly
* inverted predicate.
*/
static inline fs_inst *
set_predicate_inv(enum brw_predicate pred, bool inverse,
fs_inst *inst)
{
inst->predicate = pred;
inst->predicate_inverse = inverse;
return inst;
}
/**
* Make the execution of \p inst dependent on the evaluation of a predicate.
*/
static inline fs_inst *
set_predicate(enum brw_predicate pred, fs_inst *inst)
{
return set_predicate_inv(pred, false, inst);
}
/**
* Write the result of evaluating the condition given by \p mod to a flag
* register.
*/
static inline fs_inst *
set_condmod(enum brw_conditional_mod mod, fs_inst *inst)
{
inst->conditional_mod = mod;
return inst;
}
/**
* Clamp the result of \p inst to the saturation range of its destination
* datatype.
*/
static inline fs_inst *
set_saturate(bool saturate, fs_inst *inst)
{
inst->saturate = saturate;
return inst;
}
/**
* Return the number of dataflow registers written by the instruction (either
* fully or partially) counted from 'floor(reg_offset(inst->dst) /
* register_size)'. The somewhat arbitrary register size unit is 4B for the
* UNIFORM and IMM files and 32B for all other files.
*/
inline unsigned
regs_written(const fs_inst *inst)
{
assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE +
inst->size_written -
MIN2(inst->size_written, reg_padding(inst->dst)),
REG_SIZE);
}
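/* Worked example (illustrative): a VGRF destination at byte offset 8 with
* size_written == 64 and unit stride touches bytes [8, 72) of its VGRF,
* i.e. DIV_ROUND_UP(8 + 64, 32) == 3 registers, one more than the 2 that
* 64 bytes alone would suggest.
*/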
/**
* Return the number of dataflow registers read by the instruction (either
* fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
* register_size)'. The somewhat arbitrary register size unit is 4B for the
* UNIFORM files and 32B for all other files.
*/
inline unsigned
regs_read(const fs_inst *inst, unsigned i)
{
if (inst->src[i].file == IMM)
return 1;
const unsigned reg_size = inst->src[i].file == UNIFORM ? 4 : REG_SIZE;
return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size +
inst->size_read(i) -
MIN2(inst->size_read(i), reg_padding(inst->src[i])),
reg_size);
}
static inline enum brw_reg_type
get_exec_type(const fs_inst *inst)
{
brw_reg_type exec_type = BRW_REGISTER_TYPE_B;
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file != BAD_FILE &&
!inst->is_control_source(i)) {
const brw_reg_type t = get_exec_type(inst->src[i].type);
if (type_sz(t) > type_sz(exec_type))
exec_type = t;
else if (type_sz(t) == type_sz(exec_type) &&
brw_reg_type_is_floating_point(t))
exec_type = t;
}
}
if (exec_type == BRW_REGISTER_TYPE_B)
exec_type = inst->dst.type;
assert(exec_type != BRW_REGISTER_TYPE_B);
/* Promotion of the execution type to 32-bit for conversions from or to
* half-float seems to be consistent with the following text from the
* Cherryview PRM Vol. 7, "Execution Data Type":
*
* "When single precision and half precision floats are mixed between
* source operands or between source and destination operand [..] single
* precision float is the execution datatype."
*
* and from "Register Region Restrictions":
*
* "Conversion between Integer and HF (Half Float) must be DWord aligned
* and strided by a DWord on the destination."
*/
if (type_sz(exec_type) == 2 &&
inst->dst.type != exec_type) {
if (exec_type == BRW_REGISTER_TYPE_HF)
exec_type = BRW_REGISTER_TYPE_F;
else if (inst->dst.type == BRW_REGISTER_TYPE_HF)
exec_type = BRW_REGISTER_TYPE_D;
}
return exec_type;
}
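/* E.g. (illustrative): a MOV from a W source to an HF destination executes
* as D per the rules above, while mixing HF sources with an F destination
* executes as F.
*/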
static inline unsigned
get_exec_type_size(const fs_inst *inst)
{
return type_sz(get_exec_type(inst));
}
static inline bool
is_send(const fs_inst *inst)
{
return inst->mlen || inst->is_send_from_grf();
}
/**
* Return whether the instruction isn't an ALU instruction and cannot be
* assumed to complete in-order.
*/
static inline bool
is_unordered(const intel_device_info *devinfo, const fs_inst *inst)
{
return is_send(inst) || (devinfo->ver < 20 && inst->is_math()) ||
inst->opcode == BRW_OPCODE_DPAS ||
(devinfo->has_64bit_float_via_math_pipe &&
(get_exec_type(inst) == BRW_REGISTER_TYPE_DF ||
inst->dst.type == BRW_REGISTER_TYPE_DF));
}
/**
* Return whether the following regioning restriction applies to the specified
* instruction. From the Cherryview PRM Vol 7. "Register Region
* Restrictions":
*
* "When source or destination datatype is 64b or operation is integer DWord
* multiply, regioning in Align1 must follow these rules:
*
* 1. Source and Destination horizontal stride must be aligned to the same qword.
* 2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
* 3. Source and Destination offset must be the same, except the case of
* scalar source."
*/
static inline bool
has_dst_aligned_region_restriction(const intel_device_info *devinfo,
const fs_inst *inst,
brw_reg_type dst_type)
{
const brw_reg_type exec_type = get_exec_type(inst);
/* Even though the hardware spec claims that "integer DWord multiply"
* operations are restricted, empirical evidence and the behavior of the
* simulator suggest that only 32x32-bit integer multiplication is
* restricted.
*/
const bool is_dword_multiply = !brw_reg_type_is_floating_point(exec_type) &&
((inst->opcode == BRW_OPCODE_MUL &&
MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||
(inst->opcode == BRW_OPCODE_MAD &&
MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));
if (type_sz(dst_type) > 4 || type_sz(exec_type) > 4 ||
(type_sz(exec_type) == 4 && is_dword_multiply))
return devinfo->platform == INTEL_PLATFORM_CHV ||
intel_device_info_is_9lp(devinfo) ||
devinfo->verx10 >= 125;
else if (brw_reg_type_is_floating_point(dst_type))
return devinfo->verx10 >= 125;
else
return false;
}
static inline bool
has_dst_aligned_region_restriction(const intel_device_info *devinfo,
const fs_inst *inst)
{
return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type);
}
/**
* Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from
* the specified register file into a VGRF.
*
* This implies identity register regions without any source-destination
* overlap, but otherwise has no implications on the location of sources and
* destination in the register file: Gathering any number of portions from
* multiple virtual registers in any order is allowed.
*/
inline bool
is_copy_payload(brw_reg_file file, const fs_inst *inst)
{
if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD ||
inst->is_partial_write() || inst->saturate ||
inst->dst.file != VGRF)
return false;
for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].file != file ||
inst->src[i].abs || inst->src[i].negate)
return false;
if (!inst->src[i].is_contiguous())
return false;
if (regions_overlap(inst->dst, inst->size_written,
inst->src[i], inst->size_read(i)))
return false;
}
return true;
}
/**
* Like is_copy_payload(), but the instruction is required to copy a single
* contiguous block of registers from the given register file into the
* destination without any reordering.
*/
inline bool
is_identity_payload(brw_reg_file file, const fs_inst *inst) {
if (is_copy_payload(file, inst)) {
fs_reg reg = inst->src[0];
for (unsigned i = 0; i < inst->sources; i++) {
reg.type = inst->src[i].type;
if (!inst->src[i].equals(reg))
return false;
reg = byte_offset(reg, inst->size_read(i));
}
return true;
} else {
return false;
}
}
/**
* Like is_copy_payload(), but the instruction is required to source data from
* at least two disjoint VGRFs.
*
* This doesn't necessarily rule out the elimination of this instruction
* through register coalescing, but due to limitations of the register
* coalesce pass it might be impossible to do so directly until a later stage,
* when the LOAD_PAYLOAD instruction is unrolled into a sequence of MOV
* instructions.
*/
inline bool
is_multi_copy_payload(const fs_inst *inst) {
if (is_copy_payload(VGRF, inst)) {
for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].nr != inst->src[0].nr)
return true;
}
}
return false;
}
/**
* Like is_identity_payload(), but the instruction is required to copy the
* whole contents of a single VGRF into the destination.
*
* This means that there is a good chance that the instruction will be
* eliminated through register coalescing, but it's neither a necessary nor a
* sufficient condition for that to happen -- E.g. consider the case where
* source and destination registers diverge due to other instructions in the
* program overwriting part of their contents, which isn't something we can
* predict up front based on a cheap strictly local test of the copy
* instruction.
*/
inline bool
is_coalescing_payload(const brw::simple_allocator &alloc, const fs_inst *inst)
{
return is_identity_payload(VGRF, inst) &&
inst->src[0].offset == 0 &&
alloc.sizes[inst->src[0].nr] * REG_SIZE == inst->size_written;
}
bool
has_bank_conflict(const struct brw_isa_info *isa, const fs_inst *inst);
#endif

File diff suppressed because it is too large


@ -0,0 +1,86 @@
/* -*- c++ -*- */
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_IR_PERFORMANCE_H
#define BRW_IR_PERFORMANCE_H
class fs_visitor;
namespace brw {
class vec4_visitor;
/**
* Various estimates of the performance of a shader based on static
* analysis.
*/
struct performance {
performance(const fs_visitor *v);
performance(const vec4_visitor *v);
~performance();
analysis_dependency_class
dependency_class() const
{
return (DEPENDENCY_INSTRUCTIONS |
DEPENDENCY_BLOCKS);
}
bool
validate(const backend_shader *) const
{
return true;
}
/**
* Array containing estimates of the runtime of each basic block of the
* program in cycle units.
*/
unsigned *block_latency;
/**
* Estimate of the runtime of the whole program in cycle units assuming
* uncontended execution.
*/
unsigned latency;
/**
* Estimate of the throughput of the whole program in
* invocations-per-cycle units.
*
* Note that this might be lower than the ratio between the dispatch
* width of the program and its latency estimate in cases where
* performance doesn't scale without limits as a function of its thread
* parallelism, e.g. due to the existence of a bottleneck in a shared
* function.
*/
float throughput;
private:
performance(const performance &perf);
performance &
operator=(performance u);
};
}
#endif


@ -0,0 +1,475 @@
/* -*- c++ -*- */
/*
* Copyright © 2011-2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_IR_VEC4_H
#define BRW_IR_VEC4_H
#include "brw_shader.h"
namespace brw {
class dst_reg;
class src_reg : public backend_reg
{
public:
DECLARE_RALLOC_CXX_OPERATORS(src_reg)
void init();
src_reg(enum brw_reg_file file, int nr, const glsl_type *type);
src_reg();
src_reg(struct ::brw_reg reg);
bool equals(const src_reg &r) const;
bool negative_equals(const src_reg &r) const;
src_reg(class vec4_visitor *v, const struct glsl_type *type);
src_reg(class vec4_visitor *v, const struct glsl_type *type, int size);
explicit src_reg(const dst_reg &reg);
src_reg *reladdr;
};
static inline src_reg
retype(src_reg reg, enum brw_reg_type type)
{
reg.type = type;
return reg;
}
namespace detail {
static inline void
add_byte_offset(backend_reg *reg, unsigned bytes)
{
switch (reg->file) {
case BAD_FILE:
break;
case VGRF:
case ATTR:
case UNIFORM:
reg->offset += bytes;
assert(reg->offset % 16 == 0);
break;
case MRF: {
const unsigned suboffset = reg->offset + bytes;
reg->nr += suboffset / REG_SIZE;
reg->offset = suboffset % REG_SIZE;
assert(reg->offset % 16 == 0);
break;
}
case ARF:
case FIXED_GRF: {
const unsigned suboffset = reg->subnr + bytes;
reg->nr += suboffset / REG_SIZE;
reg->subnr = suboffset % REG_SIZE;
assert(reg->subnr % 16 == 0);
break;
}
default:
assert(bytes == 0);
}
}
} /* namespace detail */
static inline src_reg
byte_offset(src_reg reg, unsigned bytes)
{
detail::add_byte_offset(&reg, bytes);
return reg;
}
static inline src_reg
offset(src_reg reg, unsigned width, unsigned delta)
{
const unsigned stride = (reg.file == UNIFORM ? 0 : 4);
const unsigned num_components = MAX2(width / 4 * stride, 4);
return byte_offset(reg, num_components * type_sz(reg.type) * delta);
}
static inline src_reg
horiz_offset(src_reg reg, unsigned delta)
{
return byte_offset(reg, delta * type_sz(reg.type));
}
/**
* Reswizzle a given source register.
* \sa brw_swizzle().
*/
static inline src_reg
swizzle(src_reg reg, unsigned swizzle)
{
if (reg.file == IMM)
reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swizzle);
else
reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle);
return reg;
}
static inline src_reg
negate(src_reg reg)
{
assert(reg.file != IMM);
reg.negate = !reg.negate;
return reg;
}
static inline bool
is_uniform(const src_reg &reg)
{
return (reg.file == IMM || reg.file == UNIFORM || reg.is_null()) &&
(!reg.reladdr || is_uniform(*reg.reladdr));
}
class dst_reg : public backend_reg
{
public:
DECLARE_RALLOC_CXX_OPERATORS(dst_reg)
void init();
dst_reg();
dst_reg(enum brw_reg_file file, int nr);
dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
unsigned writemask);
dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
unsigned writemask);
dst_reg(struct ::brw_reg reg);
dst_reg(class vec4_visitor *v, const struct glsl_type *type);
explicit dst_reg(const src_reg &reg);
bool equals(const dst_reg &r) const;
src_reg *reladdr;
};
static inline dst_reg
retype(dst_reg reg, enum brw_reg_type type)
{
reg.type = type;
return reg;
}
static inline dst_reg
byte_offset(dst_reg reg, unsigned bytes)
{
detail::add_byte_offset(&reg, bytes);
return reg;
}
static inline dst_reg
offset(dst_reg reg, unsigned width, unsigned delta)
{
const unsigned stride = (reg.file == UNIFORM ? 0 : 4);
const unsigned num_components = MAX2(width / 4 * stride, 4);
return byte_offset(reg, num_components * type_sz(reg.type) * delta);
}
static inline dst_reg
horiz_offset(const dst_reg &reg, unsigned delta)
{
if (is_uniform(src_reg(reg)))
return reg;
else
return byte_offset(reg, delta * type_sz(reg.type));
}
static inline dst_reg
writemask(dst_reg reg, unsigned mask)
{
assert(reg.file != IMM);
assert((reg.writemask & mask) != 0);
reg.writemask &= mask;
return reg;
}
/**
* Return an integer identifying the discrete address space a register is
* contained in. A register is by definition fully contained in the single
* reg_space it belongs to, so two registers with different reg_space ids are
* guaranteed not to overlap. Most register files form a single reg_space of
* their own; only the VGRF file is composed of multiple discrete address
* spaces, one for each VGRF allocation.
*/
static inline uint32_t
reg_space(const backend_reg &r)
{
return r.file << 16 | (r.file == VGRF ? r.nr : 0);
}
/**
* Return the base offset in bytes of a register relative to the start of its
* reg_space().
*/
static inline unsigned
reg_offset(const backend_reg &r)
{
return (r.file == VGRF || r.file == IMM ? 0 : r.nr) *
(r.file == UNIFORM ? 16 : REG_SIZE) + r.offset +
(r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
}
/**
* Return whether the register region starting at \p r and spanning \p dr
* bytes could potentially overlap the register region starting at \p s and
* spanning \p ds bytes.
*/
static inline bool
regions_overlap(const backend_reg &r, unsigned dr,
const backend_reg &s, unsigned ds)
{
if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) {
/* COMPR4 regions are translated by the hardware during decompression
* into two separate half-regions 4 MRFs apart from each other.
*/
backend_reg t0 = r;
t0.nr &= ~BRW_MRF_COMPR4;
backend_reg t1 = t0;
t1.offset += 4 * REG_SIZE;
return regions_overlap(t0, dr / 2, s, ds) ||
regions_overlap(t1, dr / 2, s, ds);
} else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) {
return regions_overlap(s, ds, r, dr);
} else {
return reg_space(r) == reg_space(s) &&
!(reg_offset(r) + dr <= reg_offset(s) ||
reg_offset(s) + ds <= reg_offset(r));
}
}
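An aside on the two helpers above: reg_space() assigns every register a discrete address-space id and reg_offset() a byte position within it, so overlap checking reduces to a one-dimensional interval test per space. A minimal self-contained sketch of that technique, with a simplified register struct standing in for backend_reg (the field names and REG_SIZE value here are illustrative assumptions, not the in-tree definitions):
#include <assert.h>
#include <stdint.h>
#define REG_SIZE 32  /* illustrative: one hardware register, in bytes */
enum file { VGRF, FIXED_GRF };
struct reg { enum file file; unsigned nr; unsigned offset; };
/* Discrete address space id: VGRFs get one space per allocation. */
static uint32_t space(const struct reg *r)
{
   return (uint32_t)r->file << 16 | (r->file == VGRF ? r->nr : 0);
}
/* Byte offset from the start of the register's address space. */
static unsigned off(const struct reg *r)
{
   return (r->file == VGRF ? 0 : r->nr) * REG_SIZE + r->offset;
}
/* Two regions overlap iff they share a space and their intervals meet. */
static int overlap(const struct reg *r, unsigned dr,
                   const struct reg *s, unsigned ds)
{
   return space(r) == space(s) &&
          !(off(r) + dr <= off(s) || off(s) + ds <= off(r));
}
int main(void)
{
   struct reg a = { VGRF, 1, 0 }, b = { VGRF, 1, 16 }, c = { VGRF, 2, 0 };
   assert(overlap(&a, 32, &b, 16));   /* same VGRF, intervals intersect */
   assert(!overlap(&a, 16, &b, 16));  /* same VGRF, disjoint intervals */
   assert(!overlap(&a, 32, &c, 32));  /* different VGRF allocations */
   return 0;
}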
class vec4_instruction : public backend_instruction {
public:
DECLARE_RALLOC_CXX_OPERATORS(vec4_instruction)
vec4_instruction(enum opcode opcode,
const dst_reg &dst = dst_reg(),
const src_reg &src0 = src_reg(),
const src_reg &src1 = src_reg(),
const src_reg &src2 = src_reg());
dst_reg dst;
src_reg src[3];
enum brw_urb_write_flags urb_write_flags;
unsigned sol_binding; /**< gfx6: SOL binding table index */
bool sol_final_write; /**< gfx6: send commit message */
unsigned sol_vertex; /**< gfx6: used for setting dst index in SVB header */
bool is_send_from_grf() const;
unsigned size_read(unsigned arg) const;
bool can_reswizzle(const struct intel_device_info *devinfo,
int dst_writemask,
int swizzle, int swizzle_mask);
void reswizzle(int dst_writemask, int swizzle);
bool can_do_source_mods(const struct intel_device_info *devinfo);
bool can_do_cmod();
bool can_do_writemask(const struct intel_device_info *devinfo);
bool can_change_types() const;
bool has_source_and_destination_hazard() const;
unsigned implied_mrf_writes() const;
bool is_align1_partial_write()
{
return opcode == VEC4_OPCODE_SET_LOW_32BIT ||
opcode == VEC4_OPCODE_SET_HIGH_32BIT;
}
bool reads_flag() const
{
return predicate || opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2;
}
bool reads_flag(unsigned c)
{
if (opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2)
return true;
switch (predicate) {
case BRW_PREDICATE_NONE:
return false;
case BRW_PREDICATE_ALIGN16_REPLICATE_X:
return c == 0;
case BRW_PREDICATE_ALIGN16_REPLICATE_Y:
return c == 1;
case BRW_PREDICATE_ALIGN16_REPLICATE_Z:
return c == 2;
case BRW_PREDICATE_ALIGN16_REPLICATE_W:
return c == 3;
default:
return true;
}
}
bool writes_flag(const intel_device_info *devinfo) const
{
return (conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) &&
opcode != BRW_OPCODE_CSEL &&
opcode != BRW_OPCODE_IF &&
opcode != BRW_OPCODE_WHILE));
}
bool reads_g0_implicitly() const
{
switch (opcode) {
case SHADER_OPCODE_TEX:
case SHADER_OPCODE_TXL:
case SHADER_OPCODE_TXD:
case SHADER_OPCODE_TXF:
case SHADER_OPCODE_TXF_CMS_W:
case SHADER_OPCODE_TXF_CMS:
case SHADER_OPCODE_TXF_MCS:
case SHADER_OPCODE_TXS:
case SHADER_OPCODE_TG4:
case SHADER_OPCODE_TG4_OFFSET:
case SHADER_OPCODE_SAMPLEINFO:
case VS_OPCODE_PULL_CONSTANT_LOAD:
case GS_OPCODE_SET_PRIMITIVE_ID:
case GS_OPCODE_GET_INSTANCE_ID:
case SHADER_OPCODE_GFX4_SCRATCH_READ:
case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
return true;
default:
return false;
}
}
};
/**
* Make the execution of \p inst dependent on the evaluation of a possibly
* inverted predicate.
*/
inline vec4_instruction *
set_predicate_inv(enum brw_predicate pred, bool inverse,
vec4_instruction *inst)
{
inst->predicate = pred;
inst->predicate_inverse = inverse;
return inst;
}
/**
* Make the execution of \p inst dependent on the evaluation of a predicate.
*/
inline vec4_instruction *
set_predicate(enum brw_predicate pred, vec4_instruction *inst)
{
return set_predicate_inv(pred, false, inst);
}
/**
* Write the result of evaluating the condition given by \p mod to a flag
* register.
*/
inline vec4_instruction *
set_condmod(enum brw_conditional_mod mod, vec4_instruction *inst)
{
inst->conditional_mod = mod;
return inst;
}
/**
* Clamp the result of \p inst to the saturation range of its destination
* datatype.
*/
inline vec4_instruction *
set_saturate(bool saturate, vec4_instruction *inst)
{
inst->saturate = saturate;
return inst;
}
/**
* Return the number of dataflow registers written by the instruction (either
* fully or partially) counted from 'floor(reg_offset(inst->dst) /
* register_size)'. The somewhat arbitrary register size unit is 16B for the
* UNIFORM and IMM files and 32B for all other files.
*/
inline unsigned
regs_written(const vec4_instruction *inst)
{
assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE + inst->size_written,
REG_SIZE);
}
/**
* Return the number of dataflow registers read by the instruction (either
* fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
* register_size)'. The somewhat arbitrary register size unit is 16B for the
* UNIFORM and IMM files and 32B for all other files.
*/
inline unsigned
regs_read(const vec4_instruction *inst, unsigned i)
{
const unsigned reg_size =
inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 16 : REG_SIZE;
return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + inst->size_read(i),
reg_size);
}
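A quick worked check of the counting rule in regs_written()/regs_read(): a region's register footprint is the ceiling of (start misalignment + size) over the unit size, 16B for UNIFORM/IMM and 32B otherwise. Standalone, with DIV_ROUND_UP re-defined locally for illustration:
#include <assert.h>
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
int main(void)
{
   /* A 32B write starting 16 bytes into a 32B register unit straddles
    * two registers: DIV_ROUND_UP(16 % 32 + 32, 32) == 2. */
   assert(DIV_ROUND_UP(16 % 32 + 32, 32) == 2);
   /* The same 32B write aligned to a register boundary touches one. */
   assert(DIV_ROUND_UP(0 % 32 + 32, 32) == 1);
   /* A UNIFORM or IMM source uses a 16B unit instead of 32B. */
   assert(DIV_ROUND_UP(8 % 16 + 16, 16) == 2);
   return 0;
}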
static inline enum brw_reg_type
get_exec_type(const vec4_instruction *inst)
{
enum brw_reg_type exec_type = BRW_REGISTER_TYPE_B;
for (int i = 0; i < 3; i++) {
if (inst->src[i].file != BAD_FILE) {
const brw_reg_type t = get_exec_type(brw_reg_type(inst->src[i].type));
if (type_sz(t) > type_sz(exec_type))
exec_type = t;
else if (type_sz(t) == type_sz(exec_type) &&
brw_reg_type_is_floating_point(t))
exec_type = t;
}
}
if (exec_type == BRW_REGISTER_TYPE_B)
exec_type = inst->dst.type;
/* TODO: We need to handle half-float conversions. */
assert(exec_type != BRW_REGISTER_TYPE_HF ||
inst->dst.type == BRW_REGISTER_TYPE_HF);
assert(exec_type != BRW_REGISTER_TYPE_B);
return exec_type;
}
static inline unsigned
get_exec_type_size(const vec4_instruction *inst)
{
return type_sz(get_exec_type(inst));
}
} /* namespace brw */
#endif

View file

@ -0,0 +1,86 @@
/*
* Copyright © 2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef BRW_ISA_ENCODING_H
#define BRW_ISA_ENCODING_H
#include "dev/intel_device_info.h"
#include "brw_eu_defines.h"
#ifdef __cplusplus
extern "C" {
#endif
struct opcode_desc;
struct brw_isa_info {
const struct intel_device_info *devinfo;
/* A mapping from enum opcode to the corresponding opcode_desc */
const struct opcode_desc *ir_to_descs[NUM_BRW_OPCODES];
/** A mapping from a HW opcode encoding to the corresponding opcode_desc */
const struct opcode_desc *hw_to_descs[128];
};
void brw_init_isa_info(struct brw_isa_info *isa,
const struct intel_device_info *devinfo);
struct opcode_desc {
unsigned ir;
unsigned hw;
const char *name;
int nsrc;
int ndst;
int gfx_vers;
};
const struct opcode_desc *
brw_opcode_desc(const struct brw_isa_info *isa, enum opcode opcode);
const struct opcode_desc *
brw_opcode_desc_from_hw(const struct brw_isa_info *isa, unsigned hw);
static inline unsigned
brw_opcode_encode(const struct brw_isa_info *isa, enum opcode opcode)
{
return brw_opcode_desc(isa, opcode)->hw;
}
static inline enum opcode
brw_opcode_decode(const struct brw_isa_info *isa, unsigned hw)
{
const struct opcode_desc *desc = brw_opcode_desc_from_hw(isa, hw);
return desc ? (enum opcode)desc->ir : BRW_OPCODE_ILLEGAL;
}
static inline bool
is_3src(const struct brw_isa_info *isa, enum opcode opcode)
{
const struct opcode_desc *desc = brw_opcode_desc(isa, opcode);
return desc && desc->nsrc == 3;
}
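The two tables in brw_isa_info make encode and decode both O(1) array lookups: ir_to_descs is indexed by IR opcode, hw_to_descs by hardware encoding, and unknown encodings decode to NULL. A hedged, standalone mirror of that double-indexing scheme (the table contents here are made up for illustration):
#include <assert.h>
#include <stddef.h>
struct desc { unsigned ir; unsigned hw; const char *name; };
/* Hypothetical descriptor table: IR opcode 0 encodes as HW 0x60, etc. */
static const struct desc descs[] = {
   { 0, 0x60, "mov" },
   { 1, 0x61, "add" },
};
static const struct desc *ir_to_desc[2];
static const struct desc *hw_to_desc[128];
static void init(void)
{
   for (size_t i = 0; i < sizeof(descs) / sizeof(descs[0]); i++) {
      ir_to_desc[descs[i].ir] = &descs[i];
      hw_to_desc[descs[i].hw] = &descs[i];
   }
}
int main(void)
{
   init();
   /* Encode: IR -> HW; decode: HW -> IR; unknown HW yields NULL. */
   assert(ir_to_desc[1]->hw == 0x61);
   assert(hw_to_desc[0x60]->ir == 0);
   assert(hw_to_desc[0x7f] == NULL);
   return 0;
}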
#ifdef __cplusplus
}
#endif
#endif

View file

@ -0,0 +1,790 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_kernel.h"
#include "brw_nir.h"
#include "intel_nir.h"
#include "intel_nir.h"
#include "nir_clc_helpers.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/spirv/nir_spirv.h"
#include "dev/intel_debug.h"
#include "util/u_atomic.h"
#include "util/u_dynarray.h"
static const nir_shader *
load_clc_shader(struct brw_compiler *compiler, struct disk_cache *disk_cache,
const nir_shader_compiler_options *nir_options,
const struct spirv_to_nir_options *spirv_options)
{
if (compiler->clc_shader)
return compiler->clc_shader;
nir_shader *nir = nir_load_libclc_shader(64, disk_cache,
spirv_options, nir_options,
disk_cache != NULL);
if (nir == NULL)
return NULL;
const nir_shader *old_nir =
p_atomic_cmpxchg(&compiler->clc_shader, NULL, nir);
if (old_nir == NULL) {
/* We won the race */
ralloc_steal(compiler, nir);
return nir;
} else {
/* Someone else built the shader first */
ralloc_free(nir);
return old_nir;
}
}
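load_clc_shader() above uses the classic lock-free once-initialization idiom: build the object speculatively, publish it with a compare-and-swap, and free the loser's copy. The same pattern in standalone C11, with atomic_compare_exchange_strong standing in for p_atomic_cmpxchg and an int in place of the shader:
#include <assert.h>
#include <stdatomic.h>
#include <stdlib.h>
static _Atomic(int *) cached;
static int *get_shared(void)
{
   int *cur = atomic_load(&cached);
   if (cur)
      return cur;                        /* fast path: already published */
   int *fresh = malloc(sizeof(*fresh));  /* build speculatively */
   *fresh = 42;
   int *expected = NULL;
   if (atomic_compare_exchange_strong(&cached, &expected, fresh))
      return fresh;                      /* we won the race */
   free(fresh);                          /* someone else published first */
   return expected;                      /* failed CAS wrote the winner here */
}
int main(void)
{
   assert(*get_shared() == 42);
   assert(get_shared() == get_shared()); /* stable after first call */
   return 0;
}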
static nir_builder
builder_init_new_impl(nir_function *func)
{
nir_function_impl *impl = nir_function_impl_create(func);
return nir_builder_at(nir_before_impl(impl));
}
static void
implement_atomic_builtin(nir_function *func, nir_atomic_op atomic_op,
enum glsl_base_type data_base_type,
nir_variable_mode mode)
{
nir_builder b = builder_init_new_impl(func);
const struct glsl_type *data_type = glsl_scalar_type(data_base_type);
unsigned p = 0;
nir_deref_instr *ret = NULL;
ret = nir_build_deref_cast(&b, nir_load_param(&b, p++),
nir_var_function_temp, data_type, 0);
nir_intrinsic_op op = nir_intrinsic_deref_atomic;
nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b.shader, op);
nir_intrinsic_set_atomic_op(atomic, atomic_op);
for (unsigned i = 0; i < nir_intrinsic_infos[op].num_srcs; i++) {
nir_def *src = nir_load_param(&b, p++);
if (i == 0) {
/* The first source is our deref */
assert(nir_intrinsic_infos[op].src_components[i] == -1);
src = &nir_build_deref_cast(&b, src, mode, data_type, 0)->def;
}
atomic->src[i] = nir_src_for_ssa(src);
}
nir_def_init_for_type(&atomic->instr, &atomic->def, data_type);
nir_builder_instr_insert(&b, &atomic->instr);
nir_store_deref(&b, ret, &atomic->def, ~0);
}
static void
implement_sub_group_ballot_builtin(nir_function *func)
{
nir_builder b = builder_init_new_impl(func);
nir_deref_instr *ret =
nir_build_deref_cast(&b, nir_load_param(&b, 0),
nir_var_function_temp, glsl_uint_type(), 0);
nir_def *cond = nir_load_param(&b, 1);
nir_intrinsic_instr *ballot =
nir_intrinsic_instr_create(b.shader, nir_intrinsic_ballot);
ballot->src[0] = nir_src_for_ssa(cond);
ballot->num_components = 1;
nir_def_init(&ballot->instr, &ballot->def, 1, 32);
nir_builder_instr_insert(&b, &ballot->instr);
nir_store_deref(&b, ret, &ballot->def, ~0);
}
static bool
implement_intel_builtins(nir_shader *nir)
{
bool progress = false;
nir_foreach_function(func, nir) {
if (strcmp(func->name, "_Z10atomic_minPU3AS1Vff") == 0) {
/* float atom_min(__global float volatile *p, float val) */
implement_atomic_builtin(func, nir_atomic_op_fmin,
GLSL_TYPE_FLOAT, nir_var_mem_global);
progress = true;
} else if (strcmp(func->name, "_Z10atomic_maxPU3AS1Vff") == 0) {
/* float atom_max(__global float volatile *p, float val) */
implement_atomic_builtin(func, nir_atomic_op_fmax,
GLSL_TYPE_FLOAT, nir_var_mem_global);
progress = true;
} else if (strcmp(func->name, "_Z10atomic_minPU3AS3Vff") == 0) {
/* float atomic_min(__shared float volatile *, float) */
implement_atomic_builtin(func, nir_atomic_op_fmin,
GLSL_TYPE_FLOAT, nir_var_mem_shared);
progress = true;
} else if (strcmp(func->name, "_Z10atomic_maxPU3AS3Vff") == 0) {
/* float atomic_max(__shared float volatile *, float) */
implement_atomic_builtin(func, nir_atomic_op_fmax,
GLSL_TYPE_FLOAT, nir_var_mem_shared);
progress = true;
} else if (strcmp(func->name, "intel_sub_group_ballot") == 0) {
implement_sub_group_ballot_builtin(func);
progress = true;
}
}
nir_shader_preserve_all_metadata(nir);
return progress;
}
static bool
lower_kernel_intrinsics(nir_shader *nir)
{
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
bool progress = false;
unsigned kernel_sysvals_start = 0;
unsigned kernel_arg_start = sizeof(struct brw_kernel_sysvals);
nir->num_uniforms += kernel_arg_start;
nir_builder b = nir_builder_create(impl);
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_kernel_input: {
b.cursor = nir_instr_remove(&intrin->instr);
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform);
load->num_components = intrin->num_components;
load->src[0] = nir_src_for_ssa(nir_u2u32(&b, intrin->src[0].ssa));
nir_intrinsic_set_base(load, kernel_arg_start);
nir_intrinsic_set_range(load, nir->num_uniforms);
nir_def_init(&load->instr, &load->def,
intrin->def.num_components,
intrin->def.bit_size);
nir_builder_instr_insert(&b, &load->instr);
nir_def_rewrite_uses(&intrin->def, &load->def);
progress = true;
break;
}
case nir_intrinsic_load_constant_base_ptr: {
b.cursor = nir_instr_remove(&intrin->instr);
nir_def *const_data_base_addr = nir_pack_64_2x32_split(&b,
nir_load_reloc_const_intel(&b, BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW),
nir_load_reloc_const_intel(&b, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH));
nir_def_rewrite_uses(&intrin->def, const_data_base_addr);
progress = true;
break;
}
case nir_intrinsic_load_num_workgroups: {
b.cursor = nir_instr_remove(&intrin->instr);
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform);
load->num_components = 3;
load->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
nir_intrinsic_set_base(load, kernel_sysvals_start +
offsetof(struct brw_kernel_sysvals, num_work_groups));
nir_intrinsic_set_range(load, 3 * 4);
nir_def_init(&load->instr, &load->def, 3, 32);
nir_builder_instr_insert(&b, &load->instr);
nir_def_rewrite_uses(&intrin->def, &load->def);
progress = true;
break;
}
default:
break;
}
}
}
if (progress) {
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
} else {
nir_metadata_preserve(impl, nir_metadata_all);
}
return progress;
}
bool
brw_kernel_from_spirv(struct brw_compiler *compiler,
struct disk_cache *disk_cache,
struct brw_kernel *kernel,
void *log_data, void *mem_ctx,
const uint32_t *spirv, size_t spirv_size,
const char *entrypoint_name,
char **error_str)
{
const struct intel_device_info *devinfo = compiler->devinfo;
const nir_shader_compiler_options *nir_options =
compiler->nir_options[MESA_SHADER_KERNEL];
struct spirv_to_nir_options spirv_options = {
.environment = NIR_SPIRV_OPENCL,
.caps = {
.address = true,
.float16 = devinfo->ver >= 8,
.float64 = devinfo->ver >= 8,
.groups = true,
.image_write_without_format = true,
.int8 = devinfo->ver >= 8,
.int16 = devinfo->ver >= 8,
.int64 = devinfo->ver >= 8,
.int64_atomics = devinfo->ver >= 9,
.kernel = true,
.linkage = true, /* We receive linked kernel from clc */
.float_controls = devinfo->ver >= 8,
.generic_pointers = true,
.storage_8bit = devinfo->ver >= 8,
.storage_16bit = devinfo->ver >= 8,
.subgroup_arithmetic = true,
.subgroup_basic = true,
.subgroup_ballot = true,
.subgroup_dispatch = true,
.subgroup_quad = true,
.subgroup_shuffle = true,
.subgroup_vote = true,
.intel_subgroup_shuffle = true,
.intel_subgroup_buffer_block_io = true,
},
.shared_addr_format = nir_address_format_62bit_generic,
.global_addr_format = nir_address_format_62bit_generic,
.temp_addr_format = nir_address_format_62bit_generic,
.constant_addr_format = nir_address_format_64bit_global,
};
spirv_options.clc_shader = load_clc_shader(compiler, disk_cache,
nir_options, &spirv_options);
if (spirv_options.clc_shader == NULL) {
fprintf(stderr, "ERROR: libclc shader missing."
" Consider installing the libclc package\n");
abort();
}
assert(spirv_size % 4 == 0);
nir_shader *nir =
spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
entrypoint_name, &spirv_options, nir_options);
nir_validate_shader(nir, "after spirv_to_nir");
nir_validate_ssa_dominance(nir, "after spirv_to_nir");
ralloc_steal(mem_ctx, nir);
nir->info.name = ralloc_strdup(nir, entrypoint_name);
if (INTEL_DEBUG(DEBUG_CS)) {
/* Re-index SSA defs so we print more sensible numbers. */
nir_foreach_function_impl(impl, nir) {
nir_index_ssa_defs(impl);
}
fprintf(stderr, "NIR (from SPIR-V) for kernel\n");
nir_print_shader(nir, stderr);
}
NIR_PASS_V(nir, implement_intel_builtins);
NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader);
/* We have to lower away local constant initializers right before we
* inline functions. That way they get properly initialized at the top
* of the function and not at the top of its caller.
*/
NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
NIR_PASS_V(nir, nir_lower_returns);
NIR_PASS_V(nir, nir_inline_functions);
NIR_PASS_V(nir, nir_copy_prop);
NIR_PASS_V(nir, nir_opt_deref);
/* Pick off the single entrypoint that we want */
nir_remove_non_entrypoints(nir);
/* Now that we've deleted all but the main function, we can go ahead and
* lower the rest of the constant initializers. We do this here so that
* nir_remove_dead_variables and split_per_member_structs below see the
* corresponding stores.
*/
NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);
/* LLVM loves to take advantage of the fact that vec3s in OpenCL are 16B
* aligned and so it can just read/write them as vec4s. This results in a
* LOT of vec4->vec3 casts on loads and stores. One solution to this
* problem is to get rid of all vec3 variables.
*/
NIR_PASS_V(nir, nir_lower_vec3_to_vec4,
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_shared | nir_var_mem_global |
nir_var_mem_constant);
/* We assign explicit types early so that the optimizer can take advantage
* of that information and hopefully get rid of some of our memcpys.
*/
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
nir_var_uniform |
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_shared | nir_var_mem_global,
glsl_get_cl_type_size_align);
struct brw_nir_compiler_opts opts = {};
brw_preprocess_nir(compiler, nir, &opts);
int max_arg_idx = -1;
nir_foreach_uniform_variable(var, nir) {
assert(var->data.location < 256);
max_arg_idx = MAX2(max_arg_idx, var->data.location);
}
kernel->args_size = nir->num_uniforms;
kernel->arg_count = max_arg_idx + 1;
/* No bindings */
struct brw_kernel_arg_desc *args =
rzalloc_array(mem_ctx, struct brw_kernel_arg_desc, kernel->arg_count);
kernel->args = args;
nir_foreach_uniform_variable(var, nir) {
struct brw_kernel_arg_desc arg_desc = {
.offset = var->data.driver_location,
.size = glsl_get_explicit_size(var->type, false),
};
assert(arg_desc.offset + arg_desc.size <= nir->num_uniforms);
assert(var->data.location >= 0);
args[var->data.location] = arg_desc;
}
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_all, NULL);
/* Lower again, this time after dead-variables to get more compact variable
* layouts.
*/
nir->global_mem_size = 0;
nir->scratch_size = 0;
nir->info.shared_size = 0;
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant,
glsl_get_cl_type_size_align);
if (nir->constant_data_size > 0) {
assert(nir->constant_data == NULL);
nir->constant_data = rzalloc_size(nir, nir->constant_data_size);
nir_gather_explicit_io_initializers(nir, nir->constant_data,
nir->constant_data_size,
nir_var_mem_constant);
}
if (INTEL_DEBUG(DEBUG_CS)) {
/* Re-index SSA defs so we print more sensible numbers. */
nir_foreach_function_impl(impl, nir) {
nir_index_ssa_defs(impl);
}
fprintf(stderr, "NIR (before I/O lowering) for kernel\n");
nir_print_shader(nir, stderr);
}
NIR_PASS_V(nir, nir_lower_memcpy);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant,
nir_address_format_64bit_global);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform,
nir_address_format_32bit_offset_as_64bit);
NIR_PASS_V(nir, nir_lower_explicit_io,
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_shared | nir_var_mem_global,
nir_address_format_62bit_generic);
NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL);
NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, NULL);
NIR_PASS_V(nir, lower_kernel_intrinsics);
struct brw_cs_prog_key key = { };
memset(&kernel->prog_data, 0, sizeof(kernel->prog_data));
kernel->prog_data.base.nr_params = DIV_ROUND_UP(nir->num_uniforms, 4);
struct brw_compile_cs_params params = {
.base = {
.nir = nir,
.stats = kernel->stats,
.log_data = log_data,
.mem_ctx = mem_ctx,
},
.key = &key,
.prog_data = &kernel->prog_data,
};
kernel->code = brw_compile_cs(compiler, &params);
if (error_str)
*error_str = params.base.error_str;
return kernel->code != NULL;
}
static nir_def *
rebuild_value_from_store(struct util_dynarray *stores,
nir_def *value, unsigned read_offset)
{
unsigned read_size = value->num_components * value->bit_size / 8;
util_dynarray_foreach(stores, nir_intrinsic_instr *, _store) {
nir_intrinsic_instr *store = *_store;
unsigned write_offset = nir_src_as_uint(store->src[1]);
unsigned write_size = nir_src_num_components(store->src[0]) *
nir_src_bit_size(store->src[0]) / 8;
if (write_offset <= read_offset &&
(write_offset + write_size) >= (read_offset + read_size)) {
assert(nir_block_dominates(store->instr.block, value->parent_instr->block));
assert(write_size == read_size);
return store->src[0].ssa;
}
}
unreachable("Matching scratch store not found");
}
/**
* Remove temporary variables stored to scratch to be then reloaded
* immediately. Remap the load to the store SSA value.
*
* This workaround is only meant to be applied to shaders in src/intel/shaders
* where we know there should be no issue. More complex cases might not work
* with this approach.
*/
static bool
nir_remove_llvm17_scratch(nir_shader *nir)
{
struct util_dynarray scratch_stores;
void *mem_ctx = ralloc_context(NULL);
util_dynarray_init(&scratch_stores, mem_ctx);
nir_foreach_function_impl(func, nir) {
nir_foreach_block(block, func) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_store_scratch)
continue;
nir_const_value *offset = nir_src_as_const_value(intrin->src[1]);
if (offset != NULL) {
util_dynarray_append(&scratch_stores, nir_intrinsic_instr *, intrin);
}
}
}
}
bool progress = false;
if (util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) > 0) {
nir_foreach_function_impl(func, nir) {
nir_foreach_block(block, func) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_scratch)
continue;
nir_const_value *offset = nir_src_as_const_value(intrin->src[0]);
if (offset == NULL)
continue;
nir_def_rewrite_uses(&intrin->def,
rebuild_value_from_store(
&scratch_stores, &intrin->def,
nir_src_as_uint(intrin->src[0])));
nir_instr_remove(instr);
progress = true;
}
}
}
}
util_dynarray_foreach(&scratch_stores, nir_intrinsic_instr *, _store) {
nir_intrinsic_instr *store = *_store;
nir_instr_remove(&store->instr);
}
/* Quick sanity check */
assert(util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) == 0 ||
progress);
ralloc_free(mem_ctx);
return progress;
}
static void
cleanup_llvm17_scratch(nir_shader *nir)
{
{
bool progress;
do {
progress = false;
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_opt_algebraic);
} while (progress);
}
nir_remove_llvm17_scratch(nir);
{
bool progress;
do {
progress = false;
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_opt_algebraic);
} while (progress);
}
}
nir_shader *
brw_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size,
bool llvm17_wa)
{
struct spirv_to_nir_options spirv_options = {
.environment = NIR_SPIRV_OPENCL,
.caps = {
.address = true,
.groups = true,
.image_write_without_format = true,
.int8 = true,
.int16 = true,
.int64 = true,
.int64_atomics = true,
.kernel = true,
.linkage = true, /* We receive linked kernel from clc */
.float_controls = true,
.generic_pointers = true,
.storage_8bit = true,
.storage_16bit = true,
.subgroup_arithmetic = true,
.subgroup_basic = true,
.subgroup_ballot = true,
.subgroup_dispatch = true,
.subgroup_quad = true,
.subgroup_shuffle = true,
.subgroup_vote = true,
.intel_subgroup_shuffle = true,
.intel_subgroup_buffer_block_io = true,
},
.shared_addr_format = nir_address_format_62bit_generic,
.global_addr_format = nir_address_format_62bit_generic,
.temp_addr_format = nir_address_format_62bit_generic,
.constant_addr_format = nir_address_format_64bit_global,
.create_library = true,
};
assert(spirv_size % 4 == 0);
nir_shader *nir =
spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
"library", &spirv_options, &brw_scalar_nir_options);
nir_validate_shader(nir, "after spirv_to_nir");
nir_validate_ssa_dominance(nir, "after spirv_to_nir");
ralloc_steal(mem_ctx, nir);
nir->info.name = ralloc_strdup(nir, "library");
if (INTEL_DEBUG(DEBUG_CS)) {
/* Re-index SSA defs so we print more sensible numbers. */
nir_foreach_function_impl(impl, nir) {
nir_index_ssa_defs(impl);
}
fprintf(stderr, "NIR (from SPIR-V) for kernel\n");
nir_print_shader(nir, stderr);
}
NIR_PASS_V(nir, implement_intel_builtins);
NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader);
/* We have to lower away local constant initializers right before we
* inline functions. That way they get properly initialized at the top
* of the function and not at the top of its caller.
*/
NIR_PASS_V(nir, nir_lower_variable_initializers, ~(nir_var_shader_temp |
nir_var_function_temp));
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo |
nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL);
{
bool progress;
do
{
progress = false;
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
NIR_PASS(progress, nir, nir_opt_deref);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_undef);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
NIR_PASS(progress, nir, nir_opt_algebraic);
} while (progress);
}
NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
NIR_PASS_V(nir, nir_lower_returns);
NIR_PASS_V(nir, nir_inline_functions);
assert(nir->scratch_size == 0);
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_cl_type_size_align);
{
bool progress;
do
{
progress = false;
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
NIR_PASS(progress, nir, nir_opt_deref);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_undef);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_split_var_copies);
NIR_PASS(progress, nir, nir_lower_var_copies);
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
NIR_PASS(progress, nir, nir_opt_algebraic);
NIR_PASS(progress, nir, nir_opt_if, nir_opt_if_optimize_phi_true_false);
NIR_PASS(progress, nir, nir_opt_dead_cf);
NIR_PASS(progress, nir, nir_opt_remove_phis);
NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
NIR_PASS(progress, nir, nir_lower_vec3_to_vec4, nir_var_mem_generic | nir_var_uniform);
NIR_PASS(progress, nir, nir_opt_memcpy);
} while (progress);
}
NIR_PASS_V(nir, nir_scale_fdiv);
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo |
nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL);
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_mem_shared | nir_var_function_temp, NULL);
nir->scratch_size = 0;
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp |
nir_var_mem_global | nir_var_mem_constant,
glsl_get_cl_type_size_align);
// Lower memcpy - needs to wait until types are sized
{
bool progress;
do {
progress = false;
NIR_PASS(progress, nir, nir_opt_memcpy);
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
NIR_PASS(progress, nir, nir_opt_deref);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_split_var_copies);
NIR_PASS(progress, nir, nir_lower_var_copies);
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_cse);
} while (progress);
}
NIR_PASS_V(nir, nir_lower_memcpy);
NIR_PASS_V(nir, nir_lower_explicit_io,
nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp | nir_var_uniform,
nir_address_format_32bit_offset_as_64bit);
NIR_PASS_V(nir, nir_lower_system_values);
/* Hopefully we can drop this once lower_vars_to_ssa has improved to not
* lower everything to scratch.
*/
if (llvm17_wa)
cleanup_llvm17_scratch(nir);
/* Lower again, this time after dead-variables to get more compact variable
* layouts.
*/
nir->global_mem_size = 0;
nir->scratch_size = 0;
nir->info.shared_size = 0;
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant,
glsl_get_cl_type_size_align);
if (nir->constant_data_size > 0) {
assert(nir->constant_data == NULL);
nir->constant_data = rzalloc_size(nir, nir->constant_data_size);
nir_gather_explicit_io_initializers(nir, nir->constant_data,
nir->constant_data_size,
nir_var_mem_constant);
}
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant,
nir_address_format_64bit_global);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform,
nir_address_format_32bit_offset_as_64bit);
NIR_PASS_V(nir, nir_lower_explicit_io,
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_shared | nir_var_mem_global,
nir_address_format_62bit_generic);
if (INTEL_DEBUG(DEBUG_CS)) {
/* Re-index SSA defs so we print more sensible numbers. */
nir_foreach_function_impl(impl, nir) {
nir_index_ssa_defs(impl);
}
fprintf(stderr, "NIR (before I/O lowering) for kernel\n");
nir_print_shader(nir, stderr);
}
return nir;
}

View file

@ -0,0 +1,78 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_KERNEL_H
#define BRW_KERNEL_H
#include "brw_compiler.h"
struct disk_cache;
#ifdef __cplusplus
extern "C" {
#endif
/** Software interface for system values in kernels
*
* These are intended to go at the start of the kernel argument buffer.
*/
struct brw_kernel_sysvals {
uint32_t num_work_groups[3];
uint32_t pad[5];
};
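Because this block sits at offset 0 of the kernel argument buffer, argument offsets are biased by sizeof(struct brw_kernel_sysvals). A standalone sanity check of that layout, mirroring the struct above:
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
struct sysvals {
   uint32_t num_work_groups[3];
   uint32_t pad[5];
};
int main(void)
{
   /* 3 + 5 dwords = 32 bytes; kernel arguments start right after. */
   assert(sizeof(struct sysvals) == 32);
   assert(offsetof(struct sysvals, num_work_groups) == 0);
   unsigned arg0_offset_in_buffer = sizeof(struct sysvals) + 0;
   assert(arg0_offset_in_buffer == 32);
   return 0;
}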
struct brw_kernel_arg_desc {
uint16_t offset;
uint16_t size;
};
struct brw_kernel {
struct brw_cs_prog_data prog_data;
struct brw_compile_stats stats[3];
uint16_t args_size;
uint16_t arg_count;
const struct brw_kernel_arg_desc *args;
const void *code;
};
bool
brw_kernel_from_spirv(struct brw_compiler *compiler,
struct disk_cache *disk_cache,
struct brw_kernel *kernel,
void *log_data, void *mem_ctx,
const uint32_t *spirv, size_t spirv_size,
const char *entrypoint_name,
char **error_str);
nir_shader *
brw_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size,
bool llvm17_wa);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* BRW_KERNEL_H */

View file

@ -0,0 +1,465 @@
%option yylineno
%option nounput
%{
#include <string.h>
#include "brw_asm.h"
#undef ALIGN16
#include "brw_gram.tab.h"
/* Locations */
int yycolumn = 1;
int saved_state = 0;
extern char *input_filename;
#define YY_NO_INPUT
#define YY_USER_ACTION \
yylloc.first_line = yylloc.last_line = yylineno; \
yylloc.first_column = yycolumn; \
yylloc.last_column = yycolumn + yyleng - 1; \
yycolumn += yyleng;
%}
%x BLOCK_COMMENT
%x FILENAME
%x CHANNEL
%x REG
%x DOTSEL
%x LABEL
%x MSGDESC
%%
/* eat up single line comment */
\/\/.*[\r\n] { yycolumn = 1; }
/* eat up multiline comment */
\/\* { saved_state = YYSTATE; BEGIN(BLOCK_COMMENT); }
<BLOCK_COMMENT>\*\/ { BEGIN(saved_state); }
<BLOCK_COMMENT>. { }
<BLOCK_COMMENT>[\r\n] { }
<FILENAME>\"[^\"]+\" {
char *name = malloc(yyleng - 1);
memmove(name, yytext + 1, yyleng - 2);
name[yyleng - 2] = '\0'; /* buffer holds yyleng - 2 chars plus the terminator */
input_filename = name;
}
/* null register */
null { BEGIN(REG); return NULL_TOKEN; }
/* Opcodes */
add { yylval.integer = BRW_OPCODE_ADD; return ADD; }
add3 { yylval.integer = BRW_OPCODE_ADD3; return ADD3; }
addc { yylval.integer = BRW_OPCODE_ADDC; return ADDC; }
and { yylval.integer = BRW_OPCODE_AND; return AND; }
asr { yylval.integer = BRW_OPCODE_ASR; return ASR; }
avg { yylval.integer = BRW_OPCODE_AVG; return AVG; }
bfe { yylval.integer = BRW_OPCODE_BFE; return BFE; }
bfi1 { yylval.integer = BRW_OPCODE_BFI1; return BFI1; }
bfi2 { yylval.integer = BRW_OPCODE_BFI2; return BFI2; }
bfrev { yylval.integer = BRW_OPCODE_BFREV; return BFREV; }
brc { yylval.integer = BRW_OPCODE_BRC; return BRC; }
brd { yylval.integer = BRW_OPCODE_BRD; return BRD; }
break { yylval.integer = BRW_OPCODE_BREAK; return BREAK; }
call { yylval.integer = BRW_OPCODE_CALL; return CALL; }
calla { yylval.integer = BRW_OPCODE_CALLA; return CALLA; }
case { yylval.integer = BRW_OPCODE_CASE; return CASE; }
cbit { yylval.integer = BRW_OPCODE_CBIT; return CBIT; }
cmp { yylval.integer = BRW_OPCODE_CMP; return CMP; }
cmpn { yylval.integer = BRW_OPCODE_CMPN; return CMPN; }
cont { yylval.integer = BRW_OPCODE_CONTINUE; return CONT; }
csel { yylval.integer = BRW_OPCODE_CSEL; return CSEL; }
dim { yylval.integer = BRW_OPCODE_DIM; return DIM; }
do { yylval.integer = BRW_OPCODE_DO; return DO; }
dp2 { yylval.integer = BRW_OPCODE_DP2; return DP2; }
dp3 { yylval.integer = BRW_OPCODE_DP3; return DP3; }
dp4 { yylval.integer = BRW_OPCODE_DP4; return DP4; }
dp4a { yylval.integer = BRW_OPCODE_DP4A; return DP4A; }
dph { yylval.integer = BRW_OPCODE_DPH; return DPH; }
else { yylval.integer = BRW_OPCODE_ELSE; return ELSE; }
endif { yylval.integer = BRW_OPCODE_ENDIF; return ENDIF; }
f16to32 { yylval.integer = BRW_OPCODE_F16TO32; return F16TO32; }
f32to16 { yylval.integer = BRW_OPCODE_F32TO16; return F32TO16; }
fbh { yylval.integer = BRW_OPCODE_FBH; return FBH; }
fbl { yylval.integer = BRW_OPCODE_FBL; return FBL; }
fork { yylval.integer = BRW_OPCODE_FORK; return FORK; }
frc { yylval.integer = BRW_OPCODE_FRC; return FRC; }
goto { yylval.integer = BRW_OPCODE_GOTO; return GOTO; }
halt { yylval.integer = BRW_OPCODE_HALT; return HALT; }
if { yylval.integer = BRW_OPCODE_IF; return IF; }
iff { yylval.integer = BRW_OPCODE_IFF; return IFF; }
illegal { yylval.integer = BRW_OPCODE_ILLEGAL; return ILLEGAL; }
jmpi { yylval.integer = BRW_OPCODE_JMPI; return JMPI; }
line { yylval.integer = BRW_OPCODE_LINE; return LINE; }
lrp { yylval.integer = BRW_OPCODE_LRP; return LRP; }
lzd { yylval.integer = BRW_OPCODE_LZD; return LZD; }
mac { yylval.integer = BRW_OPCODE_MAC; return MAC; }
mach { yylval.integer = BRW_OPCODE_MACH; return MACH; }
mad { yylval.integer = BRW_OPCODE_MAD; return MAD; }
madm { yylval.integer = BRW_OPCODE_MADM; return MADM; }
mov { yylval.integer = BRW_OPCODE_MOV; return MOV; }
movi { yylval.integer = BRW_OPCODE_MOVI; return MOVI; }
mul { yylval.integer = BRW_OPCODE_MUL; return MUL; }
mrest { yylval.integer = BRW_OPCODE_MREST; return MREST; }
msave { yylval.integer = BRW_OPCODE_MSAVE; return MSAVE; }
nenop { yylval.integer = BRW_OPCODE_NENOP; return NENOP; }
nop { yylval.integer = BRW_OPCODE_NOP; return NOP; }
not { yylval.integer = BRW_OPCODE_NOT; return NOT; }
or { yylval.integer = BRW_OPCODE_OR; return OR; }
pln { yylval.integer = BRW_OPCODE_PLN; return PLN; }
pop { yylval.integer = BRW_OPCODE_POP; return POP; }
push { yylval.integer = BRW_OPCODE_PUSH; return PUSH; }
ret { yylval.integer = BRW_OPCODE_RET; return RET; }
rndd { yylval.integer = BRW_OPCODE_RNDD; return RNDD; }
rnde { yylval.integer = BRW_OPCODE_RNDE; return RNDE; }
rndu { yylval.integer = BRW_OPCODE_RNDU; return RNDU; }
rndz { yylval.integer = BRW_OPCODE_RNDZ; return RNDZ; }
rol { yylval.integer = BRW_OPCODE_ROL; return ROL; }
ror { yylval.integer = BRW_OPCODE_ROR; return ROR; }
sad2 { yylval.integer = BRW_OPCODE_SAD2; return SAD2; }
sada2 { yylval.integer = BRW_OPCODE_SADA2; return SADA2; }
sel { yylval.integer = BRW_OPCODE_SEL; return SEL; }
send {
yylval.integer = BRW_OPCODE_SEND;
return p->devinfo->ver < 12 ? SEND_GFX4 : SEND_GFX12;
}
sendc {
yylval.integer = BRW_OPCODE_SENDC;
return p->devinfo->ver < 12 ? SENDC_GFX4 : SENDC_GFX12;
}
sends { yylval.integer = BRW_OPCODE_SENDS; return SENDS; }
sendsc { yylval.integer = BRW_OPCODE_SENDSC; return SENDSC; }
shl { yylval.integer = BRW_OPCODE_SHL; return SHL; }
shr { yylval.integer = BRW_OPCODE_SHR; return SHR; }
smov { yylval.integer = BRW_OPCODE_SMOV; return SMOV; }
subb { yylval.integer = BRW_OPCODE_SUBB; return SUBB; }
wait { yylval.integer = BRW_OPCODE_WAIT; return WAIT; }
while { yylval.integer = BRW_OPCODE_WHILE; return WHILE; }
xor { yylval.integer = BRW_OPCODE_XOR; return XOR; }
sync { yylval.integer = BRW_OPCODE_SYNC; return SYNC; }
/* extended math functions */
cos { yylval.integer = BRW_MATH_FUNCTION_COS; return COS; }
exp { yylval.integer = BRW_MATH_FUNCTION_EXP; return EXP; }
fdiv { yylval.integer = BRW_MATH_FUNCTION_FDIV; return FDIV; }
inv { yylval.integer = BRW_MATH_FUNCTION_INV; return INV; }
invm { yylval.integer = GFX8_MATH_FUNCTION_INVM; return INVM; }
intdiv {
yylval.integer = BRW_MATH_FUNCTION_INT_DIV_QUOTIENT;
return INTDIV;
}
intdivmod {
yylval.integer =
BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER;
return INTDIVMOD;
}
intmod {
yylval.integer = BRW_MATH_FUNCTION_INT_DIV_REMAINDER;
return INTMOD;
}
log { yylval.integer = BRW_MATH_FUNCTION_LOG; return LOG; }
pow { yylval.integer = BRW_MATH_FUNCTION_POW; return POW; }
rsq { yylval.integer = BRW_MATH_FUNCTION_RSQ; return RSQ; }
rsqrtm { yylval.integer = GFX8_MATH_FUNCTION_RSQRTM; return RSQRTM; }
sin { yylval.integer = BRW_MATH_FUNCTION_SIN; return SIN; }
sqrt { yylval.integer = BRW_MATH_FUNCTION_SQRT; return SQRT; }
sincos { yylval.integer = BRW_MATH_FUNCTION_SINCOS; return SINCOS; }
/* sync instruction */
allrd { yylval.integer = TGL_SYNC_ALLRD; return ALLRD; }
allwr { yylval.integer = TGL_SYNC_ALLWR; return ALLWR; }
fence { yylval.integer = TGL_SYNC_FENCE; return FENCE; }
bar { yylval.integer = TGL_SYNC_BAR; return BAR; }
host { yylval.integer = TGL_SYNC_HOST; return HOST; }
/* shared functions for send instruction */
sampler { return SAMPLER; }
dp_sampler { return DP_SAMPLER; }
gateway { return GATEWAY; }
urb { return URB; }
thread_spawner { return THREAD_SPAWNER; }
render { return RENDER; }
const { return CONST; }
data { return DATA; }
cre { return CRE; }
math { return MATH; }
read { return READ; }
write { return WRITE; }
vme { return VME; }
"pixel interp" { return PIXEL_INTERP; }
"dp data 1" { return DP_DATA_1; }
"rt accel" { return RT_ACCEL; }
slm { return SLM; }
tgm { return TGM; }
ugm { return UGM; }
";" { return SEMICOLON; }
":" { return COLON; }
"(" { return LPAREN; }
")" { return RPAREN; }
"{" { return LCURLY; }
"}" { return RCURLY; }
"[" { return LSQUARE; }
"]" { return RSQUARE; }
"<" { return LANGLE; }
">" { return RANGLE; }
"," { return COMMA; }
"." { return DOT; }
"+" { return PLUS; }
"-" { return MINUS; }
"~" { return MINUS; }
"(abs)" { return ABS; }
"VxH" { return VxH; }
<REG>"<" { return LANGLE; }
<REG>[0-9][0-9]* {
yylval.integer = strtoul(yytext, NULL, 10);
return INTEGER;
}
<REG>">" { return RANGLE; }
<REG>"," { return COMMA; }
<REG>"." { BEGIN(DOTSEL); return DOT; }
<REG>";" { return SEMICOLON; }
<DOTSEL>"x" { yylval.integer = BRW_CHANNEL_X; return X; }
<DOTSEL>"y" { yylval.integer = BRW_CHANNEL_Y; return Y; }
<DOTSEL>"z" { yylval.integer = BRW_CHANNEL_Z; return Z; }
<DOTSEL>"w" { yylval.integer = BRW_CHANNEL_W; return W; }
<DOTSEL>[0-9][0-9]* {
yylval.integer = strtoul(yytext, NULL, 10);
BEGIN(REG);
return INTEGER;
}
<DOTSEL>. { yyless(0); BEGIN(INITIAL); }
<REG>. { yyless(0); BEGIN(INITIAL); }
/* Access mode */
"align1" { return ALIGN1; }
"align16" { return ALIGN16; }
/* Accumulator write control */
AccWrEnable { return ACCWREN; }
/* Mask control (formerly WECtrl/Write Enable Control) */
"WE_all" { return WECTRL; }
/* Compaction control */
compacted { return CMPTCTRL; }
/* Debug control */
breakpoint { return BREAKPOINT; }
/* Dependency control */
NoDDClr { return NODDCLR; }
NoDDChk { return NODDCHK; }
/* End of thread */
EOT { return EOT; }
/* Mask control */
nomask { return MASK_DISABLE; }
/* Channel */
<CHANNEL>"x" { yylval.integer = BRW_CHANNEL_X; return X; }
<CHANNEL>"y" { yylval.integer = BRW_CHANNEL_Y; return Y; }
<CHANNEL>"z" { yylval.integer = BRW_CHANNEL_Z; return Z; }
<CHANNEL>"w" { yylval.integer = BRW_CHANNEL_W; return W; }
<CHANNEL>[0-9][0-9]* {
yylval.integer = strtoul(yytext, NULL, 10);
return INTEGER;
}
<CHANNEL>"." { return DOT; }
<CHANNEL>. { yyless(0); BEGIN(INITIAL); }
/* Predicate Control */
<CHANNEL>".anyv" { yylval.integer = BRW_PREDICATE_ALIGN1_ANYV; return ANYV; }
<CHANNEL>".allv" { yylval.integer = BRW_PREDICATE_ALIGN1_ALLV; return ALLV; }
<CHANNEL>".any2h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY2H; return ANY2H; }
<CHANNEL>".all2h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL2H; return ALL2H; }
<CHANNEL>".any4h" { yylval.integer = BRW_PREDICATE_ALIGN16_ANY4H; return ANY4H; }
<CHANNEL>".all4h" { yylval.integer = BRW_PREDICATE_ALIGN16_ALL4H; return ALL4H; }
<CHANNEL>".any8h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY8H; return ANY8H; }
<CHANNEL>".all8h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL8H; return ALL8H; }
<CHANNEL>".any16h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY16H; return ANY16H; }
<CHANNEL>".all16h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL16H; return ALL16H; }
<CHANNEL>".any32h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY32H; return ANY32H; }
<CHANNEL>".all32h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL32H; return ALL32H; }
/* Saturation */
".sat" { return SATURATE; }
/* Thread control */
atomic { return ATOMIC; }
switch { return SWITCH; }
/* compression control */
compr { return COMPR; }
compr4 { return COMPR4; }
sechalf { return SECHALF; }
/* Quarter Control */
1[HNQ] { }
"2Q" { return QTR_2Q; }
"3Q" { return QTR_3Q; }
"4Q" { return QTR_4Q; }
"2H" { return QTR_2H; }
"2N" { return QTR_2N; }
"3N" { return QTR_3N; }
"4N" { return QTR_4N; }
"5N" { return QTR_5N; }
"6N" { return QTR_6N; }
"7N" { return QTR_7N; }
"8N" { return QTR_8N; }
/* data types */
:?B { return TYPE_B; }
:?D { return TYPE_D; }
:?DF { return TYPE_DF; }
:?F { return TYPE_F; }
:?HF { return TYPE_HF; }
:?NF { return TYPE_NF; }
:?Q { return TYPE_Q; }
:?UB { return TYPE_UB; }
:?UD { return TYPE_UD; }
:?UW { return TYPE_UW; }
:?UQ { return TYPE_UQ; }
:?UV { return TYPE_UV; }
:?V { return TYPE_V; }
:?VF { return TYPE_VF; }
:?W { return TYPE_W; }
/* Address registers */
"a0" { return ADDRREG; }
/* accumulator registers */
"acc"[0-9]+ { yylval.integer = atoi(yytext + 3); return ACCREG; }
/* channel enable registers */
"ce0" { return CHANNELENABLEREG; }
/* control registers */
"cr0" { return CONTROLREG; }
/* flag registers */
"f"[0|1] { BEGIN(CHANNEL); yylval.integer = atoi(yytext + 1); return FLAGREG; }
/* message control registers */
"m" { return MSGREGFILE; }
m[0-9]+ { yylval.integer = atoi(yytext + 1); BEGIN(REG); return MSGREG; }
/* state register */
sr[0-9]+ { yylval.integer = atoi(yytext + 2); return STATEREG; }
/* notification registers */
"n0" { BEGIN(REG); return NOTIFYREG; }
/* IP register */
"ip" { return IPREG; }
/* Thread control register */
"tdr0" { return THREADREG; }
/* performance register */
"tm0" { BEGIN(REG); return PERFORMANCEREG; }
[gr][0-9]+ {
yylval.integer = atoi(yytext + 1);
BEGIN(REG); return GENREG;
}
[gr] { return GENREGFILE; }
"mask"[0-9]+ { yylval.integer = atoi(yytext + 4); return MASKREG; }
/* Conditional modifiers */
".e" { yylval.integer = BRW_CONDITIONAL_Z; return EQUAL; }
".g" { yylval.integer = BRW_CONDITIONAL_G; return GREATER; }
".ge" { yylval.integer = BRW_CONDITIONAL_GE; return GREATER_EQUAL; }
".l" { yylval.integer = BRW_CONDITIONAL_L; return LESS; }
".le" { yylval.integer = BRW_CONDITIONAL_LE; return LESS_EQUAL; }
".ne" { yylval.integer = BRW_CONDITIONAL_NZ; return NOT_EQUAL; }
".nz" { yylval.integer = BRW_CONDITIONAL_NZ; return NOT_ZERO; }
".o" { yylval.integer = BRW_CONDITIONAL_O; return OVERFLOW; }
".r" { yylval.integer = BRW_CONDITIONAL_R; return ROUND_INCREMENT; }
".u" { yylval.integer = BRW_CONDITIONAL_U; return UNORDERED; }
".z" { yylval.integer = BRW_CONDITIONAL_Z; return ZERO; }
/* Eat up JIP and UIP token, their values will be parsed
* in numeric section
*/
"JIP: " { BEGIN(LABEL); }
"UIP: " { BEGIN(LABEL); }
"Jump: " { }
"Pop: " { }
[ \t]+ { }
"MsgDesc: " { BEGIN(MSGDESC); return MSGDESC_BEGIN; }
<MSGDESC>ex_bso { return EX_BSO; }
<MSGDESC>src1_len { return SRC1_LEN; }
<MSGDESC>"=" { return ASSIGN; }
<MSGDESC>[0-9][0-9]* {
yylval.integer = strtoul(yytext, NULL, 10);
return INTEGER;
}
<MSGDESC>"{" { yyless(0); BEGIN(INITIAL); return MSGDESC_END; }
<MSGDESC>. { }
"0x"[0-9a-f][0-9a-f]* {
yylval.llint = strtoull(yytext + 2, NULL, 16);
return LONG;
}
[0-9][0-9]* {
yylval.llint = strtoll(yytext, NULL, 10);
return LONG;
}
/* jump label target */
[a-zA-Z_][0-9a-zA-Z_]*":" {
yylval.string = ralloc_strdup(p->mem_ctx, yytext);
/* Stomp the trailing ':' */
yylval.string[yyleng - 1] = '\0';
return JUMP_LABEL_TARGET;
}
/* jump label */
<LABEL>[a-zA-Z_][0-9a-zA-Z_]* {
yylval.string = ralloc_strdup(p->mem_ctx, yytext);
BEGIN(INITIAL);
return JUMP_LABEL;
}
/* SWSB */
"@"[1-7] { yylval.integer = atoi(yytext + 1); return REG_DIST_CURRENT; }
"F@"[1-7] { yylval.integer = atoi(yytext + 2); return REG_DIST_FLOAT; }
"I@"[1-7] { yylval.integer = atoi(yytext + 2); return REG_DIST_INT; }
"L@"[1-7] { yylval.integer = atoi(yytext + 2); return REG_DIST_LONG; }
"A@"[1-7] { yylval.integer = atoi(yytext + 2); return REG_DIST_ALL; }
"$"[0-9]* { yylval.integer = atoi(yytext + 1); return SBID_ALLOC; }
"$"[0-9]*".src" { yylval.integer = atoi(yytext + 1); return SBID_WAIT_SRC; }
"$"[0-9]*".dst" { yylval.integer = atoi(yytext + 1); return SBID_WAIT_DST; }
\n { yycolumn = 1; }
. {
fprintf(stderr, "%s: %d: %s: at \"%s\"\n",
input_filename, yylineno,
"unexpected token", lex_text());
}
%%
char *
lex_text(void)
{
return yytext;
}
#ifndef yywrap
int yywrap()
{
return -1;
}
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -0,0 +1,298 @@
/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_NIR_H
#define BRW_NIR_H
#include "brw_reg.h"
#include "compiler/nir/nir.h"
#include "brw_compiler.h"
#include "nir_builder.h"
#ifdef __cplusplus
extern "C" {
#endif
extern const struct nir_shader_compiler_options brw_scalar_nir_options;
extern const struct nir_shader_compiler_options brw_vector_nir_options;
int type_size_vec4(const struct glsl_type *type, bool bindless);
int type_size_dvec4(const struct glsl_type *type, bool bindless);
static inline int
type_size_scalar_bytes(const struct glsl_type *type, bool bindless)
{
return glsl_count_dword_slots(type, bindless) * 4;
}
static inline int
type_size_vec4_bytes(const struct glsl_type *type, bool bindless)
{
return type_size_vec4(type, bindless) * 16;
}
/* Flags set in the instr->pass_flags field by i965 analysis passes */
enum {
BRW_NIR_NON_BOOLEAN = 0x0,
/* Indicates that the given instruction's destination is a boolean
* value but that it needs to be resolved before it can be used.
* On Gen <= 5, CMP instructions return a 32-bit value where the bottom
* bit represents the actual true/false value of the compare and the top
* 31 bits are undefined. In order to use this value, we have to do a
* "resolve" operation by replacing the value of the CMP with -(x & 1)
* to sign-extend the bottom bit to 0/~0.
*/
BRW_NIR_BOOLEAN_NEEDS_RESOLVE = 0x1,
/* Indicates that the given instruction's destination is a boolean
* value that has intentionally been left unresolved. Not all boolean
* values need to be resolved immediately. For instance, if we have
*
* CMP r1 r2 r3
* CMP r4 r5 r6
* AND r7 r1 r4
*
* We don't have to resolve the result of the two CMP instructions
* immediately because the AND still does an AND of the bottom bits.
* Instead, we can save ourselves instructions by delaying the resolve
* until after the AND. The result of the two CMP instructions is left
* as BRW_NIR_BOOLEAN_UNRESOLVED.
*/
BRW_NIR_BOOLEAN_UNRESOLVED = 0x2,
* Indicates that the given instruction's destination is a boolean
* value that does not need a resolve. For instance, if you AND two
* values that are BRW_NIR_BOOLEAN_NEEDS_RESOLVE then we know that both
* values will be 0/~0 before we get them and the result of the AND is
* also guaranteed to be 0/~0 and does not need a resolve.
*/
BRW_NIR_BOOLEAN_NO_RESOLVE = 0x3,
/* A mask to mask the boolean status values off of instr->pass_flags */
BRW_NIR_BOOLEAN_MASK = 0x3,
};
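The resolve operation named above, -(x & 1), keeps only the defined bottom bit and sign-extends it into a canonical 0/~0 boolean, regardless of the undefined upper 31 bits. A standalone check of the identity:
#include <assert.h>
#include <stdint.h>
static int32_t resolve(int32_t x)
{
   return -(x & 1);  /* keep bit 0, sign-extend it across the dword */
}
int main(void)
{
   /* The upper 31 bits are undefined after CMP; resolve ignores them. */
   assert(resolve(0x7ffffffe) == 0);                       /* bit 0 clear -> false */
   assert(resolve(0x00000001) == -1);                      /* bit 0 set -> ~0 */
   assert((uint32_t)resolve((int32_t)0xdeadbeefu) == 0xffffffffu);
   return 0;
}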
void brw_nir_analyze_boolean_resolves(nir_shader *nir);
struct brw_nir_compiler_opts {
/* Soft floating point implementation shader */
const nir_shader *softfp64;
/* Whether robust image access is enabled */
bool robust_image_access;
/* Input vertices for TCS stage (0 means dynamic) */
unsigned input_vertices;
};
/* UBO surface index can come in 2 flavors :
* - nir_intrinsic_resource_intel
* - anything else
*
* In the first case, checking that the surface index is const requires
* checking resource_intel::src[1]. In any other case it's a simple
* nir_src_is_const().
*
* This function should only be called on src[0] of load_ubo intrinsics.
*/
static inline bool
brw_nir_ubo_surface_index_is_pushable(nir_src src)
{
nir_intrinsic_instr *intrin =
src.ssa->parent_instr->type == nir_instr_type_intrinsic ?
nir_instr_as_intrinsic(src.ssa->parent_instr) : NULL;
if (intrin && intrin->intrinsic == nir_intrinsic_resource_intel) {
return (nir_intrinsic_resource_access_intel(intrin) &
nir_resource_intel_pushable);
}
return nir_src_is_const(src);
}
static inline unsigned
brw_nir_ubo_surface_index_get_push_block(nir_src src)
{
if (nir_src_is_const(src))
return nir_src_as_uint(src);
if (!brw_nir_ubo_surface_index_is_pushable(src))
return UINT32_MAX;
assert(src.ssa->parent_instr->type == nir_instr_type_intrinsic);
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src.ssa->parent_instr);
assert(intrin->intrinsic == nir_intrinsic_resource_intel);
return nir_intrinsic_resource_block_intel(intrin);
}
/* This helper returns the binding table index of a surface access (any
* buffer/image/etc...). It works off the source of one of the intrinsics
* (load_ubo, load_ssbo, store_ssbo, load_image, store_image, etc...).
*
* If the source is constant, then this is the binding table index. If we're
* going through a resource_intel intrinsic, then we need to check
* src[1] of that intrinsic.
*/
static inline unsigned
brw_nir_ubo_surface_index_get_bti(nir_src src)
{
if (nir_src_is_const(src))
return nir_src_as_uint(src);
assert(src.ssa->parent_instr->type == nir_instr_type_intrinsic);
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src.ssa->parent_instr);
if (!intrin || intrin->intrinsic != nir_intrinsic_resource_intel)
return UINT32_MAX;
/* In practice we could even drop this intrinsic because bindless
* accesses always operate from a base offset coming from a push constant, so
* they can never be constant.
*/
if (nir_intrinsic_resource_access_intel(intrin) &
nir_resource_intel_bindless)
return UINT32_MAX;
if (!nir_src_is_const(intrin->src[1]))
return UINT32_MAX;
return nir_src_as_uint(intrin->src[1]);
}
void brw_preprocess_nir(const struct brw_compiler *compiler,
nir_shader *nir,
const struct brw_nir_compiler_opts *opts);
void
brw_nir_link_shaders(const struct brw_compiler *compiler,
nir_shader *producer, nir_shader *consumer);
bool brw_nir_lower_cs_intrinsics(nir_shader *nir,
const struct intel_device_info *devinfo,
struct brw_cs_prog_data *prog_data);
bool brw_nir_lower_alpha_to_coverage(nir_shader *shader,
const struct brw_wm_prog_key *key,
const struct brw_wm_prog_data *prog_data);
void brw_nir_lower_vs_inputs(nir_shader *nir,
bool edgeflag_is_last,
const uint8_t *vs_attrib_wa_flags);
void brw_nir_lower_vue_inputs(nir_shader *nir,
const struct intel_vue_map *vue_map);
void brw_nir_lower_tes_inputs(nir_shader *nir, const struct intel_vue_map *vue);
void brw_nir_lower_fs_inputs(nir_shader *nir,
const struct intel_device_info *devinfo,
const struct brw_wm_prog_key *key);
void brw_nir_lower_vue_outputs(nir_shader *nir);
void brw_nir_lower_tcs_outputs(nir_shader *nir, const struct intel_vue_map *vue,
enum tess_primitive_mode tes_primitive_mode);
void brw_nir_lower_fs_outputs(nir_shader *nir);
bool brw_nir_lower_cmat(nir_shader *nir, unsigned subgroup_size);
bool brw_nir_lower_shading_rate_output(nir_shader *nir);
bool brw_nir_lower_sparse_intrinsics(nir_shader *nir);
struct brw_nir_lower_storage_image_opts {
const struct intel_device_info *devinfo;
bool lower_loads;
bool lower_stores;
bool lower_atomics;
bool lower_get_size;
};
bool brw_nir_lower_storage_image(nir_shader *nir,
const struct brw_nir_lower_storage_image_opts *opts);
bool brw_nir_lower_mem_access_bit_sizes(nir_shader *shader,
const struct intel_device_info *devinfo);
void brw_postprocess_nir(nir_shader *nir,
const struct brw_compiler *compiler,
bool debug_enabled,
enum brw_robustness_flags robust_flags);
bool brw_nir_apply_attribute_workarounds(nir_shader *nir,
const uint8_t *attrib_wa_flags);
bool brw_nir_apply_trig_workarounds(nir_shader *nir);
bool brw_nir_limit_trig_input_range_workaround(nir_shader *nir);
void brw_nir_apply_key(nir_shader *nir,
const struct brw_compiler *compiler,
const struct brw_base_prog_key *key,
unsigned max_subgroup_size);
unsigned brw_nir_api_subgroup_size(const nir_shader *nir,
unsigned hw_subgroup_size);
enum brw_conditional_mod brw_cmod_for_nir_comparison(nir_op op);
enum lsc_opcode lsc_aop_for_nir_intrinsic(const nir_intrinsic_instr *atomic);
enum brw_reg_type brw_type_for_nir_type(const struct intel_device_info *devinfo,
nir_alu_type type);
bool brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
unsigned bit_size,
unsigned num_components,
nir_intrinsic_instr *low,
nir_intrinsic_instr *high,
void *data);
void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
nir_shader *nir,
struct brw_ubo_range out_ranges[4]);
void brw_nir_optimize(nir_shader *nir, bool is_scalar,
const struct intel_device_info *devinfo);
nir_shader *brw_nir_create_passthrough_tcs(void *mem_ctx,
const struct brw_compiler *compiler,
const struct brw_tcs_prog_key *key);
#define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0
#define BRW_NIR_FRAG_OUTPUT_INDEX_MASK INTEL_MASK(0, 0)
#define BRW_NIR_FRAG_OUTPUT_LOCATION_SHIFT 1
#define BRW_NIR_FRAG_OUTPUT_LOCATION_MASK INTEL_MASK(31, 1)
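/* Added note: these fields pack a fragment output's store into a single
 * driver_location value, with the output index (e.g. for dual-source
 * blending) in bit 0 and the FRAG_RESULT location in bits 31:1; see the
 * GET_FIELD() decode in brw_nir_lower_alpha_to_coverage().
 */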
bool brw_nir_move_interpolation_to_top(nir_shader *nir);
nir_def *brw_nir_load_global_const(nir_builder *b,
nir_intrinsic_instr *load_uniform,
nir_def *base_addr,
unsigned off);
const struct glsl_type *brw_nir_get_var_type(const struct nir_shader *nir,
nir_variable *var);
void brw_nir_adjust_payload(nir_shader *shader);
#ifdef __cplusplus
}
#endif
#endif /* BRW_NIR_H */

View file

@ -0,0 +1,258 @@
/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir.h"
/*
* This file implements an analysis pass that determines when we have to do
* a boolean resolve on Gen <= 5. Instructions that need a boolean resolve
* will have the booleans portion of the instr->pass_flags field set to
* BRW_NIR_BOOLEAN_NEEDS_RESOLVE.
*/
/** Returns the resolve status for the given source
*
* If the source has a parent instruction then the resolve status is the
* status of the parent instruction. If the source does not have a parent
* instruction then we don't know so we return NON_BOOLEAN.
*/
static uint8_t
get_resolve_status_for_src(nir_src *src)
{
nir_instr *src_instr = src->ssa->parent_instr;
uint8_t resolve_status = src_instr->pass_flags & BRW_NIR_BOOLEAN_MASK;
/* If the source instruction needs resolve, then from the perspective
* of the user, it's a true boolean.
*/
if (resolve_status == BRW_NIR_BOOLEAN_NEEDS_RESOLVE)
resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
return resolve_status;
}
/** Marks the given source as needing a resolve
*
* If the given source corresponds to an unresolved boolean it marks it as
* needing a resolve. Otherwise, we leave it alone.
*/
static bool
src_mark_needs_resolve(nir_src *src, void *void_state)
{
nir_instr *src_instr = src->ssa->parent_instr;
uint8_t resolve_status = src_instr->pass_flags & BRW_NIR_BOOLEAN_MASK;
/* If the source instruction is unresolved, then mark it as needing
* to be resolved.
*/
if (resolve_status == BRW_NIR_BOOLEAN_UNRESOLVED) {
src_instr->pass_flags &= ~BRW_NIR_BOOLEAN_MASK;
src_instr->pass_flags |= BRW_NIR_BOOLEAN_NEEDS_RESOLVE;
}
return true;
}
static bool
analyze_boolean_resolves_block(nir_block *block)
{
nir_foreach_instr(instr, block) {
switch (instr->type) {
case nir_instr_type_alu: {
/* For ALU instructions, the resolve status is handled in a
* three-step process.
*
* 1) Look at the instruction type and sources and determine if it
* can be left unresolved.
*
* 2) Look at the destination and see if we have to resolve
* anyway. (This is the case if this instruction is not the
* only instruction writing to a given register.)
*
* 3) If the instruction has a resolve status other than
* BOOL_UNRESOLVED or BOOL_NEEDS_RESOLVE then we walk through
* the sources and ensure that they are also resolved. This
* ensures that we don't end up with any stray unresolved
* booleans going into ADDs or something like that.
*/
uint8_t resolve_status;
nir_alu_instr *alu = nir_instr_as_alu(instr);
switch (alu->op) {
case nir_op_b32all_fequal2:
case nir_op_b32all_iequal2:
case nir_op_b32all_fequal3:
case nir_op_b32all_iequal3:
case nir_op_b32all_fequal4:
case nir_op_b32all_iequal4:
case nir_op_b32any_fnequal2:
case nir_op_b32any_inequal2:
case nir_op_b32any_fnequal3:
case nir_op_b32any_inequal3:
case nir_op_b32any_fnequal4:
case nir_op_b32any_inequal4:
/* These are only implemented by the vec4 backend and its
* implementation emits resolved booleans. At some point in the
* future, this may change and we'll have to remove some of the
* above cases.
*/
resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
break;
case nir_op_mov:
case nir_op_inot:
/* This is a single-source instruction. Just copy the resolve
* status from the source.
*/
resolve_status = get_resolve_status_for_src(&alu->src[0].src);
break;
case nir_op_b32csel:
case nir_op_iand:
case nir_op_ior:
case nir_op_ixor: {
const unsigned first = alu->op == nir_op_b32csel ? 1 : 0;
uint8_t src0_status = get_resolve_status_for_src(&alu->src[first + 0].src);
uint8_t src1_status = get_resolve_status_for_src(&alu->src[first + 1].src);
/* src0 of a bcsel is evaluated as a Boolean with the expectation
* that it has already been resolved. Mark it as such.
*/
if (alu->op == nir_op_b32csel)
src_mark_needs_resolve(&alu->src[0].src, NULL);
if (src0_status == src1_status) {
resolve_status = src0_status;
} else if (src0_status == BRW_NIR_NON_BOOLEAN ||
src1_status == BRW_NIR_NON_BOOLEAN) {
/* If one of the sources is a non-boolean then the whole
* thing is a non-boolean.
*/
resolve_status = BRW_NIR_NON_BOOLEAN;
} else {
/* At this point one of them is a true boolean and one is a
* boolean that needs a resolve. We could either resolve the
* unresolved source or we could resolve here. If we resolve
* the unresolved source then we get two resolves for the price
* of one. Just set this one to BOOLEAN_NO_RESOLVE and we'll
* let the code below force a resolve on the unresolved source.
*/
resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
}
break;
}
default:
if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_bool) {
/* These instructions will turn into a CMP when we actually emit
* them, so the result will have to be resolved before it can be
* used.
*/
resolve_status = BRW_NIR_BOOLEAN_UNRESOLVED;
/* Even though the destination is allowed to be left
* unresolved, the sources are treated as regular integers or
* floats so they need to be resolved.
*/
nir_foreach_src(instr, src_mark_needs_resolve, NULL);
} else {
resolve_status = BRW_NIR_NON_BOOLEAN;
}
}
/* Record the resolve status; unresolved booleans are allowed here. */
instr->pass_flags = (instr->pass_flags & ~BRW_NIR_BOOLEAN_MASK) |
resolve_status;
/* Finally, resolve sources if it's needed */
switch (resolve_status) {
case BRW_NIR_BOOLEAN_NEEDS_RESOLVE:
case BRW_NIR_BOOLEAN_UNRESOLVED:
/* This instruction is either unresolved or we're doing the
* resolve here; leave the sources alone.
*/
break;
case BRW_NIR_BOOLEAN_NO_RESOLVE:
case BRW_NIR_NON_BOOLEAN:
nir_foreach_src(instr, src_mark_needs_resolve, NULL);
break;
default:
unreachable("Invalid boolean flag");
}
break;
}
case nir_instr_type_load_const: {
nir_load_const_instr *load = nir_instr_as_load_const(instr);
/* For load_const instructions, it's a boolean exactly when it holds
* one of the values NIR_TRUE or NIR_FALSE.
*
* Since load_const instructions don't have any sources, we don't
* have to worry about resolving them.
*/
instr->pass_flags &= ~BRW_NIR_BOOLEAN_MASK;
if (load->value[0].u32 == NIR_TRUE || load->value[0].u32 == NIR_FALSE) {
instr->pass_flags |= BRW_NIR_BOOLEAN_NO_RESOLVE;
} else {
instr->pass_flags |= BRW_NIR_NON_BOOLEAN;
}
continue;
}
default:
/* Everything else is an unknown non-boolean value and needs to
* have all sources resolved.
*/
instr->pass_flags = (instr->pass_flags & ~BRW_NIR_BOOLEAN_MASK) |
BRW_NIR_NON_BOOLEAN;
nir_foreach_src(instr, src_mark_needs_resolve, NULL);
continue;
}
}
nir_if *following_if = nir_block_get_following_if(block);
if (following_if)
src_mark_needs_resolve(&following_if->condition, NULL);
return true;
}
static void
analyze_boolean_resolves_impl(nir_function_impl *impl)
{
nir_foreach_block(block, impl) {
analyze_boolean_resolves_block(block);
}
}
void
brw_nir_analyze_boolean_resolves(nir_shader *shader)
{
nir_foreach_function_impl(impl, shader) {
analyze_boolean_resolves_impl(impl);
}
}

View file

@ -0,0 +1,317 @@
/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir.h"
#include "compiler/nir/nir.h"
#include "util/u_dynarray.h"
/**
* \file brw_nir_analyze_ubo_ranges.c
*
* This pass decides which portions of UBOs to upload as push constants,
* so shaders can access them as part of the thread payload, rather than
* having to issue expensive memory reads to pull the data.
*
* The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different
* buffers, in GRF (256-bit/32-byte) units.
*
* To do this, we examine NIR load_ubo intrinsics, recording the number of
* loads at each offset. We track offsets at a 32-byte granularity, so even
* fields with a bit of padding between them tend to fall into contiguous
* ranges. We build a list of these ranges, tracking their "cost" (number
* of registers required) and "benefit" (number of pull loads eliminated
* by pushing the range). We then sort the list to obtain the four best
* ranges (most benefit for the least cost).
*/
struct ubo_range_entry
{
struct brw_ubo_range range;
int benefit;
};
static int
score(const struct ubo_range_entry *entry)
{
return 2 * entry->benefit - entry->range.length;
}
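/* Added example (illustration only): a range covering 2 GRFs whose data is
 * pulled 5 times scores 2 * 5 - 2 = 8, while a 1 GRF range pulled twice
 * scores 2 * 2 - 1 = 3, so the sort below prefers the former.
 */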
/**
* Compares score for two UBO range entries.
*
* For a descending qsort().
*/
static int
cmp_ubo_range_entry(const void *va, const void *vb)
{
const struct ubo_range_entry *a = va;
const struct ubo_range_entry *b = vb;
/* Rank based on scores, descending order */
int delta = score(b) - score(a);
/* Then use the UBO block index as a tie-breaker, descending order */
if (delta == 0)
delta = b->range.block - a->range.block;
/* Finally use the start offset as a second tie-breaker, ascending order */
if (delta == 0)
delta = a->range.start - b->range.start;
return delta;
}
struct ubo_block_info
{
/* Each bit in the offsets bitfield represents a 32-byte section of data.
* If it's set to one, there is interesting UBO data at that offset. If
* not, there's a "hole" - padding between data - or just nothing at all.
*/
uint64_t offsets;
uint8_t uses[64];
};
struct ubo_analysis_state
{
struct hash_table *blocks;
bool uses_regular_uniforms;
};
static struct ubo_block_info *
get_block_info(struct ubo_analysis_state *state, int block)
{
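   /* Added note: the key is the block index biased by one so that block 0
    * does not become a NULL pointer key, which the hash table cannot store.
    */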
uint32_t hash = block + 1;
void *key = (void *) (uintptr_t) hash;
struct hash_entry *entry =
_mesa_hash_table_search_pre_hashed(state->blocks, hash, key);
if (entry)
return (struct ubo_block_info *) entry->data;
struct ubo_block_info *info =
rzalloc(state->blocks, struct ubo_block_info);
_mesa_hash_table_insert_pre_hashed(state->blocks, hash, key, info);
return info;
}
static void
analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block)
{
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_uniform:
case nir_intrinsic_image_deref_load:
case nir_intrinsic_image_deref_store:
case nir_intrinsic_image_deref_atomic:
case nir_intrinsic_image_deref_atomic_swap:
case nir_intrinsic_image_deref_size:
state->uses_regular_uniforms = true;
continue;
case nir_intrinsic_load_ubo:
break; /* Fall through to the analysis below */
default:
continue; /* Not a uniform or UBO intrinsic */
}
if (brw_nir_ubo_surface_index_is_pushable(intrin->src[0]) &&
nir_src_is_const(intrin->src[1])) {
const int block = brw_nir_ubo_surface_index_get_push_block(intrin->src[0]);
const unsigned byte_offset = nir_src_as_uint(intrin->src[1]);
const int offset = byte_offset / 32;
/* Avoid shifting by larger than the width of our bitfield, as this
* is undefined in C. Even if we require multiple bits to represent
* the entire value, it's OK to record a partial value - the backend
* is capable of falling back to pull loads for later components of
* vectors, as it has to shrink ranges for other reasons anyway.
*/
if (offset >= 64)
continue;
/* The value might span multiple 32-byte chunks. */
const int bytes = nir_intrinsic_dest_components(intrin) *
(intrin->def.bit_size / 8);
const int start = ROUND_DOWN_TO(byte_offset, 32);
const int end = ALIGN(byte_offset + bytes, 32);
const int chunks = (end - start) / 32;
/* TODO: should we count uses in loops as higher benefit? */
struct ubo_block_info *info = get_block_info(state, block);
info->offsets |= ((1ull << chunks) - 1) << offset;
info->uses[offset]++;
}
}
}
static void
print_ubo_entry(FILE *file,
const struct ubo_range_entry *entry,
struct ubo_analysis_state *state)
{
struct ubo_block_info *info = get_block_info(state, entry->range.block);
fprintf(file,
"block %2d, start %2d, length %2d, bits = %"PRIx64", "
"benefit %2d, cost %2d, score = %2d\n",
entry->range.block, entry->range.start, entry->range.length,
info->offsets, entry->benefit, entry->range.length, score(entry));
}
void
brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
nir_shader *nir,
struct brw_ubo_range out_ranges[4])
{
void *mem_ctx = ralloc_context(NULL);
struct ubo_analysis_state state = {
.uses_regular_uniforms = false,
.blocks =
_mesa_hash_table_create(mem_ctx, NULL, _mesa_key_pointer_equal),
};
/* Compute shaders use push constants to get the subgroup ID so it's
* best to just assume some system values are pushed.
*/
if (nir->info.stage == MESA_SHADER_COMPUTE)
state.uses_regular_uniforms = true;
/* Walk the IR, recording how many times each UBO block/offset is used. */
nir_foreach_function_impl(impl, nir) {
nir_foreach_block(block, impl) {
analyze_ubos_block(&state, block);
}
}
/* Find ranges: a block, starting 32-byte offset, and length. */
struct util_dynarray ranges;
util_dynarray_init(&ranges, mem_ctx);
hash_table_foreach(state.blocks, entry) {
const int b = entry->hash - 1;
const struct ubo_block_info *info = entry->data;
uint64_t offsets = info->offsets;
/* Walk through the offsets bitfield, finding contiguous regions of
* set bits:
*
* 0000000001111111111111000000000000111111111111110000000011111100
*          ^^^^^^^^^^^^^            ^^^^^^^^^^^^^^        ^^^^^^
*
* Each of these will become a UBO range.
*/
while (offsets != 0) {
/* Find the first 1 in the offsets bitfield. This represents the
* start of a range of interesting UBO data. Make it zero-indexed.
*/
int first_bit = ffsll(offsets) - 1;
/* Find the first 0 bit in offsets beyond first_bit. To find the
* first zero bit, we find the first 1 bit in the complement. In
* order to ignore bits before first_bit, we mask off those bits.
*/
int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1;
if (first_hole == -1) {
/* If we didn't find a hole, then set it to the end of the
* bitfield. There are no more ranges to process.
*/
first_hole = 64;
offsets = 0;
} else {
/* We've processed all bits before first_hole. Mask them off. */
offsets &= ~((1ull << first_hole) - 1);
}
struct ubo_range_entry *entry =
util_dynarray_grow(&ranges, struct ubo_range_entry, 1);
entry->range.block = b;
entry->range.start = first_bit;
/* first_hole is one beyond the end, so we don't need to add 1 */
entry->range.length = first_hole - first_bit;
entry->benefit = 0;
for (int i = 0; i < entry->range.length; i++)
entry->benefit += info->uses[first_bit + i];
}
}
int nr_entries = ranges.size / sizeof(struct ubo_range_entry);
if (0) {
util_dynarray_foreach(&ranges, struct ubo_range_entry, entry) {
print_ubo_entry(stderr, entry, &state);
}
}
/* TODO: Consider combining ranges.
*
* We can only push 3-4 ranges via 3DSTATE_CONSTANT_XS. If there are
* more ranges, and two are close by with only a small hole, it may be
* worth combining them. The holes will waste register space, but the
* benefit of removing pulls may outweigh that cost.
*/
/* Sort the list so the most beneficial ranges are at the front. */
if (nr_entries > 0) {
qsort(ranges.data, nr_entries, sizeof(struct ubo_range_entry),
cmp_ubo_range_entry);
}
struct ubo_range_entry *entries = ranges.data;
/* Return the top 4 or so. We drop by one if regular uniforms are in
* use, assuming one push buffer will be dedicated to those. We may
* also only get 3 on Haswell if we can't write INSTPM.
*
* The backend may need to shrink these ranges to ensure that they
* don't exceed the maximum push constant limits. It can simply drop
* the tail of the list, as that's the least valuable portion. We
* unfortunately can't truncate it here, because we don't know what
* the backend is planning to do with regular uniforms.
*/
const int max_ubos = (compiler->constant_buffer_0_is_relative ? 3 : 4) -
state.uses_regular_uniforms;
nr_entries = MIN2(nr_entries, max_ubos);
for (int i = 0; i < nr_entries; i++) {
out_ranges[i] = entries[i].range;
}
for (int i = nr_entries; i < 4; i++) {
out_ranges[i].block = 0;
out_ranges[i].start = 0;
out_ranges[i].length = 0;
}
ralloc_free(ranges.mem_ctx);
}

View file

@ -0,0 +1,132 @@
/*
* Copyright © 2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "compiler/nir/nir_builder.h"
#include "brw_nir.h"
/**
* Prior to Haswell, the hardware can't natively support GL_FIXED or
* 2_10_10_10_REV vertex formats. This pass inserts extra shader code
* to produce the correct values.
*/
static bool
apply_attr_wa_instr(nir_builder *b, nir_instr *instr, void *cb_data)
{
const uint8_t *attrib_wa_flags = cb_data;
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_input)
return false;
uint8_t wa_flags = attrib_wa_flags[nir_intrinsic_base(intrin)];
if (wa_flags == 0)
return false;
b->cursor = nir_after_instr(instr);
nir_def *val = &intrin->def;
/* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
* come in as floating point conversions of the integer values.
*/
if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
nir_def *scaled =
nir_fmul_imm(b, val, 1.0f / 65536.0f);
nir_def *comps[4];
for (int i = 0; i < val->num_components; i++) {
bool rescale = i < (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK);
comps[i] = nir_channel(b, rescale ? scaled : val, i);
}
val = nir_vec(b, comps, val->num_components);
}
/* Do sign recovery for 2101010 formats if required. */
if (wa_flags & BRW_ATTRIB_WA_SIGN) {
/* sign recovery shift: <22, 22, 22, 30> */
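   /* Added note: with an arithmetic right shift, (v << 22) >> 22 replicates
    * bit 9 of each 10-bit XYZ component into bits 31:10, recovering its
    * sign; the 2-bit W component uses a shift of 30 for the same reason.
    */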
nir_def *shift = nir_imm_ivec4(b, 22, 22, 22, 30);
val = nir_ishr(b, nir_ishl(b, val, shift), shift);
}
/* Apply BGRA swizzle if required. */
if (wa_flags & BRW_ATTRIB_WA_BGRA) {
val = nir_swizzle(b, val, (unsigned[4]){2,1,0,3}, 4);
}
if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
/* ES 3.0 has different rules for converting signed normalized
* fixed-point numbers than desktop GL.
*/
if (wa_flags & BRW_ATTRIB_WA_SIGN) {
/* According to equation 2.2 of the ES 3.0 specification,
* signed normalization conversion is done by:
*
* f = c / (2^(b-1)-1)
*
* OpenGL 4.2+ uses this equation as well. Since most contexts
* promote to the new higher version, and this is what Haswell+
* hardware does anyway, we just always use this formula.
*/
nir_def *es3_normalize_factor =
nir_imm_vec4(b, 1.0f / ((1 << 9) - 1), 1.0f / ((1 << 9) - 1),
1.0f / ((1 << 9) - 1), 1.0f / ((1 << 1) - 1));
val = nir_fmax(b,
nir_fmul(b, nir_i2f32(b, val), es3_normalize_factor),
nir_imm_float(b, -1.0f));
} else {
/* The following equation is from the OpenGL 3.2 specification:
*
* 2.1 unsigned normalization
* f = c/(2^n-1)
*/
nir_def *normalize_factor =
nir_imm_vec4(b, 1.0f / ((1 << 10) - 1), 1.0f / ((1 << 10) - 1),
1.0f / ((1 << 10) - 1), 1.0f / ((1 << 2) - 1));
val = nir_fmul(b, nir_u2f32(b, val), normalize_factor);
}
}
if (wa_flags & BRW_ATTRIB_WA_SCALE) {
val = (wa_flags & BRW_ATTRIB_WA_SIGN) ? nir_i2f32(b, val)
: nir_u2f32(b, val);
}
nir_def_rewrite_uses_after(&intrin->def, val,
val->parent_instr);
return true;
}
bool
brw_nir_apply_attribute_workarounds(nir_shader *shader,
const uint8_t *attrib_wa_flags)
{
return nir_shader_instructions_pass(shader, apply_attr_wa_instr,
nir_metadata_block_index |
nir_metadata_dominance,
(void *)attrib_wa_flags);
}

View file

@ -0,0 +1,192 @@
/*
* Copyright © 2019 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "compiler/nir/nir_builder.h"
#include "brw_nir.h"
/**
* We need to compute alpha to coverage dithering manually in shader
* and replace sample mask store with the bitwise-AND of sample mask and
* alpha to coverage dithering.
*
* The following formula is used to compute final sample mask:
* m = int(16.0 * clamp(src0_alpha, 0.0, 1.0))
* dither_mask = 0x1111 * ((0xfea80 >> (m & ~3)) & 0xf) |
* 0x0808 * (m & 2) | 0x0100 * (m & 1)
* sample_mask = sample_mask & dither_mask
*
* It gives a number of ones proportional to the alpha for 2, 4, 8 or 16
* least significant bits of the result:
* 0.0000 0000000000000000
* 0.0625 0000000100000000
* 0.1250 0001000000010000
* 0.1875 0001000100010000
* 0.2500 1000100010001000
* 0.3125 1000100110001000
* 0.3750 1001100010011000
* 0.4375 1001100110011000
* 0.5000 1010101010101010
* 0.5625 1010101110101010
* 0.6250 1011101010111010
* 0.6875 1011101110111010
* 0.7500 1110111011101110
* 0.8125 1110111111101110
* 0.8750 1111111011111110
* 0.9375 1111111111111110
* 1.0000 1111111111111111
*/
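/* Worked example (added commentary): for src0_alpha = 0.5 the formula gives
 * m = int(16.0 * 0.5) = 8, so (0xfea80 >> (8 & ~3)) & 0xf = 0xa and
 * dither_mask = 0x1111 * 0xa = 0xaaaa = 1010101010101010b, matching the
 * 0.5000 row above. A scalar C sketch of the formula (hypothetical helper,
 * illustration only):
 */
static inline unsigned
dither_mask_reference(float alpha)
{
   const float a = alpha < 0.0f ? 0.0f : (alpha > 1.0f ? 1.0f : alpha);
   const int m = (int)(16.0f * a);
   return 0x1111 * ((0xfea80 >> (m & ~3)) & 0xf) |
          0x0808 * (m & 2) |
          0x0100 * (m & 1);
}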
static nir_def *
build_dither_mask(nir_builder *b, nir_def *color)
{
assert(color->num_components == 4);
nir_def *alpha = nir_channel(b, color, 3);
nir_def *m =
nir_f2i32(b, nir_fmul_imm(b, nir_fsat(b, alpha), 16.0));
nir_def *part_a =
nir_iand_imm(b, nir_ushr(b, nir_imm_int(b, 0xfea80),
nir_iand_imm(b, m, ~3)),
0xf);
nir_def *part_b = nir_iand_imm(b, m, 2);
nir_def *part_c = nir_iand_imm(b, m, 1);
return nir_ior(b, nir_imul_imm(b, part_a, 0x1111),
nir_ior(b, nir_imul_imm(b, part_b, 0x0808),
nir_imul_imm(b, part_c, 0x0100)));
}
bool
brw_nir_lower_alpha_to_coverage(nir_shader *shader,
const struct brw_wm_prog_key *key,
const struct brw_wm_prog_data *prog_data)
{
assert(shader->info.stage == MESA_SHADER_FRAGMENT);
assert(key->alpha_to_coverage != BRW_NEVER);
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
const uint64_t outputs_written = shader->info.outputs_written;
if (!(outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) ||
!(outputs_written & (BITFIELD64_BIT(FRAG_RESULT_COLOR) |
BITFIELD64_BIT(FRAG_RESULT_DATA0))))
goto skip;
nir_intrinsic_instr *sample_mask_write = NULL;
nir_intrinsic_instr *color0_write = NULL;
bool sample_mask_write_first = false;
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_store_output)
continue;
/* We call nir_lower_io_to_temporaries to lower FS outputs to
* temporaries with a copy at the end so this should be the last
* block in the shader.
*/
assert(block->cf_node.parent == &impl->cf_node);
assert(nir_cf_node_is_last(&block->cf_node));
/* See store_output in fs_visitor::nir_emit_fs_intrinsic */
const unsigned store_offset = nir_src_as_uint(intrin->src[1]);
const unsigned driver_location = nir_intrinsic_base(intrin) +
SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);
/* Extract the FRAG_RESULT */
const unsigned location =
GET_FIELD(driver_location, BRW_NIR_FRAG_OUTPUT_LOCATION);
if (location == FRAG_RESULT_SAMPLE_MASK) {
assert(sample_mask_write == NULL);
sample_mask_write = intrin;
sample_mask_write_first = (color0_write == NULL);
}
if (location == FRAG_RESULT_COLOR ||
location == FRAG_RESULT_DATA0) {
assert(color0_write == NULL);
color0_write = intrin;
}
}
}
/* It's possible that shader_info may be out-of-date and the writes to
* either gl_SampleMask or the first color value may have been removed.
* This can happen if, for instance, a nir_undef is written to the
* color value. In that case, just bail and don't do anything rather
* than crashing.
*/
if (color0_write == NULL || sample_mask_write == NULL)
goto skip;
/* It's possible that the color value isn't actually a vec4. In this case,
* assuming an alpha of 1.0 and letting the sample mask pass through
* unaltered seems like the kindest thing to do to apps.
*/
nir_def *color0 = color0_write->src[0].ssa;
if (color0->num_components < 4)
goto skip;
nir_def *sample_mask = sample_mask_write->src[0].ssa;
if (sample_mask_write_first) {
/* If the sample mask write comes before the write to color0, we need
* to move it because it's going to use the value from color0 to
* compute the sample mask.
*/
nir_instr_remove(&sample_mask_write->instr);
nir_instr_insert(nir_after_instr(&color0_write->instr),
&sample_mask_write->instr);
}
nir_builder b = nir_builder_at(nir_before_instr(&sample_mask_write->instr));
/* Combine dither_mask and the gl_SampleMask value */
nir_def *dither_mask = build_dither_mask(&b, color0);
dither_mask = nir_iand(&b, sample_mask, dither_mask);
if (key->alpha_to_coverage == BRW_SOMETIMES) {
nir_def *push_flags =
nir_load_uniform(&b, 1, 32, nir_imm_int(&b, prog_data->msaa_flags_param * 4));
nir_def *alpha_to_coverage =
nir_test_mask(&b, push_flags, INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE);
dither_mask = nir_bcsel(&b, alpha_to_coverage,
dither_mask, sample_mask_write->src[0].ssa);
}
nir_src_rewrite(&sample_mask_write->src[0], dither_mask);
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
return true;
skip:
nir_metadata_preserve(impl, nir_metadata_all);
return false;
}

View file

@ -0,0 +1,818 @@
/*
* Copyright 2023 Intel Corporation
* SPDX-License-Identifier: MIT
*/
/**
* \file brw_nir_lower_cooperative_matrix.c
* Lower cooperative matrix to subgroup operations.
*
* All supported matrix types are assumed to have either 8 rows or 8
* columns. The other dimension of the matrix is typically 8 times the number
* of data elements that can be stored in a 32-bit dword. Matrix data is
* indexed by a combination of an array element and a subgroup invocation ID.
*
* Two layouts for matrix data are used. In the first layout,
* subgroupShuffle(slice[N], ...) accesses row N of the matrix. This will be
* called row-major hereafter. In the other layout,
* subgroupShuffle(slice[...], M) accesses column M of the matrix. This will
* be called column-major hereafter. In cases where a single 32-bit value is
* stored in each entry, these layouts are identical.
*
* The subtle difference arises when multiple values are packed into a single
* 32-bit dword. If two 16-bit values are packed in a single 32-bit value in
* column-major, subgroupShuffle(slice[0], 1) holds matrix entries m[1][1] and
* m[2][1] (in m[row][column] notation). In row-major, that same shuffle holds
* m[0][2] and m[0][3].
*
* There is an alternate way to think about the matrix layouts. Every matrix
* size supported by the Intel driver is either Sx8 (e.g., 16x8 for float16 B
* matrix) or Sx8T (e.g., 8x32 for int8 A matrix). The A matrix and B matrix
* layouts are such that a single 8 dword register hold an entire row of the
* matrix.
*
* Consider a matrix stored starting in register g32. In an A matrix, the
* packed dwords of g32 contain only the data for a single row of the
* matrix. g32 is row 0, g33 is row 1, etc. In a B matrix, the packed dwords
* of g(32+N).X contain only the data for a single column of the
* matrix. g[32:40].0 is column 0, g[32:40].1 is column 1, etc.
*
* This leads to some shenanigans in \c lower_cmat_load_store.
*
* In the common case, A, C, and result matrices are stored row major while B
* matrices are stored column major. This arrangement facilitates efficient
* dot product operations using DPAS or DP4A instructions.
*
* Future optimizations are possible when row and column major are
* flipped. That is, efficient dot products are also possible when A, C, and
* result matrices are column major while B is row major.
*/
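/* Worked sizing example (added commentary, derived from
 * get_slice_type_from_desc() below): a 16x8 float16 B matrix at subgroup
 * size 16 holds 128 elements, so elements_per_invocation = 128 / 16 = 8,
 * packing_factor = MIN2(8, 32 / 16) = 2, and the slice type becomes a
 * 4-component uint vector (len = 8 / 2 = 4) with two 16-bit values packed
 * per 32-bit component.
 */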
#include "brw_nir.h"
struct lower_cmat_state {
nir_shader *shader;
struct hash_table *slice_coop_types;
struct hash_table *vars_to_slice;
unsigned subgroup_size;
};
static void
print_coop_types(struct lower_cmat_state *state)
{
fprintf(stderr, "--- Slices to Cooperative Matrix type table\n");
hash_table_foreach(state->slice_coop_types, e) {
nir_variable *var = (void *)e->key;
const struct glsl_type *t = e->data;
fprintf(stderr, "%p: %s -> %s\n", var, var->name, glsl_get_type_name(t));
}
fprintf(stderr, "\n\n");
}
static const struct glsl_type *
get_coop_type_for_slice(struct lower_cmat_state *state, nir_deref_instr *deref)
{
nir_variable *var = nir_deref_instr_get_variable(deref);
struct hash_entry *entry = _mesa_hash_table_search(state->slice_coop_types, var);
assert(entry != NULL);
return entry->data;
}
static bool
lower_cmat_filter(const nir_instr *instr, const void *_state)
{
if (instr->type == nir_instr_type_deref) {
nir_deref_instr *deref = nir_instr_as_deref(instr);
return glsl_type_is_cmat(deref->type);
}
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_cmat_construct:
case nir_intrinsic_cmat_load:
case nir_intrinsic_cmat_store:
case nir_intrinsic_cmat_length:
case nir_intrinsic_cmat_muladd:
case nir_intrinsic_cmat_unary_op:
case nir_intrinsic_cmat_binary_op:
case nir_intrinsic_cmat_scalar_op:
case nir_intrinsic_cmat_bitcast:
case nir_intrinsic_cmat_insert:
case nir_intrinsic_cmat_extract:
case nir_intrinsic_cmat_copy:
return true;
default:
return false;
}
}
/**
* Get number of matrix elements packed in each component of the slice.
*/
static unsigned
get_packing_factor(const struct glsl_cmat_description desc,
const struct glsl_type *slice_type)
{
const struct glsl_type *slice_element_type = glsl_without_array(slice_type);
assert(!glsl_type_is_cmat(slice_type));
assert(glsl_get_bit_size(slice_element_type) >= glsl_base_type_get_bit_size(desc.element_type));
assert(glsl_get_bit_size(slice_element_type) % glsl_base_type_get_bit_size(desc.element_type) == 0);
return glsl_get_bit_size(slice_element_type) / glsl_base_type_get_bit_size(desc.element_type);
}
static const struct glsl_type *
get_slice_type_from_desc(const struct lower_cmat_state *state,
const struct glsl_cmat_description desc)
{
enum glsl_base_type base_type;
/* Number of matrix elements stored by each subgroup invocation. If the
* data is packed, the slice size will be less than this.
*/
const unsigned elements_per_invocation =
(desc.rows * desc.cols) / state->subgroup_size;
assert(elements_per_invocation > 0);
const unsigned element_bits = 32;
const unsigned bits = glsl_base_type_get_bit_size(desc.element_type);
unsigned packing_factor = MIN2(elements_per_invocation,
element_bits / bits);
/* Adjust the packing factor so that each row of the matrix fills an
* entire GRF.
*
* The in-register layout of B matrices is different, so those are handled
* more like column major (for row major matrices). See the file comment
* for more details.
*/
const unsigned actual_cols = desc.use != GLSL_CMAT_USE_B ? desc.cols : desc.rows;
while ((actual_cols / packing_factor) < 8) {
assert(packing_factor > 1);
packing_factor /= 2;
}
switch (desc.element_type) {
case GLSL_TYPE_FLOAT:
base_type = GLSL_TYPE_FLOAT;
break;
case GLSL_TYPE_UINT:
case GLSL_TYPE_FLOAT16:
case GLSL_TYPE_UINT8:
case GLSL_TYPE_UINT16:
base_type = glsl_get_base_type(glsl_uintN_t_type(packing_factor * bits));
break;
case GLSL_TYPE_INT:
case GLSL_TYPE_INT8:
case GLSL_TYPE_INT16:
base_type = glsl_get_base_type(glsl_intN_t_type(packing_factor * bits));
break;
default:
unreachable("Invalid cooperative matrix element type.");
}
unsigned len = elements_per_invocation / packing_factor;
/* Supported matrix sizes are designed to fill either 4 or 8 SIMD8
* registers. That means:
*
*              4 registers   8 registers
*   SIMD32     len = 1       len = 2
*   SIMD16     len = 2       len = 4
*   SIMD8      len = 4       len = 8
*
* If configurations are added that result in other values of len, at the
* very least this assertion will need to be updated. The only value of len
* that makes sense to add would be 16, and that would be a lot of
* registers.
*/
assert(len == 1 || len == 2 || len == 4 || len == 8);
const struct glsl_type *slice_type = glsl_vector_type(base_type, len);
assert(packing_factor == get_packing_factor(desc, slice_type));
return slice_type;
}
static const struct glsl_type *
get_slice_type(const struct lower_cmat_state *state,
const struct glsl_type *type)
{
if (glsl_type_is_array(type)) {
const struct glsl_type *slice_type =
get_slice_type(state, glsl_get_array_element(type));
return glsl_array_type(slice_type, glsl_array_size(type), 0);
}
assert(glsl_type_is_cmat(type));
return get_slice_type_from_desc(state,
*glsl_get_cmat_description(type));
}
static nir_deref_instr *
create_local_slice(struct lower_cmat_state *state, nir_builder *b,
const struct glsl_type *mat_type, const char *name)
{
const struct glsl_type *slice_type = get_slice_type(state, mat_type);
nir_variable *slice_var = nir_local_variable_create(b->impl, slice_type, name);
_mesa_hash_table_insert(state->slice_coop_types, slice_var, (void *)mat_type);
return nir_build_deref_var(b, slice_var);
}
static void
lower_cmat_load_store(nir_builder *b, nir_intrinsic_instr *intrin,
struct lower_cmat_state *state)
{
const bool load = intrin->intrinsic == nir_intrinsic_cmat_load;
const unsigned mat_src = load ? 0 : 1;
const unsigned ptr_src = load ? 1 : 0;
nir_deref_instr *slice = nir_src_as_deref(intrin->src[mat_src]);
const struct glsl_type *mat_type = get_coop_type_for_slice(state, slice);
const struct glsl_cmat_description *desc = glsl_get_cmat_description(mat_type);
nir_def *results[NIR_MAX_VEC_COMPONENTS];
const unsigned num_components = glsl_get_vector_elements(slice->type);
const unsigned packing_factor = get_packing_factor(*desc, slice->type);
nir_deref_instr *pointer = nir_src_as_deref(intrin->src[ptr_src]);
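   /* Added note: per the file comment, A/C/result slices are row major and
    * B slices are column major. The branch below takes the fast path when
    * the requested memory layout matches the slice layout, copying whole
    * packed dwords; otherwise each element is accessed and re-packed
    * individually.
    */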
if ((nir_intrinsic_matrix_layout(intrin) == GLSL_MATRIX_LAYOUT_ROW_MAJOR) ==
(desc->use != GLSL_CMAT_USE_B)) {
nir_def *stride = nir_udiv_imm(b, intrin->src[2].ssa, packing_factor);
const struct glsl_type *element_type =
glsl_scalar_type(glsl_get_base_type(slice->type));
pointer = nir_build_deref_cast(b, &pointer->def, pointer->modes,
element_type,
glsl_get_bit_size(element_type) / 8);
nir_def *invocation = nir_load_subgroup_invocation(b);
nir_def *base_offset;
nir_def *step;
if (desc->use != GLSL_CMAT_USE_B) {
base_offset = nir_iadd(b,
nir_imul(b,
nir_udiv_imm(b, invocation, 8),
stride),
nir_umod_imm(b, invocation, 8));
step = nir_imul_imm(b, stride, state->subgroup_size / 8);
} else {
base_offset = nir_iadd(b,
nir_imul(b,
nir_umod_imm(b, invocation, 8),
stride),
nir_udiv_imm(b, invocation, 8));
step = nir_imm_int(b, state->subgroup_size / 8);
}
for (unsigned i = 0; i < num_components; i++) {
nir_def *offset = nir_imul_imm(b, step, i);
nir_deref_instr *memory_deref =
nir_build_deref_ptr_as_array(b, pointer,
nir_i2iN(b,
nir_iadd(b,
base_offset,
offset),
pointer->def.bit_size));
if (load) {
results[i] = nir_load_deref(b, memory_deref);
} else {
nir_def *src = nir_channel(b, nir_load_deref(b, slice), i);
nir_store_deref(b, memory_deref, src, 0x1);
}
}
} else {
nir_def *stride = intrin->src[2].ssa;
const struct glsl_type *element_type = glsl_scalar_type(desc->element_type);
const unsigned element_bits = glsl_base_type_get_bit_size(desc->element_type);
const unsigned element_stride = element_bits / 8;
pointer = nir_build_deref_cast(b, &pointer->def, pointer->modes, element_type,
element_stride);
nir_def *invocation_div_8 = nir_udiv_imm(b, nir_load_subgroup_invocation(b), 8);
nir_def *invocation_mod_8 = nir_umod_imm(b, nir_load_subgroup_invocation(b), 8);
nir_def *packed_stride = nir_imul_imm(b, stride, packing_factor);
for (unsigned i = 0; i < num_components; i++) {
const unsigned i_offset = i * (state->subgroup_size / 8);
nir_def *v[4];
for (unsigned j = 0; j < packing_factor; j++) {
nir_def *j_offset = nir_imul_imm(b, stride, j);
nir_def *offset;
if (desc->use != GLSL_CMAT_USE_B) {
offset = nir_iadd(b,
nir_iadd(b,
nir_imul(b,
invocation_mod_8,
packed_stride),
invocation_div_8),
nir_iadd_imm(b, j_offset, i_offset));
} else {
offset = nir_iadd(b,
nir_iadd(b,
nir_imul(b,
invocation_div_8,
packed_stride),
invocation_mod_8),
nir_iadd(b,
nir_imul_imm(b,
packed_stride,
i_offset),
j_offset));
}
nir_deref_instr *memory_deref =
nir_build_deref_ptr_as_array(b, pointer,
nir_i2iN(b,
offset,
pointer->def.bit_size));
if (load) {
v[j] = nir_load_deref(b, memory_deref);
} else {
nir_def *src = nir_channel(b, nir_load_deref(b, slice), i);
nir_def *v =
nir_channel(b, nir_unpack_bits(b, src, element_bits), j);
nir_store_deref(b, memory_deref, v, 0x1);
}
}
if (load) {
results[i] = nir_pack_bits(b, nir_vec(b, v, packing_factor),
packing_factor * element_bits);
}
}
}
if (load)
nir_store_deref(b, slice, nir_vec(b, results, num_components),
nir_component_mask(num_components));
}
static void
lower_cmat_unary_op(nir_builder *b, nir_intrinsic_instr *intrin,
struct lower_cmat_state *state)
{
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
nir_deref_instr *src_slice = nir_src_as_deref(intrin->src[1]);
nir_def *results[NIR_MAX_VEC_COMPONENTS];
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
const struct glsl_type *dst_mat_type =
get_coop_type_for_slice(state, dst_slice);
const struct glsl_type *src_mat_type =
get_coop_type_for_slice(state, src_slice);
const struct glsl_cmat_description dst_desc =
*glsl_get_cmat_description(dst_mat_type);
const struct glsl_cmat_description src_desc =
*glsl_get_cmat_description(src_mat_type);
const unsigned dst_bits = glsl_base_type_bit_size(dst_desc.element_type);
const unsigned src_bits = glsl_base_type_bit_size(src_desc.element_type);
/* The type of the returned slice may be different from the type of the
* input slice.
*/
const unsigned dst_packing_factor =
get_packing_factor(dst_desc, dst_slice->type);
const unsigned src_packing_factor =
get_packing_factor(src_desc, src_slice->type);
const nir_op op = nir_intrinsic_alu_op(intrin);
/* There are three possible cases:
*
* 1. dst_packing_factor == src_packing_factor. This is the common case,
* and handling it is straightforward.
*
* 2. dst_packing_factor > src_packing_factor. This occurs when converting a
* float32_t matrix slice to a packed float16_t slice. Loop over the size
* of the destination slice, but read multiple entries from the source
* slice on each iteration.
*
* 3. dst_packing_factor < src_packing_factor. This occurs when converting a
* packed int8_t matrix slice to an int32_t slice. Loop over the size of
* the source slice, but write multiple entries to the destination slice
* on each iteration.
*
* Handle all cases by iterating over the total (non-packed) number of
* elements in the slice. When dst_packing_factor values have been
* calculated, store them.
*/
assert((dst_packing_factor * glsl_get_vector_elements(dst_slice->type)) ==
(src_packing_factor * glsl_get_vector_elements(src_slice->type)));
/* Stores at most dst_packing_factor partial results. */
nir_def *v[4];
assert(dst_packing_factor <= 4);
for (unsigned i = 0; i < num_components * dst_packing_factor; i++) {
const unsigned dst_chan_index = i % dst_packing_factor;
const unsigned src_chan_index = i % src_packing_factor;
const unsigned dst_index = i / dst_packing_factor;
const unsigned src_index = i / src_packing_factor;
nir_def *src =
nir_channel(b,
nir_unpack_bits(b,
nir_channel(b,
nir_load_deref(b, src_slice),
src_index),
src_bits),
src_chan_index);
v[dst_chan_index] = nir_build_alu1(b, op, src);
if (dst_chan_index == (dst_packing_factor - 1)) {
results[dst_index] =
nir_pack_bits(b, nir_vec(b, v, dst_packing_factor),
dst_packing_factor * dst_bits);
}
}
nir_store_deref(b, dst_slice, nir_vec(b, results, num_components),
nir_component_mask(num_components));
}
static void
lower_cmat_binary_op(nir_builder *b, nir_intrinsic_instr *intrin,
struct lower_cmat_state *state)
{
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
nir_deref_instr *src_a_slice = nir_src_as_deref(intrin->src[1]);
nir_deref_instr *src_b_slice = nir_src_as_deref(intrin->src[2]);
nir_def *src_a = nir_load_deref(b, src_a_slice);
nir_def *src_b = nir_load_deref(b, src_b_slice);
nir_def *results[NIR_MAX_VEC_COMPONENTS];
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
const struct glsl_type *dst_mat_type = get_coop_type_for_slice(state, dst_slice);
ASSERTED const struct glsl_type *src_a_mat_type = get_coop_type_for_slice(state, src_a_slice);
ASSERTED const struct glsl_type *src_b_mat_type = get_coop_type_for_slice(state, src_b_slice);
const struct glsl_cmat_description desc =
*glsl_get_cmat_description(dst_mat_type);
assert(dst_mat_type == src_a_mat_type);
assert(dst_mat_type == src_b_mat_type);
const unsigned bits = glsl_base_type_bit_size(desc.element_type);
const unsigned packing_factor = get_packing_factor(desc, dst_slice->type);
for (unsigned i = 0; i < num_components; i++) {
nir_def *val_a = nir_channel(b, src_a, i);
nir_def *val_b = nir_channel(b, src_b, i);
results[i] =
nir_pack_bits(b, nir_build_alu2(b, nir_intrinsic_alu_op(intrin),
nir_unpack_bits(b, val_a, bits),
nir_unpack_bits(b, val_b, bits)),
packing_factor * bits);
}
nir_store_deref(b, dst_slice, nir_vec(b, results, num_components),
nir_component_mask(num_components));
}
static void
lower_cmat_scalar_op(nir_builder *b, nir_intrinsic_instr *intrin,
struct lower_cmat_state *state)
{
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
nir_deref_instr *src_slice = nir_src_as_deref(intrin->src[1]);
nir_def *scalar = intrin->src[2].ssa;
nir_def *src = nir_load_deref(b, src_slice);
nir_def *results[NIR_MAX_VEC_COMPONENTS];
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
ASSERTED const struct glsl_type *dst_mat_type = get_coop_type_for_slice(state, dst_slice);
ASSERTED const struct glsl_type *src_mat_type = get_coop_type_for_slice(state, src_slice);
assert(dst_mat_type == src_mat_type);
const struct glsl_cmat_description desc =
*glsl_get_cmat_description(dst_mat_type);
const unsigned bits = glsl_base_type_bit_size(desc.element_type);
const unsigned packing_factor = get_packing_factor(desc, dst_slice->type);
for (unsigned i = 0; i < num_components; i++) {
nir_def *val = nir_channel(b, src, i);
results[i] =
nir_pack_bits(b, nir_build_alu2(b, nir_intrinsic_alu_op(intrin),
nir_unpack_bits(b, val, bits),
scalar),
packing_factor * bits);
}
nir_store_deref(b, dst_slice, nir_vec(b, results, num_components),
nir_component_mask(num_components));
}
static nir_deref_instr *
lower_cmat_deref(nir_builder *b, nir_deref_instr *deref,
struct lower_cmat_state *state)
{
nir_deref_instr *parent = nir_deref_instr_parent(deref);
if (parent) {
assert(deref->deref_type == nir_deref_type_array);
parent = lower_cmat_deref(b, parent, state);
return nir_build_deref_array(b, parent, deref->arr.index.ssa);
} else {
assert(deref->deref_type == nir_deref_type_var);
assert(deref->var);
assert(glsl_type_is_cmat(glsl_without_array(deref->var->type)));
struct hash_entry *entry = _mesa_hash_table_search(state->vars_to_slice, deref->var);
assert(entry);
return nir_build_deref_var(b, (nir_variable *)entry->data);
}
}
static nir_def *
lower_cmat_instr(nir_builder *b, nir_instr *instr, void *_state)
{
struct lower_cmat_state *state = _state;
if (instr->type == nir_instr_type_deref) {
nir_deref_instr *deref = lower_cmat_deref(b, nir_instr_as_deref(instr), state);
return &deref->def;
}
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_cmat_load:
case nir_intrinsic_cmat_store:
lower_cmat_load_store(b, intrin, state);
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
case nir_intrinsic_cmat_construct: {
nir_deref_instr *slice = nir_src_as_deref(intrin->src[0]);
nir_def *src = intrin->src[1].ssa;
const struct glsl_type *mat_type = get_coop_type_for_slice(state, slice);
const struct glsl_cmat_description desc =
*glsl_get_cmat_description(mat_type);
const unsigned packing_factor = get_packing_factor(desc, slice->type);
if (packing_factor > 1) {
src = nir_pack_bits(b, nir_replicate(b, src, packing_factor),
packing_factor * glsl_base_type_get_bit_size(desc.element_type));
}
const unsigned num_components = glsl_get_vector_elements(slice->type);
nir_store_deref(b, slice, nir_replicate(b, src, num_components),
nir_component_mask(num_components));
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}
case nir_intrinsic_cmat_unary_op:
lower_cmat_unary_op(b, intrin, state);
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
case nir_intrinsic_cmat_binary_op:
lower_cmat_binary_op(b, intrin, state);
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
case nir_intrinsic_cmat_scalar_op:
lower_cmat_scalar_op(b, intrin, state);
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
case nir_intrinsic_cmat_length: {
const struct glsl_cmat_description desc = nir_intrinsic_cmat_desc(intrin);
const struct glsl_type *mat_type = glsl_cmat_type(&desc);
const struct glsl_type *slice_type = get_slice_type(state, mat_type);
return nir_imm_intN_t(b, (get_packing_factor(desc, slice_type) *
glsl_get_vector_elements(slice_type)), 32);
}
case nir_intrinsic_cmat_muladd: {
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
nir_deref_instr *A_slice = nir_src_as_deref(intrin->src[1]);
nir_deref_instr *B_slice = nir_src_as_deref(intrin->src[2]);
nir_deref_instr *accum_slice = nir_src_as_deref(intrin->src[3]);
const struct glsl_type *dst_mat_type = get_coop_type_for_slice(state, dst_slice);
const struct glsl_cmat_description dst_desc = *glsl_get_cmat_description(dst_mat_type);
const struct glsl_type *src_mat_type = get_coop_type_for_slice(state, A_slice);
const struct glsl_cmat_description src_desc = *glsl_get_cmat_description(src_mat_type);
const unsigned packing_factor = get_packing_factor(dst_desc, dst_slice->type);
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
nir_def *result =
nir_dpas_intel(b,
packing_factor * glsl_base_type_get_bit_size(dst_desc.element_type),
nir_load_deref(b, A_slice),
nir_load_deref(b, B_slice),
nir_load_deref(b, accum_slice),
.dest_type = nir_get_nir_type_for_glsl_base_type(dst_desc.element_type),
.src_type = nir_get_nir_type_for_glsl_base_type(src_desc.element_type),
.saturate = nir_intrinsic_saturate(intrin),
.cmat_signed_mask = nir_intrinsic_cmat_signed_mask(intrin),
.systolic_depth = 8,
.repeat_count = 8);
nir_store_deref(b, dst_slice, result,
nir_component_mask(num_components));
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}
case nir_intrinsic_cmat_bitcast: {
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
nir_deref_instr *src_slice = nir_src_as_deref(intrin->src[1]);
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
assert(glsl_get_vector_elements(src_slice->type) == num_components);
nir_store_deref(b, dst_slice, nir_load_deref(b, src_slice),
nir_component_mask(num_components));
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}
case nir_intrinsic_cmat_copy:
nir_copy_deref(b,
nir_src_as_deref(intrin->src[0]),
nir_src_as_deref(intrin->src[1]));
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
case nir_intrinsic_cmat_insert: {
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
nir_def *scalar = intrin->src[1].ssa;
nir_deref_instr *src_slice = nir_src_as_deref(intrin->src[2]);
const nir_src dst_index = intrin->src[3];
const struct glsl_type *dst_mat_type = get_coop_type_for_slice(state, dst_slice);
ASSERTED const struct glsl_type *src_mat_type = get_coop_type_for_slice(state, src_slice);
assert(dst_mat_type == src_mat_type);
const struct glsl_cmat_description desc =
*glsl_get_cmat_description(dst_mat_type);
const unsigned bits = glsl_base_type_bit_size(desc.element_type);
const unsigned packing_factor = get_packing_factor(desc, dst_slice->type);
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
nir_def *slice_index = nir_udiv_imm(b, dst_index.ssa, packing_factor);
nir_def *vector_index = nir_umod_imm(b, dst_index.ssa, packing_factor);
nir_def *results[NIR_MAX_VEC_COMPONENTS];
const int slice_constant_index = nir_src_is_const(dst_index)
? nir_src_as_uint(dst_index) / packing_factor
: -1;
for (unsigned i = 0; i < num_components; i++) {
nir_def *val = nir_channel(b, nir_load_deref(b, src_slice), i);
nir_def *insert;
if (slice_constant_index < 0 || slice_constant_index == i) {
if (packing_factor == 1) {
insert = scalar;
} else {
nir_def *unpacked = nir_unpack_bits(b, val, bits);
nir_def *v = nir_vector_insert(b, unpacked, scalar, vector_index);
insert = nir_pack_bits(b, v, bits * packing_factor);
}
} else {
insert = val;
}
results[i] = slice_constant_index < 0
? nir_bcsel(b, nir_ieq_imm(b, slice_index, i), insert, val)
: insert;
}
nir_store_deref(b, dst_slice, nir_vec(b, results, num_components),
nir_component_mask(num_components));
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}
case nir_intrinsic_cmat_extract: {
nir_deref_instr *slice = nir_src_as_deref(intrin->src[0]);
const struct glsl_type *mat_type = get_coop_type_for_slice(state, slice);
nir_def *index = intrin->src[1].ssa;
const struct glsl_cmat_description desc =
*glsl_get_cmat_description(mat_type);
const unsigned bits = glsl_base_type_bit_size(desc.element_type);
const unsigned packing_factor = get_packing_factor(desc, slice->type);
nir_def *src =
nir_vector_extract(b, nir_load_deref(b, slice),
nir_udiv_imm(b, index, packing_factor));
if (packing_factor == 1) {
return src;
} else {
return nir_vector_extract(b,
nir_unpack_bits(b, src, bits),
nir_umod_imm(b, index, packing_factor));
}
}
default:
unreachable("invalid cooperative matrix intrinsic");
}
}
static void
create_slice_var(struct lower_cmat_state *state, nir_variable *var,
nir_function_impl *impl)
{
// TODO: without array
const struct glsl_type *mat_type = glsl_without_array(var->type);
assert(glsl_type_is_cmat(mat_type));
assert((!impl && var->data.mode == nir_var_shader_temp) ||
( impl && var->data.mode == nir_var_function_temp));
const struct glsl_type *slice_type = get_slice_type(state, var->type);
const char *slice_name = ralloc_asprintf(state->shader, "%s_slice", var->name);
nir_variable *slice_var = impl ?
nir_local_variable_create(impl, slice_type, slice_name) :
nir_variable_create(state->shader, var->data.mode, slice_type, slice_name);
_mesa_hash_table_insert(state->vars_to_slice, var, slice_var);
_mesa_hash_table_insert(state->slice_coop_types, slice_var, (void *)mat_type);
}
bool
brw_nir_lower_cmat(nir_shader *shader, unsigned subgroup_size)
{
void *temp_ctx = ralloc_context(NULL);
struct lower_cmat_state state = {
.shader = shader,
.slice_coop_types = _mesa_pointer_hash_table_create(temp_ctx),
.vars_to_slice = _mesa_pointer_hash_table_create(temp_ctx),
.subgroup_size = subgroup_size,
};
/* Create a slice array for each variable and add a map from the original
* variable back to it, so it can be reached during lowering.
*
* TODO: Cooperative matrix inside struct?
*/
nir_foreach_variable_in_shader(var, shader) {
if (glsl_type_is_cmat(glsl_without_array(var->type)))
create_slice_var(&state, var, NULL);
}
nir_foreach_function(func, shader) {
nir_foreach_function_temp_variable(var, func->impl) {
if (glsl_type_is_cmat(glsl_without_array(var->type)))
create_slice_var(&state, var, func->impl);
}
}
bool progress = nir_shader_lower_instructions(shader,
lower_cmat_filter,
lower_cmat_instr,
&state);
ralloc_free(temp_ctx);
return progress;
}

View file

@@ -0,0 +1,362 @@
/*
* Copyright (c) 2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir.h"
#include "compiler/nir/nir_builder.h"
struct lower_intrinsics_state {
nir_shader *nir;
nir_function_impl *impl;
bool progress;
bool hw_generated_local_id;
nir_builder builder;
};
static void
compute_local_index_id(nir_builder *b,
nir_shader *nir,
nir_def **local_index,
nir_def **local_id)
{
nir_def *subgroup_id = nir_load_subgroup_id(b);
nir_def *thread_local_id =
nir_imul(b, subgroup_id, nir_load_simd_width_intel(b));
nir_def *channel = nir_load_subgroup_invocation(b);
nir_def *linear = nir_iadd(b, channel, thread_local_id);
nir_def *size_x;
nir_def *size_y;
if (nir->info.workgroup_size_variable) {
nir_def *size_xyz = nir_load_workgroup_size(b);
size_x = nir_channel(b, size_xyz, 0);
size_y = nir_channel(b, size_xyz, 1);
} else {
size_x = nir_imm_int(b, nir->info.workgroup_size[0]);
size_y = nir_imm_int(b, nir->info.workgroup_size[1]);
}
nir_def *size_xy = nir_imul(b, size_x, size_y);
/* The local invocation index and ID must respect the following
*
* gl_LocalInvocationID.x =
* gl_LocalInvocationIndex % gl_WorkGroupSize.x;
* gl_LocalInvocationID.y =
* (gl_LocalInvocationIndex / gl_WorkGroupSize.x) %
* gl_WorkGroupSize.y;
* gl_LocalInvocationID.z =
* (gl_LocalInvocationIndex /
* (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
* gl_WorkGroupSize.z;
*
* However, the final % gl_WorkGroupSize.z does nothing unless we
* accidentally end up with a gl_LocalInvocationIndex that is too
* large, so it can safely be omitted.
*/
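/* Worked example (hypothetical sizes): for an 8x4x2 workgroup and
 * gl_LocalInvocationIndex == 37, the formulas above give
 * ID.x = 37 % 8 = 5, ID.y = (37 / 8) % 4 = 0, ID.z = 37 / 32 = 1,
 * and indeed 5 + 0*8 + 1*32 == 37.
 */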
nir_def *id_x, *id_y, *id_z;
switch (nir->info.cs.derivative_group) {
case DERIVATIVE_GROUP_NONE:
if (nir->info.num_images == 0 &&
nir->info.num_textures == 0) {
/* X-major lid order. Optimal for linear accesses only,
* which are usually buffers. X,Y ordering will look like:
* (0,0) (1,0) (2,0) ... (size_x-1,0) (0,1) (1,1) ...
*/
id_x = nir_umod(b, linear, size_x);
id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
*local_index = linear;
} else if (!nir->info.workgroup_size_variable &&
nir->info.workgroup_size[1] % 4 == 0) {
/* 1x4 block X-major lid order. Same as X-major except increments in
* blocks of width=1 height=4. Always optimal for tileY and usually
* optimal for linear accesses.
* x = (linear / 4) % size_x
* y = ((linear % 4) + (linear / 4 / size_x) * 4) % size_y
* X,Y ordering will look like: (0,0) (0,1) (0,2) (0,3) (1,0) (1,1)
* (1,2) (1,3) (2,0) ... (size_x-1,3) (0,4) (0,5) (0,6) (0,7) (1,4) ...
*/
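/* Checking the formulas on a hypothetical size_x = 8: linear = 9 gives
 * block = 2, x = 2 % 8 = 2, y = (9 % 4) + (2 / 8) * 4 = 1, i.e. (2,1),
 * which is the 10th entry of the ordering listed above.
 */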
const unsigned height = 4;
nir_def *block = nir_udiv_imm(b, linear, height);
id_x = nir_umod(b, block, size_x);
id_y = nir_umod(b,
nir_iadd(b,
nir_umod_imm(b, linear, height),
nir_imul_imm(b,
nir_udiv(b, block, size_x),
height)),
size_y);
} else {
/* Y-major lid order. Optimal for tileY accesses only,
* which are usually images. X,Y ordering will look like:
* (0,0) (0,1) (0,2) ... (0,size_y-1) (1,0) (1,1) ...
*/
id_y = nir_umod(b, linear, size_y);
id_x = nir_umod(b, nir_udiv(b, linear, size_y), size_x);
}
id_z = nir_udiv(b, linear, size_xy);
*local_id = nir_vec3(b, id_x, id_y, id_z);
if (!*local_index) {
*local_index = nir_iadd(b, nir_iadd(b, id_x,
nir_imul(b, id_y, size_x)),
nir_imul(b, id_z, size_xy));
}
break;
case DERIVATIVE_GROUP_LINEAR:
/* For linear, just set the local invocation index linearly,
* and calculate local invocation ID from that.
*/
id_x = nir_umod(b, linear, size_x);
id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
id_z = nir_udiv(b, linear, size_xy);
*local_id = nir_vec3(b, id_x, id_y, id_z);
*local_index = linear;
break;
case DERIVATIVE_GROUP_QUADS: {
/* For quads, first we figure out the 2x2 grid the invocation
* belongs to -- treating extra Z layers as just more rows.
* Then map that into local invocation ID (trivial) and local
* invocation index. Skipping Z simplifies the index calculation.
*/
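/* Example with a hypothetical size_x = 4 (double_size_x = 8): linear = 6
 * gives row_pair_id = 6, y_row_pairs = 0, so x = (6 & 1) | ((6 >> 1) & ~1)
 * = 2 and y = (0 << 1) | ((6 >> 1) & 1) = 1. Invocations 4..7 thus form
 * the second 2x2 quad, covering (2,0) (3,0) (2,1) (3,1).
 */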
nir_def *one = nir_imm_int(b, 1);
nir_def *double_size_x = nir_ishl(b, size_x, one);
/* ID within a pair of rows, where each group of 4 is 2x2 quad. */
nir_def *row_pair_id = nir_umod(b, linear, double_size_x);
nir_def *y_row_pairs = nir_udiv(b, linear, double_size_x);
nir_def *x =
nir_ior(b,
nir_iand(b, row_pair_id, one),
nir_iand(b, nir_ishr(b, row_pair_id, one),
nir_imm_int(b, 0xfffffffe)));
nir_def *y =
nir_ior(b,
nir_ishl(b, y_row_pairs, one),
nir_iand(b, nir_ishr(b, row_pair_id, one), one));
*local_id = nir_vec3(b, x,
nir_umod(b, y, size_y),
nir_udiv(b, y, size_y));
*local_index = nir_iadd(b, x, nir_imul(b, y, size_x));
break;
}
default:
unreachable("invalid derivative group");
}
}
static bool
lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
nir_block *block)
{
bool progress = false;
nir_builder *b = &state->builder;
nir_shader *nir = state->nir;
/* Reuse calculated values inside the block. */
nir_def *local_index = NULL;
nir_def *local_id = NULL;
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
b->cursor = nir_after_instr(&intrinsic->instr);
nir_def *sysval;
switch (intrinsic->intrinsic) {
case nir_intrinsic_load_local_invocation_id:
if (state->hw_generated_local_id)
continue;
FALLTHROUGH;
case nir_intrinsic_load_local_invocation_index: {
if (!local_index && !nir->info.workgroup_size_variable) {
const uint16_t *ws = nir->info.workgroup_size;
if (ws[0] * ws[1] * ws[2] == 1) {
nir_def *zero = nir_imm_int(b, 0);
local_index = zero;
local_id = nir_replicate(b, zero, 3);
}
}
if (!local_index) {
if (nir->info.stage == MESA_SHADER_TASK ||
nir->info.stage == MESA_SHADER_MESH) {
/* Will be lowered by nir_emit_task_mesh_intrinsic() using
* information from the payload.
*/
continue;
}
if (state->hw_generated_local_id) {
nir_def *local_id_vec = nir_load_local_invocation_id(b);
/* Distinct name to avoid shadowing the block-level local_id. */
nir_def *lid[3] = { nir_channel(b, local_id_vec, 0),
nir_channel(b, local_id_vec, 1),
nir_channel(b, local_id_vec, 2) };
nir_def *size_x = nir_imm_int(b, nir->info.workgroup_size[0]);
nir_def *size_y = nir_imm_int(b, nir->info.workgroup_size[1]);
sysval = nir_imul(b, lid[2], nir_imul(b, size_x, size_y));
sysval = nir_iadd(b, sysval, nir_imul(b, lid[1], size_x));
sysval = nir_iadd(b, sysval, lid[0]);
local_index = sysval;
break;
}
/* First time we are using those, so let's calculate them. */
assert(!local_id);
compute_local_index_id(b, nir, &local_index, &local_id);
}
/* With HW-generated IDs, local_id is never computed here, so only
 * assert on it when it is actually needed.
 */
assert(local_index);
if (intrinsic->intrinsic == nir_intrinsic_load_local_invocation_id) {
assert(local_id);
sysval = local_id;
} else {
sysval = local_index;
}
break;
}
case nir_intrinsic_load_num_subgroups: {
nir_def *size;
if (state->nir->info.workgroup_size_variable) {
nir_def *size_xyz = nir_load_workgroup_size(b);
nir_def *size_x = nir_channel(b, size_xyz, 0);
nir_def *size_y = nir_channel(b, size_xyz, 1);
nir_def *size_z = nir_channel(b, size_xyz, 2);
size = nir_imul(b, nir_imul(b, size_x, size_y), size_z);
} else {
size = nir_imm_int(b, nir->info.workgroup_size[0] *
nir->info.workgroup_size[1] *
nir->info.workgroup_size[2]);
}
/* Calculate the equivalent of DIV_ROUND_UP. */
nir_def *simd_width = nir_load_simd_width_intel(b);
sysval =
nir_udiv(b, nir_iadd_imm(b, nir_iadd(b, size, simd_width), -1),
simd_width);
break;
}
default:
continue;
}
if (intrinsic->def.bit_size == 64)
sysval = nir_u2u64(b, sysval);
nir_def_rewrite_uses(&intrinsic->def, sysval);
nir_instr_remove(&intrinsic->instr);
state->progress = true;
progress = true;
}
return progress;
}
static void
lower_cs_intrinsics_convert_impl(struct lower_intrinsics_state *state)
{
state->builder = nir_builder_create(state->impl);
nir_foreach_block(block, state->impl) {
lower_cs_intrinsics_convert_block(state, block);
}
nir_metadata_preserve(state->impl,
nir_metadata_block_index | nir_metadata_dominance);
}
bool
brw_nir_lower_cs_intrinsics(nir_shader *nir,
const struct intel_device_info *devinfo,
struct brw_cs_prog_data *prog_data)
{
assert(gl_shader_stage_uses_workgroup(nir->info.stage));
struct lower_intrinsics_state state = {
.nir = nir,
.hw_generated_local_id = false,
};
/* Constraints from NV_compute_shader_derivatives. */
if (gl_shader_stage_is_compute(nir->info.stage) &&
!nir->info.workgroup_size_variable) {
if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
assert(nir->info.workgroup_size[0] % 2 == 0);
assert(nir->info.workgroup_size[1] % 2 == 0);
} else if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_LINEAR) {
ASSERTED unsigned workgroup_size =
nir->info.workgroup_size[0] *
nir->info.workgroup_size[1] *
nir->info.workgroup_size[2];
assert(workgroup_size % 4 == 0);
}
}
if (devinfo->verx10 >= 125 && prog_data &&
nir->info.stage == MESA_SHADER_COMPUTE &&
nir->info.cs.derivative_group != DERIVATIVE_GROUP_QUADS &&
!nir->info.workgroup_size_variable &&
util_is_power_of_two_nonzero(nir->info.workgroup_size[0]) &&
util_is_power_of_two_nonzero(nir->info.workgroup_size[1])) {
state.hw_generated_local_id = true;
/* TODO: more heuristics about 1D/SLM access vs. 2D access */
bool linear =
BITSET_TEST(nir->info.system_values_read,
SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) ||
(nir->info.workgroup_size[1] == 1 &&
nir->info.workgroup_size[2] == 1) ||
(nir->info.num_images == 0 && nir->info.num_textures == 0);
prog_data->walk_order =
linear ? INTEL_WALK_ORDER_XYZ : INTEL_WALK_ORDER_YXZ;
/* nir_lower_compute_system_values will replace any references to
* SYSTEM_VALUE_LOCAL_INVOCATION_ID vector components with zero for
* any dimension where the workgroup size is 1, so we can skip
* generating those. However, the hardware can only generate
* X, XY, or XYZ - it can't skip earlier components.
*/
prog_data->generate_local_id =
(nir->info.workgroup_size[0] > 1 ? WRITEMASK_X : 0) |
(nir->info.workgroup_size[1] > 1 ? WRITEMASK_XY : 0) |
(nir->info.workgroup_size[2] > 1 ? WRITEMASK_XYZ : 0);
}
nir_foreach_function_impl(impl, nir) {
state.impl = impl;
lower_cs_intrinsics_convert_impl(&state);
}
return state.progress;
}

View file

@@ -0,0 +1,273 @@
/*
* Copyright (c) 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir_rt.h"
#include "brw_nir_rt_builder.h"
static nir_function_impl *
lower_any_hit_for_intersection(nir_shader *any_hit)
{
nir_function_impl *impl = nir_shader_get_entrypoint(any_hit);
/* Any-hit shaders need three parameters */
assert(impl->function->num_params == 0);
nir_parameter params[] = {
{
/* A pointer to a boolean value for whether or not the hit was
* accepted.
*/
.num_components = 1,
.bit_size = 32,
},
{
/* The hit T value */
.num_components = 1,
.bit_size = 32,
},
{
/* The hit kind */
.num_components = 1,
.bit_size = 32,
},
};
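/* After this rewrite the any-hit entrypoint behaves like a function with
 * the C-style signature:
 *
 *    void any_hit(bool *commit, float hit_t, uint32_t hit_kind);
 */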
impl->function->num_params = ARRAY_SIZE(params);
impl->function->params =
ralloc_array(any_hit, nir_parameter, ARRAY_SIZE(params));
memcpy(impl->function->params, params, sizeof(params));
nir_builder build = nir_builder_at(nir_before_impl(impl));
nir_builder *b = &build;
nir_def *commit_ptr = nir_load_param(b, 0);
nir_def *hit_t = nir_load_param(b, 1);
nir_def *hit_kind = nir_load_param(b, 2);
nir_deref_instr *commit =
nir_build_deref_cast(b, commit_ptr, nir_var_function_temp,
glsl_bool_type(), 0);
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
switch (instr->type) {
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_ignore_ray_intersection:
b->cursor = nir_instr_remove(&intrin->instr);
/* We put the newly emitted code inside a dummy if because it's
* going to contain a jump instruction and we don't want to
* deal with that mess here. It'll get dealt with by our
* control-flow optimization passes.
*/
nir_store_deref(b, commit, nir_imm_false(b), 0x1);
nir_push_if(b, nir_imm_true(b));
nir_jump(b, nir_jump_return);
nir_pop_if(b, NULL);
break;
case nir_intrinsic_terminate_ray:
/* The "normal" handling of terminateRay works fine in
* intersection shaders.
*/
break;
case nir_intrinsic_load_ray_t_max:
nir_def_rewrite_uses(&intrin->def,
hit_t);
nir_instr_remove(&intrin->instr);
break;
case nir_intrinsic_load_ray_hit_kind:
nir_def_rewrite_uses(&intrin->def,
hit_kind);
nir_instr_remove(&intrin->instr);
break;
default:
break;
}
break;
}
case nir_instr_type_jump: {
/* Stomp any halts to returns since they only return from the
* any-hit shader and not necessarily from the intersection
* shader. This is safe to do because we've already asserted
* that we only have the one function.
*/
nir_jump_instr *jump = nir_instr_as_jump(instr);
if (jump->type == nir_jump_halt)
jump->type = nir_jump_return;
break;
}
default:
break;
}
}
}
nir_validate_shader(any_hit, "after initial any-hit lowering");
nir_lower_returns_impl(impl);
nir_validate_shader(any_hit, "after lowering returns");
return impl;
}
void
brw_nir_lower_intersection_shader(nir_shader *intersection,
const nir_shader *any_hit,
const struct intel_device_info *devinfo)
{
void *dead_ctx = ralloc_context(intersection);
nir_function_impl *any_hit_impl = NULL;
struct hash_table *any_hit_var_remap = NULL;
if (any_hit) {
nir_shader *any_hit_tmp = nir_shader_clone(dead_ctx, any_hit);
NIR_PASS_V(any_hit_tmp, nir_opt_dce);
any_hit_impl = lower_any_hit_for_intersection(any_hit_tmp);
any_hit_var_remap = _mesa_pointer_hash_table_create(dead_ctx);
}
nir_function_impl *impl = nir_shader_get_entrypoint(intersection);
nir_builder build = nir_builder_at(nir_before_impl(impl));
nir_builder *b = &build;
nir_def *t_addr = brw_nir_rt_mem_hit_addr(b, false /* committed */);
nir_variable *commit =
nir_local_variable_create(impl, glsl_bool_type(), "ray_commit");
nir_store_var(b, commit, nir_imm_false(b), 0x1);
assert(impl->end_block->predecessors->entries == 1);
set_foreach(impl->end_block->predecessors, block_entry) {
struct nir_block *block = (void *)block_entry->key;
b->cursor = nir_after_block_before_jump(block);
nir_push_if(b, nir_load_var(b, commit));
{
/* Set the "valid" bit in mem_hit */
nir_def *ray_addr = brw_nir_rt_mem_hit_addr(b, false /* committed */);
nir_def *flags_dw_addr = nir_iadd_imm(b, ray_addr, 12);
nir_store_global(b, flags_dw_addr, 4,
nir_ior(b, nir_load_global(b, flags_dw_addr, 4, 1, 32),
nir_imm_int(b, 1 << 16)), 0x1 /* write_mask */);
nir_accept_ray_intersection(b);
}
nir_push_else(b, NULL);
{
nir_ignore_ray_intersection(b);
}
nir_pop_if(b, NULL);
break;
}
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
switch (instr->type) {
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_report_ray_intersection: {
b->cursor = nir_instr_remove(&intrin->instr);
nir_def *hit_t = intrin->src[0].ssa;
nir_def *hit_kind = intrin->src[1].ssa;
nir_def *min_t = nir_load_ray_t_min(b);
struct brw_nir_rt_mem_ray_defs ray_def;
brw_nir_rt_load_mem_ray(b, &ray_def, BRW_RT_BVH_LEVEL_WORLD);
struct brw_nir_rt_mem_hit_defs hit_in = {};
brw_nir_rt_load_mem_hit(b, &hit_in, false);
nir_def *max_t = ray_def.t_far;
/* bool commit_tmp = false; */
nir_variable *commit_tmp =
nir_local_variable_create(impl, glsl_bool_type(),
"commit_tmp");
nir_store_var(b, commit_tmp, nir_imm_false(b), 0x1);
nir_push_if(b, nir_iand(b, nir_fge(b, hit_t, min_t),
nir_fge(b, max_t, hit_t)));
{
/* Any-hit defaults to commit */
nir_store_var(b, commit_tmp, nir_imm_true(b), 0x1);
if (any_hit_impl != NULL) {
nir_push_if(b, nir_inot(b, nir_load_leaf_opaque_intel(b)));
{
nir_def *params[] = {
&nir_build_deref_var(b, commit_tmp)->def,
hit_t,
hit_kind,
};
nir_inline_function_impl(b, any_hit_impl, params,
any_hit_var_remap);
}
nir_pop_if(b, NULL);
}
nir_push_if(b, nir_load_var(b, commit_tmp));
{
nir_store_var(b, commit, nir_imm_true(b), 0x1);
nir_def *ray_addr =
brw_nir_rt_mem_ray_addr(b, brw_nir_rt_stack_addr(b), BRW_RT_BVH_LEVEL_WORLD);
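/* Offset 16 + 12 = 28 within MemRay is the t_far dword; shortening the
 * ray here keeps further traversal from finding a farther hit.
 */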
nir_store_global(b, nir_iadd_imm(b, ray_addr, 16 + 12), 4, hit_t, 0x1);
nir_store_global(b, t_addr, 4,
nir_vec2(b, nir_fmin(b, hit_t, hit_in.t), hit_kind),
0x3);
}
nir_pop_if(b, NULL);
}
nir_pop_if(b, NULL);
nir_def *accepted = nir_load_var(b, commit_tmp);
nir_def_rewrite_uses(&intrin->def,
accepted);
break;
}
default:
break;
}
break;
}
default:
break;
}
}
}
nir_metadata_preserve(impl, nir_metadata_none);
/* We did some inlining; have to re-index SSA defs */
nir_index_ssa_defs(impl);
ralloc_free(dead_ctx);
}

View file

@@ -0,0 +1,567 @@
/*
* Copyright (c) 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir_rt.h"
#include "brw_nir_rt_builder.h"
#include "nir_deref.h"
#include "util/macros.h"
struct lowering_state {
const struct intel_device_info *devinfo;
nir_function_impl *impl;
struct hash_table *queries;
uint32_t n_queries;
struct brw_nir_rt_globals_defs globals;
nir_def *rq_globals;
};
struct brw_ray_query {
nir_variable *opaque_var;
nir_variable *internal_var;
uint32_t id;
};
#define SIZEOF_QUERY_STATE (sizeof(uint32_t))
static bool
need_spill_fill(struct lowering_state *state)
{
return state->n_queries > 1;
}
/**
 * This pass converts opaque RayQuery structures from SPIR-V into a vec3 where
 * the first 2 elements store a global address for the query and the third
 * element is a counter incremented on each executed
 * nir_intrinsic_rq_proceed.
 */
static void
register_opaque_var(nir_variable *opaque_var, struct lowering_state *state)
{
struct hash_entry *entry = _mesa_hash_table_search(state->queries, opaque_var);
assert(entry == NULL);
struct brw_ray_query *rq = rzalloc(state->queries, struct brw_ray_query);
rq->opaque_var = opaque_var;
rq->id = state->n_queries;
unsigned aoa_size = glsl_get_aoa_size(opaque_var->type);
state->n_queries += MAX2(1, aoa_size);
_mesa_hash_table_insert(state->queries, opaque_var, rq);
}
static void
create_internal_var(struct brw_ray_query *rq, struct lowering_state *state)
{
const struct glsl_type *opaque_type = rq->opaque_var->type;
const struct glsl_type *internal_type = glsl_uint16_t_type();
while (glsl_type_is_array(opaque_type)) {
assert(!glsl_type_is_unsized_array(opaque_type));
internal_type = glsl_array_type(internal_type,
glsl_array_size(opaque_type),
0);
opaque_type = glsl_get_array_element(opaque_type);
}
rq->internal_var = nir_local_variable_create(state->impl,
internal_type,
NULL);
}
static nir_def *
get_ray_query_shadow_addr(nir_builder *b,
nir_deref_instr *deref,
struct lowering_state *state,
nir_deref_instr **out_state_deref)
{
nir_deref_path path;
nir_deref_path_init(&path, deref, NULL);
assert(path.path[0]->deref_type == nir_deref_type_var);
nir_variable *opaque_var = nir_deref_instr_get_variable(path.path[0]);
struct hash_entry *entry = _mesa_hash_table_search(state->queries, opaque_var);
assert(entry);
struct brw_ray_query *rq = entry->data;
/* Base address in the shadow memory of the variable associated with this
* ray query variable.
*/
nir_def *base_addr =
nir_iadd_imm(b, state->globals.resume_sbt_addr,
brw_rt_ray_queries_shadow_stack_size(state->devinfo) * rq->id);
bool spill_fill = need_spill_fill(state);
*out_state_deref = nir_build_deref_var(b, rq->internal_var);
if (!spill_fill)
return NULL;
/* Just emit code and let constant-folding go to town */
nir_deref_instr **p = &path.path[1];
for (; *p; p++) {
if ((*p)->deref_type == nir_deref_type_array) {
nir_def *index = (*p)->arr.index.ssa;
/* Walk the internal-state deref in lockstep with the opaque deref. */
*out_state_deref = nir_build_deref_array(b, *out_state_deref, index);
/* Each array element covers the shadow stacks of all queries below it. */
uint64_t size = MAX2(1, glsl_get_aoa_size((*p)->type)) *
brw_rt_ray_queries_shadow_stack_size(state->devinfo);
nir_def *mul = nir_amul_imm(b, nir_i2i64(b, index), size);
base_addr = nir_iadd(b, base_addr, mul);
} else {
unreachable("Unsupported deref type");
}
}
nir_deref_path_finish(&path);
/* Add the lane offset to the shadow memory address */
nir_def *lane_offset =
nir_imul_imm(
b,
nir_iadd(
b,
nir_imul(
b,
brw_load_btd_dss_id(b),
brw_nir_rt_load_num_simd_lanes_per_dss(b, state->devinfo)),
brw_nir_rt_sync_stack_id(b)),
BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
return nir_iadd(b, base_addr, nir_i2i64(b, lane_offset));
}
static void
update_trace_ctrl_level(nir_builder *b,
nir_deref_instr *state_deref,
nir_def **out_old_ctrl,
nir_def **out_old_level,
nir_def *new_ctrl,
nir_def *new_level)
{
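/* The internal state variable is a uint16 packing the last trace-ray
 * control value and BVH level: bits [1:0] hold the level and bits [15:2]
 * hold the control value (see the shifts below).
 */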
nir_def *old_value = nir_load_deref(b, state_deref);
nir_def *old_ctrl = nir_ishr_imm(b, old_value, 2);
nir_def *old_level = nir_iand_imm(b, old_value, 0x3);
if (out_old_ctrl)
*out_old_ctrl = old_ctrl;
if (out_old_level)
*out_old_level = old_level;
if (new_ctrl)
new_ctrl = nir_i2i16(b, new_ctrl);
if (new_level)
new_level = nir_i2i16(b, new_level);
if (new_ctrl || new_level) {
if (!new_ctrl)
new_ctrl = old_ctrl;
if (!new_level)
new_level = old_level;
nir_def *new_value = nir_ior(b, nir_ishl_imm(b, new_ctrl, 2), new_level);
nir_store_deref(b, state_deref, new_value, 0x1);
}
}
static void
fill_query(nir_builder *b,
nir_def *hw_stack_addr,
nir_def *shadow_stack_addr,
nir_def *ctrl)
{
brw_nir_memcpy_global(b, hw_stack_addr, 64, shadow_stack_addr, 64,
BRW_RT_SIZEOF_RAY_QUERY);
}
static void
spill_query(nir_builder *b,
nir_def *hw_stack_addr,
nir_def *shadow_stack_addr)
{
brw_nir_memcpy_global(b, shadow_stack_addr, 64, hw_stack_addr, 64,
BRW_RT_SIZEOF_RAY_QUERY);
}
static void
lower_ray_query_intrinsic(nir_builder *b,
nir_intrinsic_instr *intrin,
struct lowering_state *state)
{
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
b->cursor = nir_instr_remove(&intrin->instr);
nir_deref_instr *ctrl_level_deref;
nir_def *shadow_stack_addr =
get_ray_query_shadow_addr(b, deref, state, &ctrl_level_deref);
nir_def *hw_stack_addr =
brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr, state->devinfo);
nir_def *stack_addr = shadow_stack_addr ? shadow_stack_addr : hw_stack_addr;
switch (intrin->intrinsic) {
case nir_intrinsic_rq_initialize: {
nir_def *as_addr = intrin->src[1].ssa;
nir_def *ray_flags = intrin->src[2].ssa;
/* From the SPIR-V spec:
*
* "Only the 8 least-significant bits of Cull Mask are used by
* this instruction - other bits are ignored.
*
* Only the 16 least-significant bits of Miss Index are used by
* this instruction - other bits are ignored."
*/
nir_def *cull_mask = nir_iand_imm(b, intrin->src[3].ssa, 0xff);
nir_def *ray_orig = intrin->src[4].ssa;
nir_def *ray_t_min = intrin->src[5].ssa;
nir_def *ray_dir = intrin->src[6].ssa;
nir_def *ray_t_max = intrin->src[7].ssa;
nir_def *root_node_ptr =
brw_nir_rt_acceleration_structure_to_root_node(b, as_addr);
struct brw_nir_rt_mem_ray_defs ray_defs = {
.root_node_ptr = root_node_ptr,
.ray_flags = nir_u2u16(b, ray_flags),
.ray_mask = cull_mask,
.orig = ray_orig,
.t_near = ray_t_min,
.dir = ray_dir,
.t_far = ray_t_max,
};
nir_def *ray_addr =
brw_nir_rt_mem_ray_addr(b, stack_addr, BRW_RT_BVH_LEVEL_WORLD);
brw_nir_rt_query_mark_init(b, stack_addr);
brw_nir_rt_store_mem_ray_query_at_addr(b, ray_addr, &ray_defs);
update_trace_ctrl_level(b, ctrl_level_deref,
NULL, NULL,
nir_imm_int(b, GEN_RT_TRACE_RAY_INITAL),
nir_imm_int(b, BRW_RT_BVH_LEVEL_WORLD));
break;
}
case nir_intrinsic_rq_proceed: {
nir_def *not_done =
nir_inot(b, brw_nir_rt_query_done(b, stack_addr));
nir_def *not_done_then, *not_done_else;
nir_push_if(b, not_done);
{
nir_def *ctrl, *level;
update_trace_ctrl_level(b, ctrl_level_deref,
&ctrl, &level,
NULL,
NULL);
/* Mark the query as done before handing it over to the HW for
* processing. If the HW makes any progress, it will write back some
* data and, as a side effect, clear the "done" bit. If no progress is
* made, the HW does not write anything back and we can use this bit to
* detect that.
*/
brw_nir_rt_query_mark_done(b, stack_addr);
if (shadow_stack_addr)
fill_query(b, hw_stack_addr, shadow_stack_addr, ctrl);
nir_trace_ray_intel(b, state->rq_globals, level, ctrl, .synchronous = true);
struct brw_nir_rt_mem_hit_defs hit_in = {};
brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, hw_stack_addr, false);
if (shadow_stack_addr)
spill_query(b, hw_stack_addr, shadow_stack_addr);
update_trace_ctrl_level(b, ctrl_level_deref,
NULL, NULL,
nir_imm_int(b, GEN_RT_TRACE_RAY_CONTINUE),
hit_in.bvh_level);
not_done_then = nir_inot(b, hit_in.done);
}
nir_push_else(b, NULL);
{
not_done_else = nir_imm_false(b);
}
nir_pop_if(b, NULL);
not_done = nir_if_phi(b, not_done_then, not_done_else);
nir_def_rewrite_uses(&intrin->def, not_done);
break;
}
case nir_intrinsic_rq_confirm_intersection: {
brw_nir_memcpy_global(b,
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, true), 16,
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, false), 16,
BRW_RT_SIZEOF_HIT_INFO);
update_trace_ctrl_level(b, ctrl_level_deref,
NULL, NULL,
nir_imm_int(b, GEN_RT_TRACE_RAY_COMMIT),
nir_imm_int(b, BRW_RT_BVH_LEVEL_OBJECT));
break;
}
case nir_intrinsic_rq_generate_intersection: {
brw_nir_rt_generate_hit_addr(b, stack_addr, intrin->src[1].ssa);
update_trace_ctrl_level(b, ctrl_level_deref,
NULL, NULL,
nir_imm_int(b, GEN_RT_TRACE_RAY_COMMIT),
nir_imm_int(b, BRW_RT_BVH_LEVEL_OBJECT));
break;
}
case nir_intrinsic_rq_terminate: {
brw_nir_rt_query_mark_done(b, stack_addr);
break;
}
case nir_intrinsic_rq_load: {
const bool committed = nir_intrinsic_committed(intrin);
struct brw_nir_rt_mem_ray_defs world_ray_in = {};
struct brw_nir_rt_mem_ray_defs object_ray_in = {};
struct brw_nir_rt_mem_hit_defs hit_in = {};
brw_nir_rt_load_mem_ray_from_addr(b, &world_ray_in, stack_addr,
BRW_RT_BVH_LEVEL_WORLD);
brw_nir_rt_load_mem_ray_from_addr(b, &object_ray_in, stack_addr,
BRW_RT_BVH_LEVEL_OBJECT);
brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr, committed);
nir_def *sysval = NULL;
switch (nir_intrinsic_ray_query_value(intrin)) {
case nir_ray_query_value_intersection_type:
if (committed) {
/* Values we want to generate :
*
* RayQueryCommittedIntersectionNoneEXT = 0U <= hit_in.valid == false
* RayQueryCommittedIntersectionTriangleEXT = 1U <= hit_in.leaf_type == BRW_RT_BVH_NODE_TYPE_QUAD (4)
* RayQueryCommittedIntersectionGeneratedEXT = 2U <= hit_in.leaf_type == BRW_RT_BVH_NODE_TYPE_PROCEDURAL (3)
*/
sysval =
nir_bcsel(b, nir_ieq_imm(b, hit_in.leaf_type, 4),
nir_imm_int(b, 1), nir_imm_int(b, 2));
sysval =
nir_bcsel(b, hit_in.valid,
sysval, nir_imm_int(b, 0));
} else {
/* 0 -> triangle, 1 -> AABB */
sysval =
nir_b2i32(b,
nir_ieq_imm(b, hit_in.leaf_type,
BRW_RT_BVH_NODE_TYPE_PROCEDURAL));
}
break;
case nir_ray_query_value_intersection_t:
sysval = hit_in.t;
break;
case nir_ray_query_value_intersection_instance_custom_index: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.instance_id;
break;
}
case nir_ray_query_value_intersection_instance_id: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.instance_index;
break;
}
case nir_ray_query_value_intersection_instance_sbt_index: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.contribution_to_hit_group_index;
break;
}
case nir_ray_query_value_intersection_geometry_index: {
nir_def *geometry_index_dw =
nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
1, 32);
sysval = nir_iand_imm(b, geometry_index_dw, BITFIELD_MASK(29));
break;
}
case nir_ray_query_value_intersection_primitive_index:
sysval = brw_nir_rt_load_primitive_id_from_hit(b, NULL /* is_procedural */, &hit_in);
break;
case nir_ray_query_value_intersection_barycentrics:
sysval = hit_in.tri_bary;
break;
case nir_ray_query_value_intersection_front_face:
sysval = hit_in.front_face;
break;
case nir_ray_query_value_intersection_object_ray_direction:
sysval = world_ray_in.dir;
break;
case nir_ray_query_value_intersection_object_ray_origin:
sysval = world_ray_in.orig;
break;
case nir_ray_query_value_intersection_object_to_world: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.object_to_world[nir_intrinsic_column(intrin)];
break;
}
case nir_ray_query_value_intersection_world_to_object: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.world_to_object[nir_intrinsic_column(intrin)];
break;
}
case nir_ray_query_value_intersection_candidate_aabb_opaque:
sysval = hit_in.front_face;
break;
case nir_ray_query_value_tmin:
sysval = world_ray_in.t_near;
break;
case nir_ray_query_value_flags:
sysval = nir_u2u32(b, world_ray_in.ray_flags);
break;
case nir_ray_query_value_world_ray_direction:
sysval = world_ray_in.dir;
break;
case nir_ray_query_value_world_ray_origin:
sysval = world_ray_in.orig;
break;
case nir_ray_query_value_intersection_triangle_vertex_positions: {
struct brw_nir_rt_bvh_primitive_leaf_positions_defs pos;
brw_nir_rt_load_bvh_primitive_leaf_positions(b, &pos, hit_in.prim_leaf_ptr);
sysval = pos.positions[nir_intrinsic_column(intrin)];
break;
}
default:
unreachable("Invalid ray query");
}
assert(sysval);
nir_def_rewrite_uses(&intrin->def, sysval);
break;
}
default:
unreachable("Invalid intrinsic");
}
}
static void
lower_ray_query_impl(nir_function_impl *impl, struct lowering_state *state)
{
nir_builder _b, *b = &_b;
_b = nir_builder_at(nir_before_impl(impl));
state->rq_globals = nir_load_ray_query_global_intel(b);
brw_nir_rt_load_globals_addr(b, &state->globals, state->rq_globals);
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_rq_initialize &&
intrin->intrinsic != nir_intrinsic_rq_terminate &&
intrin->intrinsic != nir_intrinsic_rq_proceed &&
intrin->intrinsic != nir_intrinsic_rq_generate_intersection &&
intrin->intrinsic != nir_intrinsic_rq_confirm_intersection &&
intrin->intrinsic != nir_intrinsic_rq_load)
continue;
lower_ray_query_intrinsic(b, intrin, state);
}
}
nir_metadata_preserve(impl, nir_metadata_none);
}
bool
brw_nir_lower_ray_queries(nir_shader *shader,
const struct intel_device_info *devinfo)
{
assert(exec_list_length(&shader->functions) == 1);
struct lowering_state state = {
.devinfo = devinfo,
.impl = nir_shader_get_entrypoint(shader),
.queries = _mesa_pointer_hash_table_create(NULL),
};
/* Map each query variable to an internal state variable */
nir_foreach_function_temp_variable(var, state.impl)
register_opaque_var(var, &state);
hash_table_foreach(state.queries, entry)
create_internal_var(entry->data, &state);
bool progress = state.n_queries > 0;
if (progress) {
lower_ray_query_impl(state.impl, &state);
nir_remove_dead_derefs(shader);
nir_remove_dead_variables(shader,
nir_var_shader_temp | nir_var_function_temp,
NULL);
nir_metadata_preserve(state.impl, nir_metadata_none);
}
ralloc_free(state.queries);
return progress;
}

View file

@@ -0,0 +1,386 @@
/*
* Copyright (c) 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir_rt.h"
#include "brw_nir_rt_builder.h"
static nir_def *
build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
{
switch (b->shader->info.stage) {
case MESA_SHADER_ANY_HIT:
/* Any-hit shaders are always compiled into intersection shaders for
* procedural geometry. If we got here in an any-hit shader, it's for
* triangles.
*/
return nir_imm_false(b);
case MESA_SHADER_INTERSECTION:
return nir_imm_true(b);
default:
return nir_ieq_imm(b, hit->leaf_type,
BRW_RT_BVH_NODE_TYPE_PROCEDURAL);
}
}
static void
lower_rt_intrinsics_impl(nir_function_impl *impl,
const struct intel_device_info *devinfo)
{
bool progress = false;
nir_builder build = nir_builder_at(nir_before_impl(impl));
nir_builder *b = &build;
struct brw_nir_rt_globals_defs globals;
brw_nir_rt_load_globals(b, &globals);
nir_def *hotzone_addr = brw_nir_rt_sw_hotzone_addr(b, devinfo);
nir_def *hotzone = nir_load_global(b, hotzone_addr, 16, 4, 32);
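/* The hotzone is a uvec4: component 0 holds the per-thread scratch stack
 * offset and components 1-3 hold the ray launch ID (hence the 0xe channel
 * mask used for load_ray_launch_id below).
 */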
gl_shader_stage stage = b->shader->info.stage;
struct brw_nir_rt_mem_ray_defs world_ray_in = {};
struct brw_nir_rt_mem_ray_defs object_ray_in = {};
struct brw_nir_rt_mem_hit_defs hit_in = {};
switch (stage) {
case MESA_SHADER_ANY_HIT:
case MESA_SHADER_CLOSEST_HIT:
case MESA_SHADER_INTERSECTION:
brw_nir_rt_load_mem_hit(b, &hit_in,
stage == MESA_SHADER_CLOSEST_HIT);
brw_nir_rt_load_mem_ray(b, &object_ray_in,
BRW_RT_BVH_LEVEL_OBJECT);
FALLTHROUGH;
case MESA_SHADER_MISS:
brw_nir_rt_load_mem_ray(b, &world_ray_in,
BRW_RT_BVH_LEVEL_WORLD);
break;
default:
break;
}
nir_def *thread_stack_base_addr = brw_nir_rt_sw_stack_addr(b, devinfo);
nir_def *stack_base_offset = nir_channel(b, hotzone, 0);
nir_def *stack_base_addr =
nir_iadd(b, thread_stack_base_addr, nir_u2u64(b, stack_base_offset));
ASSERTED bool seen_scratch_base_ptr_load = false;
ASSERTED bool found_resume = false;
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
b->cursor = nir_after_instr(&intrin->instr);
nir_def *sysval = NULL;
switch (intrin->intrinsic) {
case nir_intrinsic_load_scratch_base_ptr:
assert(nir_intrinsic_base(intrin) == 1);
seen_scratch_base_ptr_load = true;
sysval = stack_base_addr;
break;
case nir_intrinsic_btd_stack_push_intel: {
int32_t stack_size = nir_intrinsic_stack_size(intrin);
if (stack_size > 0) {
nir_def *child_stack_offset =
nir_iadd_imm(b, stack_base_offset, stack_size);
nir_store_global(b, hotzone_addr, 16, child_stack_offset, 0x1);
}
nir_instr_remove(instr);
break;
}
case nir_intrinsic_rt_resume:
/* This is the first "interesting" instruction */
assert(block == nir_start_block(impl));
assert(!seen_scratch_base_ptr_load);
found_resume = true;
int32_t stack_size = nir_intrinsic_stack_size(intrin);
if (stack_size > 0) {
stack_base_offset =
nir_iadd_imm(b, stack_base_offset, -stack_size);
nir_store_global(b, hotzone_addr, 16, stack_base_offset, 0x1);
stack_base_addr = nir_iadd(b, thread_stack_base_addr,
nir_u2u64(b, stack_base_offset));
}
nir_instr_remove(instr);
break;
case nir_intrinsic_load_uniform: {
/* We don't want to lower this in the launch trampoline. */
if (stage == MESA_SHADER_COMPUTE)
break;
sysval = brw_nir_load_global_const(b, intrin,
nir_load_btd_global_arg_addr_intel(b),
BRW_RT_PUSH_CONST_OFFSET);
break;
}
case nir_intrinsic_load_ray_launch_id:
sysval = nir_channels(b, hotzone, 0xe);
break;
case nir_intrinsic_load_ray_launch_size:
sysval = globals.launch_size;
break;
case nir_intrinsic_load_ray_world_origin:
sysval = world_ray_in.orig;
break;
case nir_intrinsic_load_ray_world_direction:
sysval = world_ray_in.dir;
break;
case nir_intrinsic_load_ray_object_origin:
sysval = object_ray_in.orig;
break;
case nir_intrinsic_load_ray_object_direction:
sysval = object_ray_in.dir;
break;
case nir_intrinsic_load_ray_t_min:
/* It shouldn't matter which we pull this from */
sysval = world_ray_in.t_near;
break;
case nir_intrinsic_load_ray_t_max:
if (stage == MESA_SHADER_MISS)
sysval = world_ray_in.t_far;
else
sysval = hit_in.t;
break;
case nir_intrinsic_load_primitive_id:
sysval = brw_nir_rt_load_primitive_id_from_hit(b,
build_leaf_is_procedural(b, &hit_in),
&hit_in);
break;
case nir_intrinsic_load_instance_id: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.instance_index;
break;
}
case nir_intrinsic_load_ray_object_to_world: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.object_to_world[nir_intrinsic_column(intrin)];
break;
}
case nir_intrinsic_load_ray_world_to_object: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.world_to_object[nir_intrinsic_column(intrin)];
break;
}
case nir_intrinsic_load_ray_hit_kind: {
nir_def *tri_hit_kind =
nir_bcsel(b, hit_in.front_face,
nir_imm_int(b, BRW_RT_HIT_KIND_FRONT_FACE),
nir_imm_int(b, BRW_RT_HIT_KIND_BACK_FACE));
sysval = nir_bcsel(b, build_leaf_is_procedural(b, &hit_in),
hit_in.aabb_hit_kind, tri_hit_kind);
break;
}
case nir_intrinsic_load_ray_flags:
/* We need to fetch the original ray flags we stored in the
* leaf pointer, because the actual ray flags we get here
* will include any flags passed on the pipeline at creation
* time, and the spec for IncomingRayFlagsKHR says:
*
*    "Setting pipeline flags on the raytracing pipeline must not
*     cause any corresponding flags to be set in variables with
*     this decoration."
*/
sysval = nir_u2u32(b, world_ray_in.inst_leaf_ptr);
break;
case nir_intrinsic_load_cull_mask:
sysval = nir_u2u32(b, world_ray_in.ray_mask);
break;
case nir_intrinsic_load_ray_geometry_index: {
nir_def *geometry_index_dw =
nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
1, 32);
sysval = nir_iand_imm(b, geometry_index_dw, BITFIELD_MASK(29));
break;
}
case nir_intrinsic_load_ray_instance_custom_index: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.instance_id;
break;
}
case nir_intrinsic_load_shader_record_ptr:
/* We can't handle this intrinsic in resume shaders because the
* handle we get there won't be from the original SBT. The shader
* call lowering/splitting pass should have ensured that this
* value was spilled from the initial shader and unspilled in any
* resume shaders that need it.
*/
assert(!found_resume);
sysval = nir_load_btd_local_arg_addr_intel(b);
break;
case nir_intrinsic_load_ray_base_mem_addr_intel:
sysval = globals.base_mem_addr;
break;
case nir_intrinsic_load_ray_hw_stack_size_intel:
sysval = nir_imul_imm(b, globals.hw_stack_size, 64);
break;
case nir_intrinsic_load_ray_sw_stack_size_intel:
sysval = nir_imul_imm(b, globals.sw_stack_size, 64);
break;
case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
sysval = globals.num_dss_rt_stacks;
break;
case nir_intrinsic_load_ray_hit_sbt_addr_intel:
sysval = globals.hit_sbt_addr;
break;
case nir_intrinsic_load_ray_hit_sbt_stride_intel:
sysval = globals.hit_sbt_stride;
break;
case nir_intrinsic_load_ray_miss_sbt_addr_intel:
sysval = globals.miss_sbt_addr;
break;
case nir_intrinsic_load_ray_miss_sbt_stride_intel:
sysval = globals.miss_sbt_stride;
break;
case nir_intrinsic_load_callable_sbt_addr_intel:
sysval = globals.call_sbt_addr;
break;
case nir_intrinsic_load_callable_sbt_stride_intel:
sysval = globals.call_sbt_stride;
break;
case nir_intrinsic_load_btd_resume_sbt_addr_intel:
sysval = nir_pack_64_2x32_split(b,
nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW),
nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH));
break;
case nir_intrinsic_load_leaf_procedural_intel:
sysval = build_leaf_is_procedural(b, &hit_in);
break;
case nir_intrinsic_load_ray_triangle_vertex_positions: {
struct brw_nir_rt_bvh_primitive_leaf_positions_defs pos;
brw_nir_rt_load_bvh_primitive_leaf_positions(b, &pos, hit_in.prim_leaf_ptr);
sysval = pos.positions[nir_intrinsic_column(intrin)];
break;
}
case nir_intrinsic_load_leaf_opaque_intel: {
if (stage == MESA_SHADER_INTERSECTION) {
/* In intersection shaders, the opaque bit is passed to us in
* the front_face bit.
*/
sysval = hit_in.front_face;
} else {
nir_def *flags_dw =
nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
1, 32);
sysval = nir_i2b(b, nir_iand_imm(b, flags_dw, 1u << 30));
}
break;
}
default:
continue;
}
progress = true;
if (sysval) {
nir_def_rewrite_uses(&intrin->def,
sysval);
nir_instr_remove(&intrin->instr);
}
}
}
nir_metadata_preserve(impl,
progress ?
nir_metadata_none :
(nir_metadata_block_index |
nir_metadata_dominance));
}
/** Lower ray-tracing system values and intrinsics
*
* In most 3D shader stages, intrinsics are a fairly thin wrapper around
* hardware functionality and system values represent magic bits that come
* into the shader from FF hardware. Ray-tracing, however, looks a bit more
* like the OpenGL 1.0 world where the underlying hardware is simple and most
* of the API implementation is software.
*
* In particular, most things that are treated as system values (or built-ins
* in SPIR-V) don't get magically dropped into registers for us. Instead, we
* have to fetch them from the relevant data structures shared with the
* ray-tracing hardware. Most come from either the RT_DISPATCH_GLOBALS or
* from one of the MemHit data structures. Some, such as primitive_id, require
* us to fetch the leaf address from the MemHit struct and then manually read
* the data out of the BVH. Instead of trying to emit all this code deep in
* the back-end where we can't effectively optimize it, we lower it all to
* global memory access in NIR.
*
* Once this pass is complete, the only real system values left are the two
* argument pointer system values for BTD dispatch: btd_local_arg_addr and
* btd_global_arg_addr.
*/
void
brw_nir_lower_rt_intrinsics(nir_shader *nir,
const struct intel_device_info *devinfo)
{
nir_foreach_function_impl(impl, nir) {
lower_rt_intrinsics_impl(impl, devinfo);
}
}

View file

@@ -0,0 +1,329 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir_rt.h"
#include "brw_nir_rt_builder.h"
#include "nir_phi_builder.h"
UNUSED static bool
no_load_scratch_base_ptr_intrinsic(nir_shader *shader)
{
nir_foreach_function_impl(impl, shader) {
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic == nir_intrinsic_load_scratch_base_ptr)
return false;
}
}
}
return true;
}
/** Insert the appropriate return instruction at the end of the shader */
void
brw_nir_lower_shader_returns(nir_shader *shader)
{
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
/* Reserve scratch space at the start of the shader's per-thread scratch
* space for the return BINDLESS_SHADER_RECORD address and data payload.
* When a shader is called, the calling shader will write the return BSR
* address in this region of the callee's scratch space.
*
* We could also put it at the end of the caller's scratch space. However,
* doing it this way means that a shader never accesses its caller's scratch
* space unless given an explicit pointer (such as for ray payloads). It
* also makes computing the address easier given that we want to apply an
* alignment to the scratch offset to ensure we can make alignment
* assumptions in the called shader.
*
* This isn't needed for ray-gen shaders because they end the thread and
* never return to the calling trampoline shader.
*/
assert(no_load_scratch_base_ptr_intrinsic(shader));
if (shader->info.stage != MESA_SHADER_RAYGEN)
shader->scratch_size += BRW_BTD_STACK_CALLEE_DATA_SIZE;
nir_builder b = nir_builder_create(impl);
set_foreach(impl->end_block->predecessors, block_entry) {
struct nir_block *block = (void *)block_entry->key;
b.cursor = nir_after_block_before_jump(block);
switch (shader->info.stage) {
case MESA_SHADER_RAYGEN:
/* A raygen shader is always the root of the shader call tree. When
* it ends, we retire the bindless stack ID and no further shaders
* will be executed.
*/
assert(impl->end_block->predecessors->entries == 1);
brw_nir_btd_retire(&b);
break;
case MESA_SHADER_ANY_HIT:
/* The default action of an any-hit shader is to accept the ray
* intersection. Any-hit shaders may have more than one exit. Only
* the final "normal" exit will actually need to accept the
* intersection as any others should come from nir_jump_halt
* instructions inserted after ignore_ray_intersection or
* terminate_ray or the like. However, inserting an accept after
* the ignore or terminate is safe because it'll get deleted later.
*/
nir_accept_ray_intersection(&b);
break;
case MESA_SHADER_CALLABLE:
case MESA_SHADER_MISS:
case MESA_SHADER_CLOSEST_HIT:
/* Callable, miss, and closest-hit shaders don't take any special
* action at the end. They simply return back to the previous shader
* in the call stack.
*/
assert(impl->end_block->predecessors->entries == 1);
brw_nir_btd_return(&b);
break;
case MESA_SHADER_INTERSECTION:
/* This will be handled by brw_nir_lower_intersection_shader */
break;
default:
unreachable("Invalid callable shader stage");
}
}
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
}
static void
store_resume_addr(nir_builder *b, nir_intrinsic_instr *call)
{
uint32_t call_idx = nir_intrinsic_call_idx(call);
uint32_t offset = nir_intrinsic_stack_size(call);
/* First thing on the called shader's stack is the resume address
* followed by a pointer to the payload.
*/
nir_def *resume_record_addr =
nir_iadd_imm(b, nir_load_btd_resume_sbt_addr_intel(b),
call_idx * BRW_BTD_RESUME_SBT_STRIDE);
/* By the time we get here, any remaining shader/function memory
* pointers have been lowered to SSA values.
*/
nir_def *payload_addr =
nir_get_shader_call_payload_src(call)->ssa;
brw_nir_rt_store_scratch(b, offset, BRW_BTD_STACK_ALIGN,
nir_vec2(b, resume_record_addr, payload_addr),
0xf /* write_mask */);
nir_btd_stack_push_intel(b, offset);
}
static bool
lower_shader_trace_ray_instr(struct nir_builder *b, nir_instr *instr, void *data)
{
struct brw_bs_prog_key *key = data;
if (instr->type != nir_instr_type_intrinsic)
return false;
/* Leave nir_intrinsic_rt_resume to be lowered by
* brw_nir_lower_rt_intrinsics()
*/
nir_intrinsic_instr *call = nir_instr_as_intrinsic(instr);
if (call->intrinsic != nir_intrinsic_rt_trace_ray)
return false;
b->cursor = nir_instr_remove(instr);
store_resume_addr(b, call);
nir_def *as_addr = call->src[0].ssa;
nir_def *ray_flags = call->src[1].ssa;
/* From the SPIR-V spec:
*
* "Only the 8 least-significant bits of Cull Mask are used by this
* instruction - other bits are ignored.
*
* Only the 4 least-significant bits of SBT Offset and SBT Stride are
* used by this instruction - other bits are ignored.
*
* Only the 16 least-significant bits of Miss Index are used by this
* instruction - other bits are ignored."
*/
nir_def *cull_mask = nir_iand_imm(b, call->src[2].ssa, 0xff);
nir_def *sbt_offset = nir_iand_imm(b, call->src[3].ssa, 0xf);
nir_def *sbt_stride = nir_iand_imm(b, call->src[4].ssa, 0xf);
nir_def *miss_index = nir_iand_imm(b, call->src[5].ssa, 0xffff);
nir_def *ray_orig = call->src[6].ssa;
nir_def *ray_t_min = call->src[7].ssa;
nir_def *ray_dir = call->src[8].ssa;
nir_def *ray_t_max = call->src[9].ssa;
nir_def *root_node_ptr =
brw_nir_rt_acceleration_structure_to_root_node(b, as_addr);
/* The hardware packet requires an address to the first element of the
* hit SBT.
*
* In order to calculate this, we must multiply the "SBT Offset"
* provided to OpTraceRay by the SBT stride provided for the hit SBT in
* the call to vkCmdTraceRay() and add that to the base address of the
* hit SBT. This stride is not to be confused with the "SBT Stride"
* provided to OpTraceRay which is in units of this stride. It's a
* rather terrible overload of the word "stride". The hardware docs
* call the SPIR-V stride value the "shader index multiplier", which is
* a much more sane name.
*/
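/* For example, with a hit SBT whose vkCmdTraceRays stride is 64B, an
 * OpTraceRay "SBT Offset" of 3 selects the record at base + 3 * 64, while
 * the instruction's own "SBT Stride" only scales per-geometry record
 * selection via shader_index_multiplier.
 */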
nir_def *hit_sbt_stride_B =
nir_load_ray_hit_sbt_stride_intel(b);
nir_def *hit_sbt_offset_B =
nir_imul(b, sbt_offset, nir_u2u32(b, hit_sbt_stride_B));
nir_def *hit_sbt_addr =
nir_iadd(b, nir_load_ray_hit_sbt_addr_intel(b),
nir_u2u64(b, hit_sbt_offset_B));
/* The hardware packet takes an address to the miss BSR. */
nir_def *miss_sbt_stride_B =
nir_load_ray_miss_sbt_stride_intel(b);
nir_def *miss_sbt_offset_B =
nir_imul(b, miss_index, nir_u2u32(b, miss_sbt_stride_B));
nir_def *miss_sbt_addr =
nir_iadd(b, nir_load_ray_miss_sbt_addr_intel(b),
nir_u2u64(b, miss_sbt_offset_B));
struct brw_nir_rt_mem_ray_defs ray_defs = {
.root_node_ptr = root_node_ptr,
/* Combine the shader value given to traceRayEXT() with the pipeline
* creation value VkPipelineCreateFlags.
*/
.ray_flags = nir_ior_imm(b, nir_u2u16(b, ray_flags), key->pipeline_ray_flags),
.ray_mask = cull_mask,
.hit_group_sr_base_ptr = hit_sbt_addr,
.hit_group_sr_stride = nir_u2u16(b, hit_sbt_stride_B),
.miss_sr_ptr = miss_sbt_addr,
.orig = ray_orig,
.t_near = ray_t_min,
.dir = ray_dir,
.t_far = ray_t_max,
.shader_index_multiplier = sbt_stride,
/* The instance leaf pointer is unused in the top level BVH traversal
* since we always start from the root node. We can reuse that field to
* store the ray_flags handed to traceRayEXT(). This will be reloaded
* when the shader accesses gl_IncomingRayFlagsEXT (see
* nir_intrinsic_load_ray_flags brw_nir_lower_rt_intrinsic.c)
*/
.inst_leaf_ptr = nir_u2u64(b, ray_flags),
};
brw_nir_rt_store_mem_ray(b, &ray_defs, BRW_RT_BVH_LEVEL_WORLD);
nir_trace_ray_intel(b,
nir_load_btd_global_arg_addr_intel(b),
nir_imm_int(b, BRW_RT_BVH_LEVEL_WORLD),
nir_imm_int(b, GEN_RT_TRACE_RAY_INITAL),
.synchronous = false);
return true;
}
static bool
lower_shader_call_instr(struct nir_builder *b, nir_intrinsic_instr *call,
void *data)
{
if (call->intrinsic != nir_intrinsic_rt_execute_callable)
return false;
b->cursor = nir_instr_remove(&call->instr);
store_resume_addr(b, call);
nir_def *sbt_offset32 =
nir_imul(b, call->src[0].ssa,
nir_u2u32(b, nir_load_callable_sbt_stride_intel(b)));
nir_def *sbt_addr =
nir_iadd(b, nir_load_callable_sbt_addr_intel(b),
nir_u2u64(b, sbt_offset32));
brw_nir_btd_spawn(b, sbt_addr);
return true;
}
bool
brw_nir_lower_shader_calls(nir_shader *shader, struct brw_bs_prog_key *key)
{
bool a = nir_shader_instructions_pass(shader,
lower_shader_trace_ray_instr,
nir_metadata_none,
key);
bool b = nir_shader_intrinsics_pass(shader, lower_shader_call_instr,
nir_metadata_block_index |
nir_metadata_dominance,
NULL);
return a || b;
}
/** Creates a trivial return shader
*
* In most cases this shader doesn't actually do anything. It just needs to
* return to the caller.
*
* By default, our HW can handle the case where a shader is not available and
* will execute the next shader in the tracing call sequence. For instance, a
* RAYGEN shader traces a ray, the tracing generates a hit, but there is no
* ANYHIT shader available. The HW should follow up by executing the
* CLOSESTHIT shader.
*
* This default behavior can be changed through the RT_CTRL register
* (privileged access) and when NULL shader checks are disabled, the HW will
* instead call the call stack handler (this shader). This is what i915 is
* doing as part of Wa_14013202645.
*
* In order to ensure the call to the CLOSESTHIT shader, this shader needs to
* commit the ray and not proceed with the BTD return. Similarly, when the
* same thing happens with the INTERSECTION shader, we should just carry on
* the ray traversal with the continue operation.
*
*/
nir_shader *
brw_nir_create_trivial_return_shader(const struct brw_compiler *compiler,
void *mem_ctx)
{
const nir_shader_compiler_options *nir_options =
compiler->nir_options[MESA_SHADER_CALLABLE];
nir_builder _b = nir_builder_init_simple_shader(MESA_SHADER_CALLABLE,
nir_options,
"RT Trivial Return");
nir_builder *b = &_b;
ralloc_steal(mem_ctx, b->shader);
nir_shader *nir = b->shader;
NIR_PASS_V(nir, brw_nir_lower_shader_returns);
return nir;
}

View file

@ -0,0 +1,765 @@
/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "isl/isl.h"
#include "brw_nir.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/nir/nir_format_convert.h"
static nir_def *
_load_image_param(nir_builder *b, nir_deref_instr *deref, unsigned offset)
{
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(b->shader,
nir_intrinsic_image_deref_load_param_intel);
load->src[0] = nir_src_for_ssa(&deref->def);
nir_intrinsic_set_base(load, offset / 4);
switch (offset) {
case ISL_IMAGE_PARAM_OFFSET_OFFSET:
case ISL_IMAGE_PARAM_SWIZZLING_OFFSET:
load->num_components = 2;
break;
case ISL_IMAGE_PARAM_TILING_OFFSET:
case ISL_IMAGE_PARAM_SIZE_OFFSET:
load->num_components = 3;
break;
case ISL_IMAGE_PARAM_STRIDE_OFFSET:
load->num_components = 4;
break;
default:
unreachable("Invalid param offset");
}
nir_def_init(&load->instr, &load->def, load->num_components, 32);
nir_builder_instr_insert(b, &load->instr);
return &load->def;
}
#define load_image_param(b, d, o) \
_load_image_param(b, d, ISL_IMAGE_PARAM_##o##_OFFSET)
static nir_def *
image_coord_is_in_bounds(nir_builder *b, nir_deref_instr *deref,
nir_def *coord)
{
nir_def *size = load_image_param(b, deref, SIZE);
nir_def *cmp = nir_ilt(b, coord, size);
unsigned coord_comps = glsl_get_sampler_coordinate_components(deref->type);
nir_def *in_bounds = nir_imm_true(b);
for (unsigned i = 0; i < coord_comps; i++)
in_bounds = nir_iand(b, in_bounds, nir_channel(b, cmp, i));
return in_bounds;
}
/** Calculate the offset in memory of the texel given by \p coord.
*
* This is meant to be used with untyped surface messages to access a tiled
* surface, which involves manually taking into account the tiling and
* swizzling modes of the surface, so it will hopefully not happen very often.
*
* The tiling algorithm implemented here matches either the X or Y tiling
* layouts supported by the hardware depending on the tiling coefficients
* passed to the program as uniforms. See Volume 1 Part 2 Section 4.5
* "Address Tiling Function" of the IVB PRM for an in-depth explanation of
* the hardware tiling format.
*/
static nir_def *
image_address(nir_builder *b, const struct intel_device_info *devinfo,
nir_deref_instr *deref, nir_def *coord)
{
if (glsl_get_sampler_dim(deref->type) == GLSL_SAMPLER_DIM_1D &&
glsl_sampler_type_is_array(deref->type)) {
/* It's easier if 1D arrays are treated like 2D arrays */
coord = nir_vec3(b, nir_channel(b, coord, 0),
nir_imm_int(b, 0),
nir_channel(b, coord, 1));
} else {
unsigned dims = glsl_get_sampler_coordinate_components(deref->type);
coord = nir_trim_vector(b, coord, dims);
}
nir_def *offset = load_image_param(b, deref, OFFSET);
nir_def *tiling = load_image_param(b, deref, TILING);
nir_def *stride = load_image_param(b, deref, STRIDE);
/* Shift the coordinates by the fixed surface offset. It may be non-zero
* if the image is a single slice of a higher-dimensional surface, or if a
* non-zero mipmap level of the surface is bound to the pipeline. The
* offset needs to be applied here rather than at surface state set-up time
* because the desired slice-level may start mid-tile, so simply shifting
* the surface base address wouldn't give a well-formed tiled surface in
* the general case.
*/
nir_def *xypos = (coord->num_components == 1) ?
nir_vec2(b, coord, nir_imm_int(b, 0)) :
nir_trim_vector(b, coord, 2);
xypos = nir_iadd(b, xypos, offset);
/* The layout of 3-D textures in memory is sort-of like a tiling
* format. At each miplevel, the slices are arranged in rows of
* 2^level slices per row. The slice row is stored in tmp.y and
* the slice within the row is stored in tmp.x.
*
* The layout of 2-D array textures and cubemaps is much simpler:
* Depending on whether the ARYSPC_LOD0 layout is in use it will be
* stored in memory as an array of slices, each one being a 2-D
* arrangement of miplevels, or as a 2D arrangement of miplevels,
* each one being an array of slices. In either case the separation
* between slices of the same LOD is equal to the qpitch value
* provided as stride.w.
*
* This code can be made to handle both 2-D arrays and 3-D textures
* by passing in the miplevel as tile.z for 3-D textures and 0 in
* tile.z for 2-D array textures.
*
* See Volume 1 Part 1 of the Gfx7 PRM, sections 6.18.4.7 "Surface
* Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
* of the hardware 3D texture and 2D array layouts.
*/
if (coord->num_components > 2) {
/* Decompose z into a major (tmp.y) and a minor (tmp.x)
* index.
*/
nir_def *z = nir_channel(b, coord, 2);
nir_def *z_x = nir_ubfe(b, z, nir_imm_int(b, 0),
nir_channel(b, tiling, 2));
nir_def *z_y = nir_ushr(b, z, nir_channel(b, tiling, 2));
/* Take into account the horizontal (tmp.x) and vertical (tmp.y)
* slice offset.
*/
xypos = nir_iadd(b, xypos, nir_imul(b, nir_vec2(b, z_x, z_y),
nir_channels(b, stride, 0xc)));
}
nir_def *addr;
if (coord->num_components > 1) {
/* Calculate the major/minor x and y indices. In order to
* accommodate both X and Y tiling, the Y-major tiling format is
* treated as being a bunch of narrow X-tiles placed next to each
* other. This means that the tile width for Y-tiling is actually
* the width of one sub-column of the Y-major tile where each 4K
* tile has 8 512B sub-columns.
*
* The major Y value is the row of tiles in which the pixel lives.
* The major X value is the tile sub-column in which the pixel
* lives; for X tiling, this is the same as the tile column, for Y
* tiling, each tile has 8 sub-columns. The minor X and Y indices
* are the position within the sub-column.
*/
/* Calculate the minor x and y indices. */
nir_def *minor = nir_ubfe(b, xypos, nir_imm_int(b, 0),
nir_trim_vector(b, tiling, 2));
nir_def *major = nir_ushr(b, xypos, nir_trim_vector(b, tiling, 2));
/* Calculate the texel index from the start of the tile row and the
* vertical coordinate of the row.
* Equivalent to:
* tmp.x = (major.x << tile.y << tile.x) +
* (minor.y << tile.x) + minor.x
* tmp.y = major.y << tile.y
*/
nir_def *idx_x, *idx_y;
idx_x = nir_ishl(b, nir_channel(b, major, 0), nir_channel(b, tiling, 1));
idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 1));
idx_x = nir_ishl(b, idx_x, nir_channel(b, tiling, 0));
idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 0));
idx_y = nir_ishl(b, nir_channel(b, major, 1), nir_channel(b, tiling, 1));
/* Add it to the start of the tile row. */
nir_def *idx;
idx = nir_imul(b, idx_y, nir_channel(b, stride, 1));
idx = nir_iadd(b, idx, idx_x);
/* Multiply by the Bpp value. */
addr = nir_imul(b, idx, nir_channel(b, stride, 0));
if (devinfo->ver < 8 && devinfo->platform != INTEL_PLATFORM_BYT) {
/* Take into account the two dynamically specified shifts. Both are
* used to implement swizzling of X-tiled surfaces. For Y-tiled
* surfaces only one bit needs to be XOR-ed with bit 6 of the memory
* address, so a swz value of 0xff (actually interpreted as 31 by the
* hardware) will be provided to cause the relevant bit of tmp.y to
* be zero and turn the first XOR into the identity. For linear
* surfaces or platforms lacking address swizzling, both shifts will
* be 0xff, causing the relevant bits of both tmp.x and tmp.y to be
* zero, which effectively disables swizzling.
*/
nir_def *swizzle = load_image_param(b, deref, SWIZZLING);
nir_def *shift0 = nir_ushr(b, addr, nir_channel(b, swizzle, 0));
nir_def *shift1 = nir_ushr(b, addr, nir_channel(b, swizzle, 1));
/* XOR tmp.x and tmp.y with bit 6 of the memory address. */
nir_def *bit = nir_iand(b, nir_ixor(b, shift0, shift1),
nir_imm_int(b, 1 << 6));
addr = nir_ixor(b, addr, bit);
}
} else {
/* Multiply by the Bpp/stride value. Note that the addr.y may be
* non-zero even if the image is one-dimensional because a vertical
* offset may have been applied above to select a non-zero slice or
* level of a higher-dimensional texture.
*/
nir_def *idx;
idx = nir_imul(b, nir_channel(b, xypos, 1), nir_channel(b, stride, 1));
idx = nir_iadd(b, nir_channel(b, xypos, 0), idx);
addr = nir_imul(b, idx, nir_channel(b, stride, 0));
}
return addr;
}
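/* Worked example of the index math above (purely illustrative, assuming
 * hypothetical 8x8-texel tiles, i.e. tiling = (3, 3, 0)): for xypos = (13, 10),
 *
 *    minor = (13 & 7, 10 & 7) = (5, 2)
 *    major = (13 >> 3, 10 >> 3) = (1, 1)
 *    idx_x = (1 << 3 << 3) + (2 << 3) + 5 = 85
 *    idx_y = 1 << 3 = 8
 *    addr  = (8 * stride.y + 85) * Bpp
 *
 * which matches the tmp.x/tmp.y equations quoted in the comment inside the
 * function.
 */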
struct format_info {
const struct isl_format_layout *fmtl;
unsigned chans;
unsigned bits[4];
};
static struct format_info
get_format_info(enum isl_format fmt)
{
const struct isl_format_layout *fmtl = isl_format_get_layout(fmt);
return (struct format_info) {
.fmtl = fmtl,
.chans = isl_format_get_num_channels(fmt),
.bits = {
fmtl->channels.r.bits,
fmtl->channels.g.bits,
fmtl->channels.b.bits,
fmtl->channels.a.bits
},
};
}
static nir_def *
convert_color_for_load(nir_builder *b, const struct intel_device_info *devinfo,
nir_def *color,
enum isl_format image_fmt, enum isl_format lower_fmt,
unsigned dest_components)
{
if (image_fmt == lower_fmt)
goto expand_vec;
if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
assert(lower_fmt == ISL_FORMAT_R32_UINT);
color = nir_format_unpack_11f11f10f(b, color);
goto expand_vec;
}
struct format_info image = get_format_info(image_fmt);
struct format_info lower = get_format_info(lower_fmt);
const bool needs_sign_extension =
isl_format_has_snorm_channel(image_fmt) ||
isl_format_has_sint_channel(image_fmt);
/* We only check the red channel to detect if we need to pack/unpack */
assert(image.bits[0] != lower.bits[0] ||
memcmp(image.bits, lower.bits, sizeof(image.bits)) == 0);
if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
if (needs_sign_extension)
color = nir_format_unpack_sint(b, color, image.bits, image.chans);
else
color = nir_format_unpack_uint(b, color, image.bits, image.chans);
} else {
/* All these formats are homogeneous */
for (unsigned i = 1; i < image.chans; i++)
assert(image.bits[i] == image.bits[0]);
/* On IVB, we rely on the undocumented behavior that typed reads from
* surfaces of the unsupported R8 and R16 formats return useful data in
* their least significant bits. However, the data in the high bits is
* garbage so we have to discard it.
*/
if (devinfo->verx10 == 70 &&
(lower_fmt == ISL_FORMAT_R16_UINT ||
lower_fmt == ISL_FORMAT_R8_UINT))
color = nir_format_mask_uvec(b, color, lower.bits);
if (image.bits[0] != lower.bits[0]) {
color = nir_format_bitcast_uvec_unmasked(b, color, lower.bits[0],
image.bits[0]);
}
if (needs_sign_extension)
color = nir_format_sign_extend_ivec(b, color, image.bits);
}
switch (image.fmtl->channels.r.type) {
case ISL_UNORM:
assert(isl_format_has_uint_channel(lower_fmt));
color = nir_format_unorm_to_float(b, color, image.bits);
break;
case ISL_SNORM:
assert(isl_format_has_uint_channel(lower_fmt));
color = nir_format_snorm_to_float(b, color, image.bits);
break;
case ISL_SFLOAT:
if (image.bits[0] == 16)
color = nir_unpack_half_2x16_split_x(b, color);
break;
case ISL_UINT:
case ISL_SINT:
break;
default:
unreachable("Invalid image channel type");
}
expand_vec:
assert(dest_components == 1 || dest_components == 4);
assert(color->num_components <= dest_components);
if (color->num_components == dest_components)
return color;
nir_def *comps[4];
for (unsigned i = 0; i < color->num_components; i++)
comps[i] = nir_channel(b, color, i);
for (unsigned i = color->num_components; i < 3; i++)
comps[i] = nir_imm_int(b, 0);
if (color->num_components < 4) {
if (isl_format_has_int_channel(image_fmt))
comps[3] = nir_imm_int(b, 1);
else
comps[3] = nir_imm_float(b, 1);
}
return nir_vec(b, comps, dest_components);
}
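/* Example flow through convert_color_for_load() (assuming RGBA8_UNORM is
 * lowered to R32_UINT, which is what isl_lower_storage_image_format() does
 * on hardware without native support): the raw 32-bit dword is split into
 * four 8-bit channels by nir_format_unpack_uint(), converted to floats by
 * nir_format_unorm_to_float(), and finally expanded to a vec4 (with an
 * implicit alpha of 1.0 if the format had fewer than four components).
 */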
static bool
lower_image_load_instr(nir_builder *b,
const struct intel_device_info *devinfo,
nir_intrinsic_instr *intrin,
bool sparse)
{
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
nir_variable *var = nir_deref_instr_get_variable(deref);
if (var->data.image.format == PIPE_FORMAT_NONE)
return false;
const enum isl_format image_fmt =
isl_format_for_pipe_format(var->data.image.format);
if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
const enum isl_format lower_fmt =
isl_lower_storage_image_format(devinfo, image_fmt);
const unsigned dest_components =
sparse ? (intrin->num_components - 1) : intrin->num_components;
/* Use an undef to hold the uses of the load while we do the color
* conversion.
*/
nir_def *placeholder = nir_undef(b, 4, 32);
nir_def_rewrite_uses(&intrin->def, placeholder);
intrin->num_components = isl_format_get_num_channels(lower_fmt);
intrin->def.num_components = intrin->num_components;
b->cursor = nir_after_instr(&intrin->instr);
nir_def *color = convert_color_for_load(b, devinfo,
&intrin->def,
image_fmt, lower_fmt,
dest_components);
if (sparse) {
/* Put the sparse component back on the original instruction */
intrin->num_components++;
intrin->def.num_components = intrin->num_components;
/* Carry over the sparse component without modifying it with the
* converted color.
*/
nir_def *sparse_color[NIR_MAX_VEC_COMPONENTS];
for (unsigned i = 0; i < dest_components; i++)
sparse_color[i] = nir_channel(b, color, i);
sparse_color[dest_components] =
nir_channel(b, &intrin->def, intrin->num_components - 1);
color = nir_vec(b, sparse_color, dest_components + 1);
}
nir_def_rewrite_uses(placeholder, color);
nir_instr_remove(placeholder->parent_instr);
} else {
/* This code path is only useful prior to Gfx9; we have no plans to
* enable sparse there.
*/
assert(!sparse);
const struct isl_format_layout *image_fmtl =
isl_format_get_layout(image_fmt);
/* We have a matching typed format for everything 32b and below */
assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
ISL_FORMAT_R32G32_UINT :
ISL_FORMAT_R32G32B32A32_UINT;
const unsigned dest_components = intrin->num_components;
b->cursor = nir_instr_remove(&intrin->instr);
nir_def *coord = intrin->src[1].ssa;
nir_def *do_load = image_coord_is_in_bounds(b, deref, coord);
if (devinfo->verx10 == 70) {
/* Check whether the first stride component (i.e. the Bpp value)
* is greater than four, which on Gfx7 indicates that a surface of
* type RAW has been bound for untyped access. Reading or writing
* to a surface of type other than RAW using untyped surface
* messages causes a hang on IVB and VLV.
*/
nir_def *stride = load_image_param(b, deref, STRIDE);
nir_def *is_raw =
nir_igt_imm(b, nir_channel(b, stride, 0), 4);
do_load = nir_iand(b, do_load, is_raw);
}
nir_push_if(b, do_load);
nir_def *addr = image_address(b, devinfo, deref, coord);
nir_def *load =
nir_image_deref_load_raw_intel(b, image_fmtl->bpb / 32, 32,
&deref->def, addr);
nir_push_else(b, NULL);
nir_def *zero = nir_imm_zero(b, load->num_components, 32);
nir_pop_if(b, NULL);
nir_def *value = nir_if_phi(b, load, zero);
nir_def *color = convert_color_for_load(b, devinfo, value,
image_fmt, raw_fmt,
dest_components);
nir_def_rewrite_uses(&intrin->def, color);
}
return true;
}
static nir_def *
convert_color_for_store(nir_builder *b, const struct intel_device_info *devinfo,
nir_def *color,
enum isl_format image_fmt, enum isl_format lower_fmt)
{
struct format_info image = get_format_info(image_fmt);
struct format_info lower = get_format_info(lower_fmt);
color = nir_trim_vector(b, color, image.chans);
if (image_fmt == lower_fmt)
return color;
if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
assert(lower_fmt == ISL_FORMAT_R32_UINT);
return nir_format_pack_11f11f10f(b, color);
}
switch (image.fmtl->channels.r.type) {
case ISL_UNORM:
assert(isl_format_has_uint_channel(lower_fmt));
color = nir_format_float_to_unorm(b, color, image.bits);
break;
case ISL_SNORM:
assert(isl_format_has_uint_channel(lower_fmt));
color = nir_format_float_to_snorm(b, color, image.bits);
break;
case ISL_SFLOAT:
if (image.bits[0] == 16)
color = nir_format_float_to_half(b, color);
break;
case ISL_UINT:
color = nir_format_clamp_uint(b, color, image.bits);
break;
case ISL_SINT:
color = nir_format_clamp_sint(b, color, image.bits);
break;
default:
unreachable("Invalid image channel type");
}
if (image.bits[0] < 32 &&
(isl_format_has_snorm_channel(image_fmt) ||
isl_format_has_sint_channel(image_fmt)))
color = nir_format_mask_uvec(b, color, image.bits);
if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
color = nir_format_pack_uint(b, color, image.bits, image.chans);
} else {
/* All these formats are homogeneous */
for (unsigned i = 1; i < image.chans; i++)
assert(image.bits[i] == image.bits[0]);
if (image.bits[0] != lower.bits[0]) {
color = nir_format_bitcast_uvec_unmasked(b, color, image.bits[0],
lower.bits[0]);
}
}
return color;
}
static bool
lower_image_store_instr(nir_builder *b,
const struct intel_device_info *devinfo,
nir_intrinsic_instr *intrin)
{
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
nir_variable *var = nir_deref_instr_get_variable(deref);
/* For write-only surfaces, we trust that the hardware can just do the
* conversion for us.
*/
if (var->data.access & ACCESS_NON_READABLE)
return false;
if (var->data.image.format == PIPE_FORMAT_NONE)
return false;
const enum isl_format image_fmt =
isl_format_for_pipe_format(var->data.image.format);
if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
const enum isl_format lower_fmt =
isl_lower_storage_image_format(devinfo, image_fmt);
/* Color conversion goes before the store */
b->cursor = nir_before_instr(&intrin->instr);
nir_def *color = convert_color_for_store(b, devinfo,
intrin->src[3].ssa,
image_fmt, lower_fmt);
intrin->num_components = isl_format_get_num_channels(lower_fmt);
nir_src_rewrite(&intrin->src[3], color);
} else {
const struct isl_format_layout *image_fmtl =
isl_format_get_layout(image_fmt);
/* We have a matching typed format for everything 32b and below */
assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
ISL_FORMAT_R32G32_UINT :
ISL_FORMAT_R32G32B32A32_UINT;
b->cursor = nir_instr_remove(&intrin->instr);
nir_def *coord = intrin->src[1].ssa;
nir_def *do_store = image_coord_is_in_bounds(b, deref, coord);
if (devinfo->verx10 == 70) {
/* Check whether the first stride component (i.e. the Bpp value)
* is greater than four, which on Gfx7 indicates that a surface of
* type RAW has been bound for untyped access. Reading or writing
* to a surface of type other than RAW using untyped surface
* messages causes a hang on IVB and VLV.
*/
nir_def *stride = load_image_param(b, deref, STRIDE);
nir_def *is_raw =
nir_igt_imm(b, nir_channel(b, stride, 0), 4);
do_store = nir_iand(b, do_store, is_raw);
}
nir_push_if(b, do_store);
nir_def *addr = image_address(b, devinfo, deref, coord);
nir_def *color = convert_color_for_store(b, devinfo,
intrin->src[3].ssa,
image_fmt, raw_fmt);
nir_intrinsic_instr *store =
nir_intrinsic_instr_create(b->shader,
nir_intrinsic_image_deref_store_raw_intel);
store->src[0] = nir_src_for_ssa(&deref->def);
store->src[1] = nir_src_for_ssa(addr);
store->src[2] = nir_src_for_ssa(color);
store->num_components = image_fmtl->bpb / 32;
nir_builder_instr_insert(b, &store->instr);
nir_pop_if(b, NULL);
}
return true;
}
static bool
lower_image_atomic_instr(nir_builder *b,
const struct intel_device_info *devinfo,
nir_intrinsic_instr *intrin)
{
if (devinfo->verx10 >= 75)
return false;
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
b->cursor = nir_instr_remove(&intrin->instr);
/* Use an undef to hold the uses of the atomic result while we guard it with
* the image-is-bound check below.
*/
nir_def *placeholder = nir_undef(b, 4, 32);
nir_def_rewrite_uses(&intrin->def, placeholder);
/* Check the first component of the size field to find out if the
* image is bound. Necessary on IVB for typed atomics because
* they don't seem to respect null surfaces and will happily
* corrupt or read random memory when no image is bound.
*/
nir_def *size = load_image_param(b, deref, SIZE);
nir_def *zero = nir_imm_int(b, 0);
nir_push_if(b, nir_ine(b, nir_channel(b, size, 0), zero));
nir_builder_instr_insert(b, &intrin->instr);
nir_pop_if(b, NULL);
nir_def *result = nir_if_phi(b, &intrin->def, zero);
nir_def_rewrite_uses(placeholder, result);
return true;
}
static bool
lower_image_size_instr(nir_builder *b,
const struct intel_device_info *devinfo,
nir_intrinsic_instr *intrin)
{
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
nir_variable *var = nir_deref_instr_get_variable(deref);
/* For write-only images, we have an actual image surface so we fall back
* and let the back-end emit a TXS for this.
*/
if (var->data.access & ACCESS_NON_READABLE)
return false;
if (var->data.image.format == PIPE_FORMAT_NONE)
return false;
/* If we have a matching typed format, then we have an actual image surface
* so we fall back and let the back-end emit a TXS for this.
*/
const enum isl_format image_fmt =
isl_format_for_pipe_format(var->data.image.format);
if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt))
return false;
assert(nir_src_as_uint(intrin->src[1]) == 0);
b->cursor = nir_instr_remove(&intrin->instr);
nir_def *size = load_image_param(b, deref, SIZE);
nir_def *comps[4] = { NULL, NULL, NULL, NULL };
assert(nir_intrinsic_image_dim(intrin) != GLSL_SAMPLER_DIM_CUBE);
unsigned coord_comps = glsl_get_sampler_coordinate_components(deref->type);
for (unsigned c = 0; c < coord_comps; c++)
comps[c] = nir_channel(b, size, c);
for (unsigned c = coord_comps; c < intrin->def.num_components; ++c)
comps[c] = nir_imm_int(b, 1);
nir_def *vec = nir_vec(b, comps, intrin->def.num_components);
nir_def_rewrite_uses(&intrin->def, vec);
return true;
}
static bool
brw_nir_lower_storage_image_instr(nir_builder *b,
nir_instr *instr,
void *cb_data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
const struct brw_nir_lower_storage_image_opts *opts = cb_data;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_image_deref_load:
if (opts->lower_loads)
return lower_image_load_instr(b, opts->devinfo, intrin, false);
return false;
case nir_intrinsic_image_deref_sparse_load:
if (opts->lower_loads)
return lower_image_load_instr(b, opts->devinfo, intrin, true);
return false;
case nir_intrinsic_image_deref_store:
if (opts->lower_stores)
return lower_image_store_instr(b, opts->devinfo, intrin);
return false;
case nir_intrinsic_image_deref_atomic:
case nir_intrinsic_image_deref_atomic_swap:
if (opts->lower_atomics)
return lower_image_atomic_instr(b, opts->devinfo, intrin);
return false;
case nir_intrinsic_image_deref_size:
if (opts->lower_get_size)
return lower_image_size_instr(b, opts->devinfo, intrin);
return false;
default:
/* Nothing to do */
return false;
}
}
bool
brw_nir_lower_storage_image(nir_shader *shader,
const struct brw_nir_lower_storage_image_opts *opts)
{
bool progress = false;
const nir_lower_image_options image_options = {
.lower_cube_size = true,
.lower_image_samples_to_one = true,
};
progress |= nir_lower_image(shader, &image_options);
progress |= nir_shader_instructions_pass(shader,
brw_nir_lower_storage_image_instr,
nir_metadata_none,
(void *)opts);
return progress;
}
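/* Illustrative usage from a hypothetical driver, lowering everything except
 * size queries (which the back-end can handle with a TXS):
 *
 *    const struct brw_nir_lower_storage_image_opts opts = {
 *       .devinfo = devinfo,
 *       .lower_loads = true,
 *       .lower_stores = true,
 *       .lower_atomics = true,
 *       .lower_get_size = false,
 *    };
 *    NIR_PASS_V(nir, brw_nir_lower_storage_image, &opts);
 */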

View file

@ -0,0 +1,536 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "intel_nir.h"
#include "brw_nir_rt.h"
#include "brw_nir_rt_builder.h"
#include "intel_nir.h"
static bool
resize_deref(nir_builder *b, nir_deref_instr *deref,
unsigned num_components, unsigned bit_size)
{
if (deref->def.num_components == num_components &&
deref->def.bit_size == bit_size)
return false;
/* NIR requires array indices to match the deref bit size */
if (deref->def.bit_size != bit_size &&
(deref->deref_type == nir_deref_type_array ||
deref->deref_type == nir_deref_type_ptr_as_array)) {
b->cursor = nir_before_instr(&deref->instr);
nir_def *idx;
if (nir_src_is_const(deref->arr.index)) {
idx = nir_imm_intN_t(b, nir_src_as_int(deref->arr.index), bit_size);
} else {
idx = nir_i2iN(b, deref->arr.index.ssa, bit_size);
}
nir_src_rewrite(&deref->arr.index, idx);
}
deref->def.num_components = num_components;
deref->def.bit_size = bit_size;
return true;
}
static bool
lower_rt_io_derefs(nir_shader *shader)
{
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
bool progress = false;
unsigned num_shader_call_vars = 0;
nir_foreach_variable_with_modes(var, shader, nir_var_shader_call_data)
num_shader_call_vars++;
unsigned num_ray_hit_attrib_vars = 0;
nir_foreach_variable_with_modes(var, shader, nir_var_ray_hit_attrib)
num_ray_hit_attrib_vars++;
/* At most one payload is allowed because it's an input. Technically, this
* is also true for hit attribute variables. However, after we inline an
* any-hit shader into an intersection shader, we can end up with multiple
* hit attribute variables. They'll end up mapping to a cast from the same
* base pointer so this is fine.
*/
assert(num_shader_call_vars <= 1);
nir_builder b = nir_builder_at(nir_before_impl(impl));
nir_def *call_data_addr = NULL;
if (num_shader_call_vars > 0) {
assert(shader->scratch_size >= BRW_BTD_STACK_CALLEE_DATA_SIZE);
call_data_addr =
brw_nir_rt_load_scratch(&b, BRW_BTD_STACK_CALL_DATA_PTR_OFFSET, 8,
1, 64);
progress = true;
}
gl_shader_stage stage = shader->info.stage;
nir_def *hit_attrib_addr = NULL;
if (num_ray_hit_attrib_vars > 0) {
assert(stage == MESA_SHADER_ANY_HIT ||
stage == MESA_SHADER_CLOSEST_HIT ||
stage == MESA_SHADER_INTERSECTION);
nir_def *hit_addr =
brw_nir_rt_mem_hit_addr(&b, stage == MESA_SHADER_CLOSEST_HIT);
/* The vec2 barycentrics are in the 2nd and 3rd dwords of MemHit */
nir_def *bary_addr = nir_iadd_imm(&b, hit_addr, 4);
hit_attrib_addr = nir_bcsel(&b, nir_load_leaf_procedural_intel(&b),
brw_nir_rt_hit_attrib_data_addr(&b),
bary_addr);
progress = true;
}
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_deref)
continue;
nir_deref_instr *deref = nir_instr_as_deref(instr);
if (nir_deref_mode_is(deref, nir_var_shader_call_data)) {
deref->modes = nir_var_function_temp;
if (deref->deref_type == nir_deref_type_var) {
b.cursor = nir_before_instr(&deref->instr);
nir_deref_instr *cast =
nir_build_deref_cast(&b, call_data_addr,
nir_var_function_temp,
deref->var->type, 0);
nir_def_rewrite_uses(&deref->def,
&cast->def);
nir_instr_remove(&deref->instr);
progress = true;
}
} else if (nir_deref_mode_is(deref, nir_var_ray_hit_attrib)) {
deref->modes = nir_var_function_temp;
if (deref->deref_type == nir_deref_type_var) {
b.cursor = nir_before_instr(&deref->instr);
nir_deref_instr *cast =
nir_build_deref_cast(&b, hit_attrib_addr,
nir_var_function_temp,
deref->type, 0);
nir_def_rewrite_uses(&deref->def,
&cast->def);
nir_instr_remove(&deref->instr);
progress = true;
}
}
/* We're going to lower all function_temp memory to scratch using
* 64-bit addresses. We need to resize all our derefs first or else
* nir_lower_explicit_io will have a fit.
*/
if (nir_deref_mode_is(deref, nir_var_function_temp) &&
resize_deref(&b, deref, 1, 64))
progress = true;
}
}
if (progress) {
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
} else {
nir_metadata_preserve(impl, nir_metadata_all);
}
return progress;
}
/** Lowers ray-tracing shader I/O and scratch access
*
* SPV_KHR_ray_tracing adds three new types of I/O, each of which needs its
* own bit of special care:
*
* - Shader payload data: This is represented by the IncomingCallableData
* and IncomingRayPayload storage classes which are both represented by
* nir_var_call_data in NIR. There is at most one of these per-shader and
* they contain payload data passed down the stack from the parent shader
* when it calls executeCallable() or traceRay(). In our implementation,
* the actual storage lives in the calling shader's scratch space and we're
* passed a pointer to it.
*
* - Hit attribute data: This is represented by the HitAttribute storage
* class in SPIR-V and nir_var_ray_hit_attrib in NIR. For triangle
* geometry, it's supposed to contain two floats which are the barycentric
* coordinates. For AABB/procedural geometry, it contains the hit data
* written out by the intersection shader. In our implementation, it's a
* 64-bit pointer which points either to the u/v area of the relevant
* MemHit data structure or the space right after the HW ray stack entry.
*
* - Shader record buffer data: This allows read-only access to the data
* stored in the SBT right after the bindless shader handles. It's
* effectively a UBO with a magic address. Coming out of spirv_to_nir,
* we get a nir_intrinsic_load_shader_record_ptr which is cast to a
* nir_var_mem_global deref and all access happens through that. The
* shader_record_ptr system value is handled in brw_nir_lower_rt_intrinsics
* and we assume nir_lower_explicit_io is called elsewhere thanks to
* VK_KHR_buffer_device_address so there's really nothing to do here.
*
* We also handle lowering any remaining function_temp variables to scratch at
* this point. This gets rid of any remaining arrays and also takes care of
* the sending side of ray payloads where we pass pointers to a function_temp
* variable down the call stack.
*/
static void
lower_rt_io_and_scratch(nir_shader *nir)
{
/* First, we need to ensure all the I/O variables have explicit types. Because
* these are shader-internal and don't come in from outside, they don't
* have an explicit memory layout and we have to assign them one.
*/
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
nir_var_function_temp |
nir_var_shader_call_data |
nir_var_ray_hit_attrib,
glsl_get_natural_size_align_bytes);
/* Now patch any derefs to I/O vars */
NIR_PASS_V(nir, lower_rt_io_derefs);
/* Finally, lower any remaining function_temp, mem_constant, or
* ray_hit_attrib access to 64-bit global memory access.
*/
NIR_PASS_V(nir, nir_lower_explicit_io,
nir_var_function_temp |
nir_var_mem_constant |
nir_var_ray_hit_attrib,
nir_address_format_64bit_global);
}
static void
build_terminate_ray(nir_builder *b)
{
nir_def *skip_closest_hit = nir_test_mask(b, nir_load_ray_flags(b),
BRW_RT_RAY_FLAG_SKIP_CLOSEST_HIT_SHADER);
nir_push_if(b, skip_closest_hit);
{
/* The shader that calls traceRay() is unable to access any ray hit
* information except for that which is explicitly written into the ray
* payload by shaders invoked during the trace. If there's no closest-
* hit shader, then accepting the hit has no observable effect; it's
* just extra memory traffic for no reason.
*/
brw_nir_btd_return(b);
nir_jump(b, nir_jump_halt);
}
nir_push_else(b, NULL);
{
/* The closest hit shader is in the same shader group as the any-hit
* shader that we're currently in. We can get the address for its SBT
* handle by looking at the shader record pointer and subtracting the
* size of an SBT handle. The BINDLESS_SHADER_RECORD for a closest hit
* shader is the first one in the SBT handle.
*/
nir_def *closest_hit =
nir_iadd_imm(b, nir_load_shader_record_ptr(b),
-BRW_RT_SBT_HANDLE_SIZE);
brw_nir_rt_commit_hit(b);
brw_nir_btd_spawn(b, closest_hit);
nir_jump(b, nir_jump_halt);
}
nir_pop_if(b, NULL);
}
/** Lowers away ray walk intrinsics
*
* This lowers terminate_ray, ignore_ray_intersection, and the NIR-specific
* accept_ray_intersection intrinsics to the appropriate Intel-specific
* intrinsics.
*/
static bool
lower_ray_walk_intrinsics(nir_shader *shader,
const struct intel_device_info *devinfo)
{
assert(shader->info.stage == MESA_SHADER_ANY_HIT ||
shader->info.stage == MESA_SHADER_INTERSECTION);
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
nir_builder b = nir_builder_create(impl);
bool progress = false;
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_ignore_ray_intersection: {
b.cursor = nir_instr_remove(&intrin->instr);
/* We put the newly emitted code inside a dummy if because it's
* going to contain a jump instruction and we don't want to deal
* with that mess here. It'll get dealt with by our control-flow
* optimization passes.
*/
nir_push_if(&b, nir_imm_true(&b));
nir_trace_ray_intel(&b,
nir_load_btd_global_arg_addr_intel(&b),
nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT),
nir_imm_int(&b, GEN_RT_TRACE_RAY_CONTINUE),
.synchronous = false);
nir_jump(&b, nir_jump_halt);
nir_pop_if(&b, NULL);
progress = true;
break;
}
case nir_intrinsic_accept_ray_intersection: {
b.cursor = nir_instr_remove(&intrin->instr);
nir_def *terminate = nir_test_mask(&b, nir_load_ray_flags(&b),
BRW_RT_RAY_FLAG_TERMINATE_ON_FIRST_HIT);
nir_push_if(&b, terminate);
{
build_terminate_ray(&b);
}
nir_push_else(&b, NULL);
{
nir_trace_ray_intel(&b,
nir_load_btd_global_arg_addr_intel(&b),
nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT),
nir_imm_int(&b, GEN_RT_TRACE_RAY_COMMIT),
.synchronous = false);
nir_jump(&b, nir_jump_halt);
}
nir_pop_if(&b, NULL);
progress = true;
break;
}
case nir_intrinsic_terminate_ray: {
b.cursor = nir_instr_remove(&intrin->instr);
build_terminate_ray(&b);
progress = true;
break;
}
default:
break;
}
}
}
if (progress) {
nir_metadata_preserve(impl, nir_metadata_none);
} else {
nir_metadata_preserve(impl, nir_metadata_all);
}
return progress;
}
void
brw_nir_lower_raygen(nir_shader *nir)
{
assert(nir->info.stage == MESA_SHADER_RAYGEN);
NIR_PASS_V(nir, brw_nir_lower_shader_returns);
lower_rt_io_and_scratch(nir);
}
void
brw_nir_lower_any_hit(nir_shader *nir, const struct intel_device_info *devinfo)
{
assert(nir->info.stage == MESA_SHADER_ANY_HIT);
NIR_PASS_V(nir, brw_nir_lower_shader_returns);
NIR_PASS_V(nir, lower_ray_walk_intrinsics, devinfo);
lower_rt_io_and_scratch(nir);
}
void
brw_nir_lower_closest_hit(nir_shader *nir)
{
assert(nir->info.stage == MESA_SHADER_CLOSEST_HIT);
NIR_PASS_V(nir, brw_nir_lower_shader_returns);
lower_rt_io_and_scratch(nir);
}
void
brw_nir_lower_miss(nir_shader *nir)
{
assert(nir->info.stage == MESA_SHADER_MISS);
NIR_PASS_V(nir, brw_nir_lower_shader_returns);
lower_rt_io_and_scratch(nir);
}
void
brw_nir_lower_callable(nir_shader *nir)
{
assert(nir->info.stage == MESA_SHADER_CALLABLE);
NIR_PASS_V(nir, brw_nir_lower_shader_returns);
lower_rt_io_and_scratch(nir);
}
void
brw_nir_lower_combined_intersection_any_hit(nir_shader *intersection,
const nir_shader *any_hit,
const struct intel_device_info *devinfo)
{
assert(intersection->info.stage == MESA_SHADER_INTERSECTION);
assert(any_hit == NULL || any_hit->info.stage == MESA_SHADER_ANY_HIT);
NIR_PASS_V(intersection, brw_nir_lower_shader_returns);
NIR_PASS_V(intersection, brw_nir_lower_intersection_shader,
any_hit, devinfo);
NIR_PASS_V(intersection, lower_ray_walk_intrinsics, devinfo);
lower_rt_io_and_scratch(intersection);
}
static nir_def *
build_load_uniform(nir_builder *b, unsigned offset,
unsigned num_components, unsigned bit_size)
{
return nir_load_uniform(b, num_components, bit_size, nir_imm_int(b, 0),
.base = offset,
.range = num_components * bit_size / 8);
}
#define load_trampoline_param(b, name, num_components, bit_size) \
build_load_uniform((b), offsetof(struct brw_rt_raygen_trampoline_params, name), \
(num_components), (bit_size))
nir_shader *
brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
void *mem_ctx)
{
const struct intel_device_info *devinfo = compiler->devinfo;
const nir_shader_compiler_options *nir_options =
compiler->nir_options[MESA_SHADER_COMPUTE];
STATIC_ASSERT(sizeof(struct brw_rt_raygen_trampoline_params) == 32);
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
nir_options,
"RT Ray-Gen Trampoline");
ralloc_steal(mem_ctx, b.shader);
b.shader->info.workgroup_size_variable = true;
/* The RT global data and raygen BINDLESS_SHADER_RECORD addresses are
* passed in as push constants in the first register. We deal with the
* raygen BSR address here; the global data we'll deal with later.
*/
b.shader->num_uniforms = 32;
nir_def *raygen_param_bsr_addr =
load_trampoline_param(&b, raygen_bsr_addr, 1, 64);
nir_def *is_indirect =
nir_i2b(&b, load_trampoline_param(&b, is_indirect, 1, 8));
nir_def *local_shift =
nir_u2u32(&b, load_trampoline_param(&b, local_group_size_log2, 3, 8));
nir_def *raygen_indirect_bsr_addr;
nir_push_if(&b, is_indirect);
{
raygen_indirect_bsr_addr =
nir_load_global_constant(&b, raygen_param_bsr_addr,
8 /* align */,
1 /* components */,
64 /* bit_size */);
}
nir_pop_if(&b, NULL);
nir_def *raygen_bsr_addr =
nir_if_phi(&b, raygen_indirect_bsr_addr, raygen_param_bsr_addr);
nir_def *global_id = nir_load_workgroup_id_zero_base(&b);
nir_def *simd_channel = nir_load_subgroup_invocation(&b);
nir_def *local_x =
nir_ubfe(&b, simd_channel, nir_imm_int(&b, 0),
nir_channel(&b, local_shift, 0));
nir_def *local_y =
nir_ubfe(&b, simd_channel, nir_channel(&b, local_shift, 0),
nir_channel(&b, local_shift, 1));
nir_def *local_z =
nir_ubfe(&b, simd_channel,
nir_iadd(&b, nir_channel(&b, local_shift, 0),
nir_channel(&b, local_shift, 1)),
nir_channel(&b, local_shift, 2));
nir_def *launch_id =
nir_iadd(&b, nir_ishl(&b, global_id, local_shift),
nir_vec3(&b, local_x, local_y, local_z));
nir_def *launch_size = nir_load_ray_launch_size(&b);
nir_push_if(&b, nir_ball(&b, nir_ult(&b, launch_id, launch_size)));
{
nir_store_global(&b, brw_nir_rt_sw_hotzone_addr(&b, devinfo), 16,
nir_vec4(&b, nir_imm_int(&b, 0), /* Stack ptr */
nir_channel(&b, launch_id, 0),
nir_channel(&b, launch_id, 1),
nir_channel(&b, launch_id, 2)),
0xf /* write mask */);
brw_nir_btd_spawn(&b, raygen_bsr_addr);
}
nir_push_else(&b, NULL);
{
/* Even though these invocations aren't being used for anything, the
* hardware allocated stack IDs for them, so we need to retire them.
*/
brw_nir_btd_retire(&b);
}
nir_pop_if(&b, NULL);
nir_shader *nir = b.shader;
nir->info.name = ralloc_strdup(nir, "RT: TraceRay trampoline");
nir_validate_shader(nir, "in brw_nir_create_raygen_trampoline");
struct brw_nir_compiler_opts opts = {};
brw_preprocess_nir(compiler, nir, &opts);
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo);
b = nir_builder_create(nir_shader_get_entrypoint(b.shader));
/* brw_nir_lower_rt_intrinsics will leave us with a btd_global_arg_addr
* intrinsic which doesn't exist in compute shaders. We also created one
* above when we generated the BTD spawn intrinsic. Now we go through and
* replace them with a uniform load.
*/
nir_foreach_block(block, b.impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_btd_global_arg_addr_intel)
continue;
b.cursor = nir_before_instr(&intrin->instr);
nir_def *global_arg_addr =
load_trampoline_param(&b, rt_disp_globals_addr, 1, 64);
nir_def_rewrite_uses(&intrin->def,
global_arg_addr);
nir_instr_remove(instr);
}
}
NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, NULL);
const bool is_scalar = true;
brw_nir_optimize(nir, is_scalar, devinfo);
return nir;
}
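/* Example of the launch-ID decomposition above (illustrative values): with
 * local_group_size_log2 = (3, 1, 0), each workgroup covers an 8x2x1 block,
 * i.e. one SIMD16 thread. For simd_channel = 13:
 *
 *    local_x = bfe(13, 0, 3) = 5
 *    local_y = bfe(13, 3, 1) = 1
 *    local_z = bfe(13, 4, 0) = 0
 *
 * so launch_id = (global_id << (3, 1, 0)) + (5, 1, 0).
 */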

View file

@ -0,0 +1,76 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_NIR_RT_H
#define BRW_NIR_RT_H
#include "brw_nir.h"
#include "brw_rt.h"
#ifdef __cplusplus
extern "C" {
#endif
void brw_nir_lower_raygen(nir_shader *nir);
void brw_nir_lower_any_hit(nir_shader *nir,
const struct intel_device_info *devinfo);
void brw_nir_lower_closest_hit(nir_shader *nir);
void brw_nir_lower_miss(nir_shader *nir);
void brw_nir_lower_callable(nir_shader *nir);
void brw_nir_lower_combined_intersection_any_hit(nir_shader *intersection,
const nir_shader *any_hit,
const struct intel_device_info *devinfo);
/* We reserve the first 16B of the stack for callee data pointers */
#define BRW_BTD_STACK_RESUME_BSR_ADDR_OFFSET 0
#define BRW_BTD_STACK_CALL_DATA_PTR_OFFSET 8
#define BRW_BTD_STACK_CALLEE_DATA_SIZE 16
/* We require the stack to be 8B aligned at the start of a shader */
#define BRW_BTD_STACK_ALIGN 8
bool brw_nir_lower_ray_queries(nir_shader *shader,
const struct intel_device_info *devinfo);
void brw_nir_lower_shader_returns(nir_shader *shader);
bool brw_nir_lower_shader_calls(nir_shader *shader, struct brw_bs_prog_key *key);
void brw_nir_lower_rt_intrinsics(nir_shader *shader,
const struct intel_device_info *devinfo);
void brw_nir_lower_intersection_shader(nir_shader *intersection,
const nir_shader *any_hit,
const struct intel_device_info *devinfo);
nir_shader *
brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
void *mem_ctx);
nir_shader *
brw_nir_create_trivial_return_shader(const struct brw_compiler *compiler,
void *mem_ctx);
#ifdef __cplusplus
}
#endif
#endif /* BRW_NIR_RT_H */

View file

@ -0,0 +1,990 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_NIR_RT_BUILDER_H
#define BRW_NIR_RT_BUILDER_H
/* This file provides helpers to access memory based data structures that the
* RT hardware reads/writes and their locations.
*
* See also "Memory Based Data Structures for Ray Tracing" (BSpec 47547) and
* "Ray Tracing Address Computation for Memory Resident Structures" (BSpec
* 47550).
*/
#include "brw_rt.h"
#include "nir_builder.h"
#define is_access_for_builder(b) \
((b)->shader->info.stage == MESA_SHADER_FRAGMENT ? \
ACCESS_INCLUDE_HELPERS : 0)
static inline nir_def *
brw_nir_rt_load(nir_builder *b, nir_def *addr, unsigned align,
unsigned components, unsigned bit_size)
{
return nir_build_load_global(b, components, bit_size, addr,
.align_mul = align,
.access = is_access_for_builder(b));
}
static inline void
brw_nir_rt_store(nir_builder *b, nir_def *addr, unsigned align,
nir_def *value, unsigned write_mask)
{
nir_build_store_global(b, value, addr,
.align_mul = align,
.write_mask = (write_mask) &
BITFIELD_MASK(value->num_components),
.access = is_access_for_builder(b));
}
static inline nir_def *
brw_nir_rt_load_const(nir_builder *b, unsigned components,
nir_def *addr, nir_def *pred)
{
return nir_load_global_const_block_intel(b, components, addr, pred);
}
static inline nir_def *
brw_load_btd_dss_id(nir_builder *b)
{
return nir_load_topology_id_intel(b, .base = BRW_TOPOLOGY_ID_DSS);
}
static inline nir_def *
brw_nir_rt_load_num_simd_lanes_per_dss(nir_builder *b,
const struct intel_device_info *devinfo)
{
return nir_imm_int(b, devinfo->num_thread_per_eu *
devinfo->max_eus_per_subslice *
16 /* The RT computation is based off SIMD16 */);
}
static inline nir_def *
brw_load_eu_thread_simd(nir_builder *b)
{
return nir_load_topology_id_intel(b, .base = BRW_TOPOLOGY_ID_EU_THREAD_SIMD);
}
static inline nir_def *
brw_nir_rt_async_stack_id(nir_builder *b)
{
return nir_iadd(b, nir_umul_32x16(b, nir_load_ray_num_dss_rt_stacks_intel(b),
brw_load_btd_dss_id(b)),
nir_load_btd_stack_id_intel(b));
}
static inline nir_def *
brw_nir_rt_sync_stack_id(nir_builder *b)
{
return brw_load_eu_thread_simd(b);
}
/* We have our own load/store scratch helpers because they emit a global
* memory read or write based on the scratch_base_ptr system value rather
* than a load/store_scratch intrinsic.
*/
static inline nir_def *
brw_nir_rt_load_scratch(nir_builder *b, uint32_t offset, unsigned align,
unsigned num_components, unsigned bit_size)
{
nir_def *addr =
nir_iadd_imm(b, nir_load_scratch_base_ptr(b, 1, 64, 1), offset);
return brw_nir_rt_load(b, addr, MIN2(align, BRW_BTD_STACK_ALIGN),
num_components, bit_size);
}
static inline void
brw_nir_rt_store_scratch(nir_builder *b, uint32_t offset, unsigned align,
nir_def *value, nir_component_mask_t write_mask)
{
nir_def *addr =
nir_iadd_imm(b, nir_load_scratch_base_ptr(b, 1, 64, 1), offset);
brw_nir_rt_store(b, addr, MIN2(align, BRW_BTD_STACK_ALIGN),
value, write_mask);
}
static inline void
brw_nir_btd_spawn(nir_builder *b, nir_def *record_addr)
{
nir_btd_spawn_intel(b, nir_load_btd_global_arg_addr_intel(b), record_addr);
}
static inline void
brw_nir_btd_retire(nir_builder *b)
{
nir_btd_retire_intel(b);
}
/** This is a pseudo-op which does a bindless return
*
* It loads the return address from the stack and calls btd_spawn to spawn the
* resume shader.
*/
static inline void
brw_nir_btd_return(struct nir_builder *b)
{
nir_def *resume_addr =
brw_nir_rt_load_scratch(b, BRW_BTD_STACK_RESUME_BSR_ADDR_OFFSET,
8 /* align */, 1, 64);
brw_nir_btd_spawn(b, resume_addr);
}
static inline void
assert_def_size(nir_def *def, unsigned num_components, unsigned bit_size)
{
assert(def->num_components == num_components);
assert(def->bit_size == bit_size);
}
static inline nir_def *
brw_nir_num_rt_stacks(nir_builder *b,
const struct intel_device_info *devinfo)
{
return nir_imul_imm(b, nir_load_ray_num_dss_rt_stacks_intel(b),
intel_device_info_dual_subslice_id_bound(devinfo));
}
static inline nir_def *
brw_nir_rt_sw_hotzone_addr(nir_builder *b,
const struct intel_device_info *devinfo)
{
nir_def *offset32 =
nir_imul_imm(b, brw_nir_rt_async_stack_id(b),
BRW_RT_SIZEOF_HOTZONE);
offset32 = nir_iadd(b, offset32, nir_ineg(b,
nir_imul_imm(b, brw_nir_num_rt_stacks(b, devinfo),
BRW_RT_SIZEOF_HOTZONE)));
return nir_iadd(b, nir_load_ray_base_mem_addr_intel(b),
nir_i2i64(b, offset32));
}
static inline nir_def *
brw_nir_rt_sync_stack_addr(nir_builder *b,
nir_def *base_mem_addr,
const struct intel_device_info *devinfo)
{
/* For Ray queries (Synchronous Ray Tracing), the formula is similar but
* goes down from rtMemBasePtr:
*
* syncBase = RTDispatchGlobals.rtMemBasePtr
* - (DSSID * NUM_SIMD_LANES_PER_DSS + SyncStackID + 1)
* * syncStackSize
*
* We assume that we can calculate a 32-bit offset first and then add it
* to the 64-bit base address at the end.
*/
nir_def *offset32 =
nir_imul(b,
nir_iadd(b,
nir_imul(b, brw_load_btd_dss_id(b),
brw_nir_rt_load_num_simd_lanes_per_dss(b, devinfo)),
nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
}
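/* Illustrative numbers for the formula above: with DSSID = 2,
 * NUM_SIMD_LANES_PER_DSS = 1024 and SyncStackID = 7, the ray-query stack
 * starts (2 * 1024 + 7 + 1) * BRW_RT_SIZEOF_RAY_QUERY bytes below
 * rtMemBasePtr.
 */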
static inline nir_def *
brw_nir_rt_stack_addr(nir_builder *b)
{
/* From the BSpec "Address Computation for Memory Based Data Structures:
* Ray and TraversalStack (Async Ray Tracing)":
*
* stackBase = RTDispatchGlobals.rtMemBasePtr
* + (DSSID * RTDispatchGlobals.numDSSRTStacks + stackID)
* * RTDispatchGlobals.stackSizePerRay // 64B aligned
*
* We assume that we can calculate a 32-bit offset first and then add it
* to the 64-bit base address at the end.
*/
nir_def *offset32 =
nir_imul(b, brw_nir_rt_async_stack_id(b),
nir_load_ray_hw_stack_size_intel(b));
return nir_iadd(b, nir_load_ray_base_mem_addr_intel(b),
nir_u2u64(b, offset32));
}
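/* Illustrative numbers for the formula above: with DSSID = 1,
 * numDSSRTStacks = 2048, stackID = 5 and a 256B per-ray stack size, the
 * stack starts at rtMemBasePtr + (1 * 2048 + 5) * 256.
 */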
static inline nir_def *
brw_nir_rt_mem_hit_addr_from_addr(nir_builder *b,
nir_def *stack_addr,
bool committed)
{
return nir_iadd_imm(b, stack_addr, committed ? 0 : BRW_RT_SIZEOF_HIT_INFO);
}
static inline nir_def *
brw_nir_rt_mem_hit_addr(nir_builder *b, bool committed)
{
return nir_iadd_imm(b, brw_nir_rt_stack_addr(b),
committed ? 0 : BRW_RT_SIZEOF_HIT_INFO);
}
static inline nir_def *
brw_nir_rt_hit_attrib_data_addr(nir_builder *b)
{
return nir_iadd_imm(b, brw_nir_rt_stack_addr(b),
BRW_RT_OFFSETOF_HIT_ATTRIB_DATA);
}
static inline nir_def *
brw_nir_rt_mem_ray_addr(nir_builder *b,
nir_def *stack_addr,
enum brw_rt_bvh_level bvh_level)
{
/* From the BSpec "Address Computation for Memory Based Data Structures:
* Ray and TraversalStack (Async Ray Tracing)":
*
* rayBase = stackBase + sizeof(HitInfo) * 2 // 64B aligned
* rayPtr = rayBase + bvhLevel * sizeof(Ray); // 64B aligned
*
* In Vulkan, we always have exactly two levels of BVH: World and Object.
*/
uint32_t offset = BRW_RT_SIZEOF_HIT_INFO * 2 +
bvh_level * BRW_RT_SIZEOF_RAY;
return nir_iadd_imm(b, stack_addr, offset);
}
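/* For example, the Object-level ray (bvh_level = 1) sits at
 *
 *    stack_addr + 2 * BRW_RT_SIZEOF_HIT_INFO + BRW_RT_SIZEOF_RAY
 *
 * while the World-level ray immediately follows the two HitInfo slots.
 */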
static inline nir_def *
brw_nir_rt_sw_stack_addr(nir_builder *b,
const struct intel_device_info *devinfo)
{
nir_def *addr = nir_load_ray_base_mem_addr_intel(b);
nir_def *offset32 = nir_imul(b, brw_nir_num_rt_stacks(b, devinfo),
nir_load_ray_hw_stack_size_intel(b));
addr = nir_iadd(b, addr, nir_u2u64(b, offset32));
nir_def *offset_in_stack =
nir_imul(b, nir_u2u64(b, brw_nir_rt_async_stack_id(b)),
nir_u2u64(b, nir_load_ray_sw_stack_size_intel(b)));
return nir_iadd(b, addr, offset_in_stack);
}
static inline nir_def *
nir_unpack_64_4x16_split_z(nir_builder *b, nir_def *val)
{
return nir_unpack_32_2x16_split_x(b, nir_unpack_64_2x32_split_y(b, val));
}
struct brw_nir_rt_globals_defs {
nir_def *base_mem_addr;
nir_def *call_stack_handler_addr;
nir_def *hw_stack_size;
nir_def *num_dss_rt_stacks;
nir_def *hit_sbt_addr;
nir_def *hit_sbt_stride;
nir_def *miss_sbt_addr;
nir_def *miss_sbt_stride;
nir_def *sw_stack_size;
nir_def *launch_size;
nir_def *call_sbt_addr;
nir_def *call_sbt_stride;
nir_def *resume_sbt_addr;
};
static inline void
brw_nir_rt_load_globals_addr(nir_builder *b,
struct brw_nir_rt_globals_defs *defs,
nir_def *addr)
{
nir_def *data;
data = brw_nir_rt_load_const(b, 16, addr, nir_imm_true(b));
defs->base_mem_addr = nir_pack_64_2x32(b, nir_trim_vector(b, data, 2));
defs->call_stack_handler_addr =
nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
defs->hw_stack_size = nir_channel(b, data, 4);
defs->num_dss_rt_stacks = nir_iand_imm(b, nir_channel(b, data, 5), 0xffff);
defs->hit_sbt_addr =
nir_pack_64_2x32_split(b, nir_channel(b, data, 8),
nir_extract_i16(b, nir_channel(b, data, 9),
nir_imm_int(b, 0)));
defs->hit_sbt_stride =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 9));
defs->miss_sbt_addr =
nir_pack_64_2x32_split(b, nir_channel(b, data, 10),
nir_extract_i16(b, nir_channel(b, data, 11),
nir_imm_int(b, 0)));
defs->miss_sbt_stride =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 11));
defs->sw_stack_size = nir_channel(b, data, 12);
defs->launch_size = nir_channels(b, data, 0x7u << 13);
data = brw_nir_rt_load_const(b, 8, nir_iadd_imm(b, addr, 64), nir_imm_true(b));
defs->call_sbt_addr =
nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
nir_extract_i16(b, nir_channel(b, data, 1),
nir_imm_int(b, 0)));
defs->call_sbt_stride =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 1));
defs->resume_sbt_addr =
nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
}
static inline void
brw_nir_rt_load_globals(nir_builder *b,
struct brw_nir_rt_globals_defs *defs)
{
brw_nir_rt_load_globals_addr(b, defs, nir_load_btd_global_arg_addr_intel(b));
}
static inline nir_def *
brw_nir_rt_unpack_leaf_ptr(nir_builder *b, nir_def *vec2)
{
/* Hit record leaf pointers are 42-bit and assumed to be in 64B chunks.
* This leaves 22 bits at the top for other stuff.
*/
nir_def *ptr64 = nir_imul_imm(b, nir_pack_64_2x32(b, vec2), 64);
/* The top 16 bits (remember, we shifted by 6 already) contain garbage
* that we need to get rid of.
*/
nir_def *ptr_lo = nir_unpack_64_2x32_split_x(b, ptr64);
nir_def *ptr_hi = nir_unpack_64_2x32_split_y(b, ptr64);
ptr_hi = nir_extract_i16(b, ptr_hi, nir_imm_int(b, 0));
return nir_pack_64_2x32_split(b, ptr_lo, ptr_hi);
}
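/* Step by step: the multiply by 64 is a shift left by 6, turning the 42-bit
 * chunk index into a 48-bit byte address; extract_i16 then sign-extends the
 * high dword from its low 16 bits, discarding the 16 garbage bits and
 * yielding a canonical 64-bit pointer.
 */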
/**
* MemHit memory layout (BSpec 47547) :
*
* name bits description
* - t 32 hit distance of current hit (or initial traversal distance)
* - u 32 barycentric hit coordinates
* - v 32 barycentric hit coordinates
* - primIndexDelta 16 prim index delta for compressed meshlets and quads
* - valid 1 set if there is a hit
* - leafType 3 type of node primLeafPtr is pointing to
* - primLeafIndex 4 index of the hit primitive inside the leaf
* - bvhLevel 3 the instancing level at which the hit occurred
* - frontFace 1 whether we hit the front-facing side of a triangle (also used to pass opaque flag when calling intersection shaders)
* - pad0 4 unused bits
* - primLeafPtr 42 pointer to BVH leaf node (multiple of 64 bytes)
* - hitGroupRecPtr0 22 LSB of hit group record of the hit triangle (multiple of 16 bytes)
* - instLeafPtr 42 pointer to BVH instance leaf node (in multiple of 64 bytes)
* - hitGroupRecPtr1 22 MSB of hit group record of the hit triangle (multiple of 32 bytes)
*/
struct brw_nir_rt_mem_hit_defs {
nir_def *t;
nir_def *tri_bary; /**< Only valid for triangle geometry */
nir_def *aabb_hit_kind; /**< Only valid for AABB geometry */
nir_def *valid;
nir_def *leaf_type;
nir_def *prim_index_delta;
nir_def *prim_leaf_index;
nir_def *bvh_level;
nir_def *front_face;
nir_def *done; /**< Only for ray queries */
nir_def *prim_leaf_ptr;
nir_def *inst_leaf_ptr;
};
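/* A purely illustrative C view of DW3 of MemHit as decoded by the loader
 * below (field order per the layout comment above; "done" is a ray-query
 * flag carved out of pad0):
 *
 *    struct mem_hit_dw3 {
 *       uint32_t prim_index_delta : 16;
 *       uint32_t valid            : 1;
 *       uint32_t leaf_type        : 3;
 *       uint32_t prim_leaf_index  : 4;
 *       uint32_t bvh_level        : 3;
 *       uint32_t front_face       : 1;
 *       uint32_t done             : 1;
 *       uint32_t pad              : 3;
 *    };
 */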
static inline void
brw_nir_rt_load_mem_hit_from_addr(nir_builder *b,
struct brw_nir_rt_mem_hit_defs *defs,
nir_def *stack_addr,
bool committed)
{
nir_def *hit_addr =
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, committed);
nir_def *data = brw_nir_rt_load(b, hit_addr, 16, 4, 32);
defs->t = nir_channel(b, data, 0);
defs->aabb_hit_kind = nir_channel(b, data, 1);
defs->tri_bary = nir_channels(b, data, 0x6);
nir_def *bitfield = nir_channel(b, data, 3);
defs->prim_index_delta =
nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 0), nir_imm_int(b, 16));
defs->valid = nir_i2b(b, nir_iand_imm(b, bitfield, 1u << 16));
defs->leaf_type =
nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 17), nir_imm_int(b, 3));
defs->prim_leaf_index =
nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 20), nir_imm_int(b, 4));
defs->bvh_level =
nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 24), nir_imm_int(b, 3));
defs->front_face = nir_i2b(b, nir_iand_imm(b, bitfield, 1 << 27));
defs->done = nir_i2b(b, nir_iand_imm(b, bitfield, 1 << 28));
data = brw_nir_rt_load(b, nir_iadd_imm(b, hit_addr, 16), 16, 4, 32);
defs->prim_leaf_ptr =
brw_nir_rt_unpack_leaf_ptr(b, nir_channels(b, data, 0x3 << 0));
defs->inst_leaf_ptr =
brw_nir_rt_unpack_leaf_ptr(b, nir_channels(b, data, 0x3 << 2));
}
static inline void
brw_nir_rt_load_mem_hit(nir_builder *b,
struct brw_nir_rt_mem_hit_defs *defs,
bool committed)
{
brw_nir_rt_load_mem_hit_from_addr(b, defs, brw_nir_rt_stack_addr(b),
committed);
}
static inline void
brw_nir_memcpy_global(nir_builder *b,
nir_def *dst_addr, uint32_t dst_align,
nir_def *src_addr, uint32_t src_align,
uint32_t size)
{
/* We're going to copy in 16B chunks */
assert(size % 16 == 0);
dst_align = MIN2(dst_align, 16);
src_align = MIN2(src_align, 16);
for (unsigned offset = 0; offset < size; offset += 16) {
nir_def *data =
brw_nir_rt_load(b, nir_iadd_imm(b, src_addr, offset), 16,
4, 32);
brw_nir_rt_store(b, nir_iadd_imm(b, dst_addr, offset), 16,
data, 0xf /* write_mask */);
}
}
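/* For example, a 32-byte copy such as
 *
 *    brw_nir_memcpy_global(b, dst, 16, src, 16, 32);
 *
 * unrolls into two vec4 loads at src+0 and src+16 with matching vec4
 * stores at dst+0 and dst+16.
 */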
static inline void
brw_nir_memclear_global(nir_builder *b,
nir_def *dst_addr, uint32_t dst_align,
uint32_t size)
{
/* We're going to copy in 16B chunks */
assert(size % 16 == 0);
dst_align = MIN2(dst_align, 16);
nir_def *zero = nir_imm_ivec4(b, 0, 0, 0, 0);
for (unsigned offset = 0; offset < size; offset += 16) {
brw_nir_rt_store(b, nir_iadd_imm(b, dst_addr, offset), dst_align,
zero, 0xf /* write_mask */);
}
}
static inline nir_def *
brw_nir_rt_query_done(nir_builder *b, nir_def *stack_addr)
{
struct brw_nir_rt_mem_hit_defs hit_in = {};
brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr,
false /* committed */);
return hit_in.done;
}
static inline void
brw_nir_rt_set_dword_bit_at(nir_builder *b,
nir_def *addr,
uint32_t addr_offset,
uint32_t bit)
{
nir_def *dword_addr = nir_iadd_imm(b, addr, addr_offset);
nir_def *dword = brw_nir_rt_load(b, dword_addr, 4, 1, 32);
brw_nir_rt_store(b, dword_addr, 4, nir_ior_imm(b, dword, 1u << bit), 0x1);
}
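/* brw_nir_rt_query_mark_done() below uses this read-modify-write helper to
 * set bit 28 (the "done" flag) in dword 3 of the potential MemHit.
 */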
static inline void
brw_nir_rt_query_mark_done(nir_builder *b, nir_def *stack_addr)
{
brw_nir_rt_set_dword_bit_at(b,
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr,
false /* committed */),
4 * 3 /* dword offset */, 28 /* bit */);
}
/* This helper clears dword 3 of both MemHit structures (committed and
 * potential), which is where the valid bit lives.
 */
static inline void
brw_nir_rt_query_mark_init(nir_builder *b, nir_def *stack_addr)
{
nir_def *dword_addr;
for (uint32_t i = 0; i < 2; i++) {
dword_addr =
nir_iadd_imm(b,
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr,
i == 0 /* committed */),
4 * 3 /* dword offset */);
brw_nir_rt_store(b, dword_addr, 4, nir_imm_int(b, 0), 0x1);
}
}
/* This helper is essentially a memcpy of the uncommitted (potential) hit
 * structure into the committed one, additionally setting the valid bit.
 */
static inline void
brw_nir_rt_commit_hit_addr(nir_builder *b, nir_def *stack_addr)
{
nir_def *dst_addr =
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, true /* committed */);
nir_def *src_addr =
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, false /* committed */);
for (unsigned offset = 0; offset < BRW_RT_SIZEOF_HIT_INFO; offset += 16) {
nir_def *data =
brw_nir_rt_load(b, nir_iadd_imm(b, src_addr, offset), 16, 4, 32);
if (offset == 0) {
data = nir_vec4(b,
nir_channel(b, data, 0),
nir_channel(b, data, 1),
nir_channel(b, data, 2),
nir_ior_imm(b,
nir_channel(b, data, 3),
0x1 << 16 /* valid */));
/* Also write the potential hit as we change it. */
brw_nir_rt_store(b, nir_iadd_imm(b, src_addr, offset), 16,
data, 0xf /* write_mask */);
}
brw_nir_rt_store(b, nir_iadd_imm(b, dst_addr, offset), 16,
data, 0xf /* write_mask */);
}
}
static inline void
brw_nir_rt_commit_hit(nir_builder *b)
{
nir_def *stack_addr = brw_nir_rt_stack_addr(b);
brw_nir_rt_commit_hit_addr(b, stack_addr);
}
static inline void
brw_nir_rt_generate_hit_addr(nir_builder *b, nir_def *stack_addr, nir_def *t_val)
{
nir_def *committed_addr =
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, true /* committed */);
nir_def *potential_addr =
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, false /* committed */);
/* Set:
*
* potential.t = t_val;
* potential.valid = true;
*/
nir_def *potential_hit_dwords_0_3 =
brw_nir_rt_load(b, potential_addr, 16, 4, 32);
potential_hit_dwords_0_3 =
nir_vec4(b,
t_val,
nir_channel(b, potential_hit_dwords_0_3, 1),
nir_channel(b, potential_hit_dwords_0_3, 2),
nir_ior_imm(b, nir_channel(b, potential_hit_dwords_0_3, 3),
(0x1 << 16) /* valid */));
brw_nir_rt_store(b, potential_addr, 16, potential_hit_dwords_0_3, 0xf /* write_mask */);
/* Set:
*
* committed.t = t_val;
* committed.u = 0.0f;
* committed.v = 0.0f;
* committed.valid = true;
* committed.leaf_type = potential.leaf_type;
* committed.bvh_level = BRW_RT_BVH_LEVEL_OBJECT;
* committed.front_face = false;
* committed.prim_leaf_index = 0;
* committed.done = false;
*/
nir_def *committed_hit_dwords_0_3 =
brw_nir_rt_load(b, committed_addr, 16, 4, 32);
committed_hit_dwords_0_3 =
nir_vec4(b,
t_val,
nir_imm_float(b, 0.0f),
nir_imm_float(b, 0.0f),
               nir_ior_imm(b,
                           nir_iand_imm(b, nir_channel(b, potential_hit_dwords_0_3, 3),
                                        0x000e0000 /* keep only leaf_type */),
                           (0x1 << 16) /* valid */ |
                           (BRW_RT_BVH_LEVEL_OBJECT << 24) /* bvh_level */));
brw_nir_rt_store(b, committed_addr, 16, committed_hit_dwords_0_3, 0xf /* write_mask */);
/* Set:
*
* committed.prim_leaf_ptr = potential.prim_leaf_ptr;
* committed.inst_leaf_ptr = potential.inst_leaf_ptr;
*/
brw_nir_memcpy_global(b,
nir_iadd_imm(b, committed_addr, 16), 16,
nir_iadd_imm(b, potential_addr, 16), 16,
16);
}
struct brw_nir_rt_mem_ray_defs {
nir_def *orig;
nir_def *dir;
nir_def *t_near;
nir_def *t_far;
nir_def *root_node_ptr;
nir_def *ray_flags;
nir_def *hit_group_sr_base_ptr;
nir_def *hit_group_sr_stride;
nir_def *miss_sr_ptr;
nir_def *shader_index_multiplier;
nir_def *inst_leaf_ptr;
nir_def *ray_mask;
};
static inline void
brw_nir_rt_store_mem_ray_query_at_addr(nir_builder *b,
nir_def *ray_addr,
const struct brw_nir_rt_mem_ray_defs *defs)
{
assert_def_size(defs->orig, 3, 32);
assert_def_size(defs->dir, 3, 32);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 0), 16,
nir_vec4(b, nir_channel(b, defs->orig, 0),
nir_channel(b, defs->orig, 1),
nir_channel(b, defs->orig, 2),
nir_channel(b, defs->dir, 0)),
~0 /* write mask */);
assert_def_size(defs->t_near, 1, 32);
assert_def_size(defs->t_far, 1, 32);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 16), 16,
nir_vec4(b, nir_channel(b, defs->dir, 1),
nir_channel(b, defs->dir, 2),
defs->t_near,
defs->t_far),
~0 /* write mask */);
assert_def_size(defs->root_node_ptr, 1, 64);
assert_def_size(defs->ray_flags, 1, 16);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 32), 16,
nir_vec2(b, nir_unpack_64_2x32_split_x(b, defs->root_node_ptr),
nir_pack_32_2x16_split(b,
nir_unpack_64_4x16_split_z(b, defs->root_node_ptr),
defs->ray_flags)),
0x3 /* write mask */);
/* leaf_ptr is optional */
nir_def *inst_leaf_ptr;
if (defs->inst_leaf_ptr) {
inst_leaf_ptr = defs->inst_leaf_ptr;
} else {
inst_leaf_ptr = nir_imm_int64(b, 0);
}
assert_def_size(inst_leaf_ptr, 1, 64);
assert_def_size(defs->ray_mask, 1, 32);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 56), 8,
nir_vec2(b, nir_unpack_64_2x32_split_x(b, inst_leaf_ptr),
nir_pack_32_2x16_split(b,
nir_unpack_64_4x16_split_z(b, inst_leaf_ptr),
nir_unpack_32_2x16_split_x(b, defs->ray_mask))),
~0 /* write mask */);
}
static inline void
brw_nir_rt_store_mem_ray(nir_builder *b,
const struct brw_nir_rt_mem_ray_defs *defs,
enum brw_rt_bvh_level bvh_level)
{
nir_def *ray_addr =
brw_nir_rt_mem_ray_addr(b, brw_nir_rt_stack_addr(b), bvh_level);
assert_def_size(defs->orig, 3, 32);
assert_def_size(defs->dir, 3, 32);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 0), 16,
nir_vec4(b, nir_channel(b, defs->orig, 0),
nir_channel(b, defs->orig, 1),
nir_channel(b, defs->orig, 2),
nir_channel(b, defs->dir, 0)),
~0 /* write mask */);
assert_def_size(defs->t_near, 1, 32);
assert_def_size(defs->t_far, 1, 32);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 16), 16,
nir_vec4(b, nir_channel(b, defs->dir, 1),
nir_channel(b, defs->dir, 2),
defs->t_near,
defs->t_far),
~0 /* write mask */);
assert_def_size(defs->root_node_ptr, 1, 64);
assert_def_size(defs->ray_flags, 1, 16);
assert_def_size(defs->hit_group_sr_base_ptr, 1, 64);
assert_def_size(defs->hit_group_sr_stride, 1, 16);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 32), 16,
nir_vec4(b, nir_unpack_64_2x32_split_x(b, defs->root_node_ptr),
nir_pack_32_2x16_split(b,
nir_unpack_64_4x16_split_z(b, defs->root_node_ptr),
defs->ray_flags),
nir_unpack_64_2x32_split_x(b, defs->hit_group_sr_base_ptr),
nir_pack_32_2x16_split(b,
nir_unpack_64_4x16_split_z(b, defs->hit_group_sr_base_ptr),
defs->hit_group_sr_stride)),
~0 /* write mask */);
/* leaf_ptr is optional */
nir_def *inst_leaf_ptr;
if (defs->inst_leaf_ptr) {
inst_leaf_ptr = defs->inst_leaf_ptr;
} else {
inst_leaf_ptr = nir_imm_int64(b, 0);
}
assert_def_size(defs->miss_sr_ptr, 1, 64);
assert_def_size(defs->shader_index_multiplier, 1, 32);
assert_def_size(inst_leaf_ptr, 1, 64);
assert_def_size(defs->ray_mask, 1, 32);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 48), 16,
nir_vec4(b, nir_unpack_64_2x32_split_x(b, defs->miss_sr_ptr),
nir_pack_32_2x16_split(b,
nir_unpack_64_4x16_split_z(b, defs->miss_sr_ptr),
nir_unpack_32_2x16_split_x(b,
nir_ishl(b, defs->shader_index_multiplier,
nir_imm_int(b, 8)))),
nir_unpack_64_2x32_split_x(b, inst_leaf_ptr),
nir_pack_32_2x16_split(b,
nir_unpack_64_4x16_split_z(b, inst_leaf_ptr),
nir_unpack_32_2x16_split_x(b, defs->ray_mask))),
~0 /* write mask */);
}
static inline void
brw_nir_rt_load_mem_ray_from_addr(nir_builder *b,
struct brw_nir_rt_mem_ray_defs *defs,
nir_def *ray_base_addr,
enum brw_rt_bvh_level bvh_level)
{
nir_def *ray_addr = brw_nir_rt_mem_ray_addr(b,
ray_base_addr,
bvh_level);
nir_def *data[4] = {
brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr, 0), 16, 4, 32),
brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr, 16), 16, 4, 32),
brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr, 32), 16, 4, 32),
brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr, 48), 16, 4, 32),
};
defs->orig = nir_trim_vector(b, data[0], 3);
defs->dir = nir_vec3(b, nir_channel(b, data[0], 3),
nir_channel(b, data[1], 0),
nir_channel(b, data[1], 1));
defs->t_near = nir_channel(b, data[1], 2);
defs->t_far = nir_channel(b, data[1], 3);
defs->root_node_ptr =
nir_pack_64_2x32_split(b, nir_channel(b, data[2], 0),
nir_extract_i16(b, nir_channel(b, data[2], 1),
nir_imm_int(b, 0)));
defs->ray_flags =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data[2], 1));
defs->hit_group_sr_base_ptr =
nir_pack_64_2x32_split(b, nir_channel(b, data[2], 2),
nir_extract_i16(b, nir_channel(b, data[2], 3),
nir_imm_int(b, 0)));
defs->hit_group_sr_stride =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data[2], 3));
defs->miss_sr_ptr =
nir_pack_64_2x32_split(b, nir_channel(b, data[3], 0),
nir_extract_i16(b, nir_channel(b, data[3], 1),
nir_imm_int(b, 0)));
defs->shader_index_multiplier =
nir_ushr(b, nir_unpack_32_2x16_split_y(b, nir_channel(b, data[3], 1)),
nir_imm_int(b, 8));
defs->inst_leaf_ptr =
nir_pack_64_2x32_split(b, nir_channel(b, data[3], 2),
nir_extract_i16(b, nir_channel(b, data[3], 3),
nir_imm_int(b, 0)));
defs->ray_mask =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data[3], 3));
}
static inline void
brw_nir_rt_load_mem_ray(nir_builder *b,
struct brw_nir_rt_mem_ray_defs *defs,
enum brw_rt_bvh_level bvh_level)
{
brw_nir_rt_load_mem_ray_from_addr(b, defs, brw_nir_rt_stack_addr(b),
bvh_level);
}
struct brw_nir_rt_bvh_instance_leaf_defs {
nir_def *shader_index;
nir_def *contribution_to_hit_group_index;
nir_def *world_to_object[4];
nir_def *instance_id;
nir_def *instance_index;
nir_def *object_to_world[4];
};
static inline void
brw_nir_rt_load_bvh_instance_leaf(nir_builder *b,
struct brw_nir_rt_bvh_instance_leaf_defs *defs,
nir_def *leaf_addr)
{
nir_def *leaf_desc = brw_nir_rt_load(b, leaf_addr, 4, 2, 32);
defs->shader_index =
nir_iand_imm(b, nir_channel(b, leaf_desc, 0), (1 << 24) - 1);
defs->contribution_to_hit_group_index =
nir_iand_imm(b, nir_channel(b, leaf_desc, 1), (1 << 24) - 1);
defs->world_to_object[0] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 16), 4, 3, 32);
defs->world_to_object[1] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 28), 4, 3, 32);
defs->world_to_object[2] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 40), 4, 3, 32);
/* The last column of each matrix is stored alongside the other matrix,
 * presumably because that layout is easier or faster for the hardware to
 * consume.
 */
defs->object_to_world[3] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 52), 4, 3, 32);
nir_def *data =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 64), 4, 4, 32);
defs->instance_id = nir_channel(b, data, 2);
defs->instance_index = nir_channel(b, data, 3);
defs->object_to_world[0] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 80), 4, 3, 32);
defs->object_to_world[1] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 92), 4, 3, 32);
defs->object_to_world[2] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 104), 4, 3, 32);
defs->world_to_object[3] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 116), 4, 3, 32);
}
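/* Instance leaf layout as decoded above (byte offsets):
 *     0: shader index / hit-group contribution dwords
 *    16: world_to_object columns 0-2 (3 floats each)
 *    52: object_to_world column 3
 *    64: instance ID (dword 2) and instance index (dword 3)
 *    80: object_to_world columns 0-2
 *   116: world_to_object column 3
 */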
struct brw_nir_rt_bvh_primitive_leaf_defs {
nir_def *shader_index;
nir_def *geom_mask;
nir_def *geom_index;
nir_def *type;
nir_def *geom_flags;
};
static inline void
brw_nir_rt_load_bvh_primitive_leaf(nir_builder *b,
struct brw_nir_rt_bvh_primitive_leaf_defs *defs,
nir_def *leaf_addr)
{
nir_def *desc = brw_nir_rt_load(b, leaf_addr, 4, 2, 32);
defs->shader_index =
nir_ubitfield_extract(b, nir_channel(b, desc, 0),
nir_imm_int(b, 23), nir_imm_int(b, 0));
defs->geom_mask =
nir_ubitfield_extract(b, nir_channel(b, desc, 0),
nir_imm_int(b, 31), nir_imm_int(b, 24));
defs->geom_index =
nir_ubitfield_extract(b, nir_channel(b, desc, 1),
nir_imm_int(b, 28), nir_imm_int(b, 0));
defs->type =
nir_ubitfield_extract(b, nir_channel(b, desc, 1),
nir_imm_int(b, 29), nir_imm_int(b, 29));
defs->geom_flags =
nir_ubitfield_extract(b, nir_channel(b, desc, 1),
nir_imm_int(b, 31), nir_imm_int(b, 30));
}
struct brw_nir_rt_bvh_primitive_leaf_positions_defs {
nir_def *positions[3];
};
static inline void
brw_nir_rt_load_bvh_primitive_leaf_positions(nir_builder *b,
struct brw_nir_rt_bvh_primitive_leaf_positions_defs *defs,
nir_def *leaf_addr)
{
for (unsigned i = 0; i < ARRAY_SIZE(defs->positions); i++) {
defs->positions[i] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 16 + i * 4 * 3), 4, 3, 32);
}
}
static inline nir_def *
brw_nir_rt_load_primitive_id_from_hit(nir_builder *b,
nir_def *is_procedural,
const struct brw_nir_rt_mem_hit_defs *defs)
{
if (!is_procedural) {
is_procedural =
nir_ieq_imm(b, defs->leaf_type,
BRW_RT_BVH_NODE_TYPE_PROCEDURAL);
}
nir_def *prim_id_proc, *prim_id_quad;
nir_push_if(b, is_procedural);
{
/* For procedural leaves, the index is in dw[3]. */
nir_def *offset =
nir_iadd_imm(b, nir_ishl_imm(b, defs->prim_leaf_index, 2), 12);
prim_id_proc = nir_load_global(b, nir_iadd(b, defs->prim_leaf_ptr,
nir_u2u64(b, offset)),
4, /* align */ 1, 32);
}
nir_push_else(b, NULL);
{
/* For quad leaves, the index is in dw[2], with an additional 16-bit
 * offset in dw[3].
 */
prim_id_quad = nir_load_global(b, nir_iadd_imm(b, defs->prim_leaf_ptr, 8),
4, /* align */ 1, 32);
prim_id_quad = nir_iadd(b,
prim_id_quad,
defs->prim_index_delta);
}
nir_pop_if(b, NULL);
return nir_if_phi(b, prim_id_proc, prim_id_quad);
}
static inline nir_def *
brw_nir_rt_acceleration_structure_to_root_node(nir_builder *b,
nir_def *as_addr)
{
/* The HW memory structure in which we specify what acceleration structure
 * to traverse takes the address of the root node in the acceleration
 * structure, not the acceleration structure itself. To find that, we have
 * to read the root node offset from the acceleration structure, which is
 * the first QWord.
 *
 * But if the acceleration structure pointer is NULL, then we should return
 * NULL as the root node pointer.
 *
 * TODO: we could optimize this by assuming that for a given version of the
 * BVH, we can find the root node at a given offset.
 */
nir_def *root_node_ptr, *null_node_ptr;
nir_push_if(b, nir_ieq_imm(b, as_addr, 0));
{
null_node_ptr = nir_imm_int64(b, 0);
}
nir_push_else(b, NULL);
{
root_node_ptr =
nir_iadd(b, as_addr, brw_nir_rt_load(b, as_addr, 256, 1, 64));
}
nir_pop_if(b, NULL);
return nir_if_phi(b, null_node_ptr, root_node_ptr);
}
#endif /* BRW_NIR_RT_BUILDER_H */

View file

@ -0,0 +1,67 @@
#
# Copyright (C) 2016 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
# Prior to Kaby Lake, the SIN and COS instructions on Intel hardware can
# produce values slightly outside of the [-1.0, 1.0] range for a small set of
# values. Obviously, this can break everyone's expectations about trig
# functions. This appears to be fixed in Kaby Lake.
#
# According to an internal presentation, the COS instruction can produce
# a value up to 1.000027 for inputs in the range (0.08296, 0.09888). One
# suggested workaround is to multiply by 0.99997, scaling down the
# amplitude slightly. Apparently this also minimizes the error function,
# reducing the maximum error from 0.00006 to about 0.00003.
import argparse
import sys
from math import pi
TRIG_WORKAROUNDS = [
(('fsin', 'x(is_not_const)'), ('fmul', ('fsin', 'x'), 0.99997)),
(('fcos', 'x(is_not_const)'), ('fmul', ('fcos', 'x'), 0.99997)),
]
LIMIT_TRIG_INPUT_RANGE_WORKAROUND = [
(('fsin', 'x(is_not_const)'), ('fsin', ('fmod', 'x', 2.0 * pi))),
(('fcos', 'x(is_not_const)'), ('fcos', ('fmod', 'x', 2.0 * pi))),
]
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-p', '--import-path', required=True)
args = parser.parse_args()
sys.path.insert(0, args.import_path)
run()
def run():
import nir_algebraic # pylint: disable=import-error
print('#include "brw_nir.h"')
print(nir_algebraic.AlgebraicPass("brw_nir_apply_trig_workarounds",
TRIG_WORKAROUNDS).render())
print(nir_algebraic.AlgebraicPass("brw_nir_limit_trig_input_range_workaround",
LIMIT_TRIG_INPUT_RANGE_WORKAROUND).render())
if __name__ == '__main__':
main()
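# The render() calls above emit nir_algebraic passes with the usual
# generated signature, e.g.
#
#    bool brw_nir_apply_trig_workarounds(nir_shader *shader);
#
# which callers run like any other NIR optimization pass.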

View file

@ -0,0 +1,75 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_reg.h"
union fu {
float f;
unsigned u;
struct {
unsigned mantissa:23;
unsigned exponent:8;
unsigned sign:1;
} s;
};
int
brw_float_to_vf(float f)
{
union fu fu = { .f = f };
/* ±0.0f is special cased. */
if (f == 0.0f)
return fu.s.sign << 7;
unsigned mantissa = fu.s.mantissa >> (23 - 4);
unsigned exponent = fu.s.exponent - (127 - 3);
unsigned vf = (fu.s.sign << 7) | (exponent << 4) | mantissa;
/* 0.125 would have had the same representation as 0.0, so reject it. */
if ((vf & 0x7f) == 0)
return -1;
/* Make sure the mantissa fits in 4-bits and the exponent in 3-bits. */
if (fu.u & 0x7ffff || exponent > 7)
return -1;
return vf;
}
float
brw_vf_to_float(unsigned char vf)
{
union fu fu;
/* ±0.0f is special cased. */
if (vf == 0x00 || vf == 0x80) {
fu.u = (unsigned)vf << 24;
return fu.f;
}
fu.s.sign = vf >> 7;
fu.s.exponent = ((vf & 0x70) >> 4) + (127 - 3);
fu.s.mantissa = (vf & 0xf) << (23 - 4);
return fu.f;
}
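/* Worked examples, following directly from the encodings above:
 *
 *    brw_float_to_vf(0.5f)  == 0x20  (sign 0, exponent 126 - 124 = 2, mantissa 0)
 *    brw_float_to_vf(-2.0f) == 0xc0  (sign 1, exponent 128 - 124 = 4, mantissa 0)
 *    brw_float_to_vf(0.1f)  == -1    (mantissa does not fit in 4 bits)
 *    brw_vf_to_float(0x30)  == 1.0f
 */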

View file

@ -0,0 +1,243 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_shader.h"
using namespace brw;
/** @file brw_predicated_break.cpp
*
* Loops are often structured as
*
* loop:
* CMP.f0
* (+f0) IF
* BREAK
* ENDIF
* ...
* WHILE loop
*
* This peephole pass removes the IF and ENDIF instructions and predicates the
* BREAK, dropping two instructions from the loop body.
*
* If the loop was a DO { ... } WHILE loop, it looks like
*
* loop:
* ...
* CMP.f0
* (+f0) IF
* BREAK
* ENDIF
* WHILE loop
*
* and we can remove the BREAK instruction and predicate the WHILE.
*/
#define MAX_NESTING 128
struct loop_continue_tracking {
BITSET_WORD has_continue[BITSET_WORDS(MAX_NESTING)];
unsigned depth;
};
static void
enter_loop(struct loop_continue_tracking *s)
{
s->depth++;
/* Any loop nested deeper than the maximum will just re-use the last
 * flag. This simplifies most of the code. MAX_NESTING is chosen to be
 * large enough that it is unlikely to occur. Even if it does, the
 * optimization that uses this tracking is unlikely to make much
 * difference.
 */
if (s->depth < MAX_NESTING)
BITSET_CLEAR(s->has_continue, s->depth);
}
static void
exit_loop(struct loop_continue_tracking *s)
{
assert(s->depth > 0);
s->depth--;
}
static void
set_continue(struct loop_continue_tracking *s)
{
const unsigned i = MIN2(s->depth, MAX_NESTING - 1);
BITSET_SET(s->has_continue, i);
}
static bool
has_continue(const struct loop_continue_tracking *s)
{
const unsigned i = MIN2(s->depth, MAX_NESTING - 1);
return BITSET_TEST(s->has_continue, i);
}
bool
opt_predicated_break(backend_shader *s)
{
bool progress = false;
struct loop_continue_tracking state = { {0, }, 0 };
foreach_block (block, s->cfg) {
/* DO instructions, by definition, can only be found at the beginning of
* basic blocks.
*/
backend_instruction *const do_inst = block->start();
/* BREAK, CONTINUE, and WHILE instructions, by definition, can only be
* found at the ends of basic blocks.
*/
backend_instruction *jump_inst = block->end();
if (do_inst->opcode == BRW_OPCODE_DO)
enter_loop(&state);
if (jump_inst->opcode == BRW_OPCODE_CONTINUE)
set_continue(&state);
else if (jump_inst->opcode == BRW_OPCODE_WHILE)
exit_loop(&state);
if (block->start_ip != block->end_ip)
continue;
if (jump_inst->opcode != BRW_OPCODE_BREAK &&
jump_inst->opcode != BRW_OPCODE_CONTINUE)
continue;
backend_instruction *if_inst = block->prev()->end();
if (if_inst->opcode != BRW_OPCODE_IF)
continue;
backend_instruction *endif_inst = block->next()->start();
if (endif_inst->opcode != BRW_OPCODE_ENDIF)
continue;
bblock_t *jump_block = block;
bblock_t *if_block = jump_block->prev();
bblock_t *endif_block = jump_block->next();
jump_inst->predicate = if_inst->predicate;
jump_inst->predicate_inverse = if_inst->predicate_inverse;
bblock_t *earlier_block = if_block;
if (if_block->start_ip == if_block->end_ip) {
earlier_block = if_block->prev();
}
if_inst->remove(if_block);
bblock_t *later_block = endif_block;
if (endif_block->start_ip == endif_block->end_ip) {
later_block = endif_block->next();
}
endif_inst->remove(endif_block);
if (!earlier_block->ends_with_control_flow()) {
/* FIXME: There is a potential problem here. If earlier_block starts
* with a DO instruction, this will delete the physical link to the
* WHILE block. It is unclear whether ENDIF has the same potential
* problem.
*/
assert(earlier_block->start() == NULL ||
earlier_block->start()->opcode != BRW_OPCODE_DO);
earlier_block->unlink_children();
earlier_block->add_successor(s->cfg->mem_ctx, jump_block,
bblock_link_logical);
}
if (!later_block->starts_with_control_flow()) {
later_block->unlink_parents();
}
/* If jump_block already has a link to later_block, don't create another
* one. Instead, promote the link to logical.
*/
bool need_to_link = true;
foreach_list_typed(bblock_link, link, link, &jump_block->children) {
if (link->block == later_block) {
assert(later_block->starts_with_control_flow());
/* Update the link from later_block back to jump_block. */
foreach_list_typed(bblock_link, parent_link, link, &later_block->parents) {
if (parent_link->block == jump_block) {
parent_link->kind = bblock_link_logical;
}
}
/* Update the link from jump_block to later_block. */
link->kind = bblock_link_logical;
need_to_link = false;
}
}
if (need_to_link) {
jump_block->add_successor(s->cfg->mem_ctx, later_block,
bblock_link_logical);
}
if (earlier_block->can_combine_with(jump_block)) {
earlier_block->combine_with(jump_block);
block = earlier_block;
}
/* Now look at the first instruction of the block following the BREAK. If
* it's a WHILE, we can delete the break, predicate the WHILE, and join
* the two basic blocks.
*
* This optimization can only be applied if the only instruction that
* can transfer control to the WHILE is the BREAK. If other paths can
* lead to the while, the flags may be in an unknown state, and the loop
* could terminate prematurely. This can occur if the loop contains a
* CONT instruction.
*/
bblock_t *while_block = earlier_block->next();
backend_instruction *while_inst = while_block->start();
if (jump_inst->opcode == BRW_OPCODE_BREAK &&
while_inst->opcode == BRW_OPCODE_WHILE &&
while_inst->predicate == BRW_PREDICATE_NONE &&
!has_continue(&state)) {
jump_inst->remove(earlier_block);
while_inst->predicate = jump_inst->predicate;
while_inst->predicate_inverse = !jump_inst->predicate_inverse;
assert(earlier_block->can_combine_with(while_block));
earlier_block->combine_with(while_block);
}
progress = true;
}
if (progress)
s->invalidate_analysis(DEPENDENCY_BLOCKS | DEPENDENCY_INSTRUCTIONS);
return progress;
}

View file

@ -0,0 +1,50 @@
/*
* Copyright © 2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_PRIM_H
#define BRW_PRIM_H
#define _3DPRIM_POINTLIST 0x01
#define _3DPRIM_LINELIST 0x02
#define _3DPRIM_LINESTRIP 0x03
#define _3DPRIM_TRILIST 0x04
#define _3DPRIM_TRISTRIP 0x05
#define _3DPRIM_TRIFAN 0x06
#define _3DPRIM_QUADLIST 0x07
#define _3DPRIM_QUADSTRIP 0x08
#define _3DPRIM_LINELIST_ADJ 0x09 /* G45+ */
#define _3DPRIM_LINESTRIP_ADJ 0x0A /* G45+ */
#define _3DPRIM_TRILIST_ADJ 0x0B /* G45+ */
#define _3DPRIM_TRISTRIP_ADJ 0x0C /* G45+ */
#define _3DPRIM_TRISTRIP_REVERSE 0x0D
#define _3DPRIM_POLYGON 0x0E
#define _3DPRIM_RECTLIST 0x0F
#define _3DPRIM_LINELOOP 0x10
#define _3DPRIM_POINTLIST_BF 0x11
#define _3DPRIM_LINESTRIP_CONT 0x12
#define _3DPRIM_LINESTRIP_BF 0x13
#define _3DPRIM_LINESTRIP_CONT_BF 0x14
#define _3DPRIM_TRIFAN_NOSTIPPLE 0x16
#define _3DPRIM_PATCHLIST(n) ({ assert(n > 0 && n <= 32); 0x20 + (n - 1); })
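/* e.g. _3DPRIM_PATCHLIST(4) evaluates to 0x23. */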
#endif /* BRW_PRIM_H */

View file

@ -0,0 +1,76 @@
/* -*- c++ -*- */
/*
* Copyright © 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_PRIVATE_H
#define BRW_PRIVATE_H
#include "brw_compiler.h"
#include <variant>
unsigned brw_required_dispatch_width(const struct shader_info *info);
static constexpr int SIMD_COUNT = 3;
struct brw_simd_selection_state {
const struct intel_device_info *devinfo;
std::variant<struct brw_cs_prog_data *,
struct brw_bs_prog_data *> prog_data;
unsigned required_width;
const char *error[SIMD_COUNT];
bool compiled[SIMD_COUNT];
bool spilled[SIMD_COUNT];
};
inline int brw_simd_first_compiled(const brw_simd_selection_state &state)
{
for (int i = 0; i < SIMD_COUNT; i++) {
if (state.compiled[i])
return i;
}
return -1;
}
inline bool brw_simd_any_compiled(const brw_simd_selection_state &state)
{
return brw_simd_first_compiled(state) >= 0;
}
bool brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd);
void brw_simd_mark_compiled(brw_simd_selection_state &state, unsigned simd, bool spilled);
int brw_simd_select(const brw_simd_selection_state &state);
int brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
const struct brw_cs_prog_data *prog_data,
const unsigned *sizes);
bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag);
#endif // BRW_PRIVATE_H

File diff suppressed because it is too large.

View file

@ -0,0 +1,563 @@
/*
* Copyright © 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_reg.h"
#include "brw_eu_defines.h"
#include "dev/intel_device_info.h"
#define INVALID (-1)
enum hw_reg_type {
BRW_HW_REG_TYPE_UD = 0,
BRW_HW_REG_TYPE_D = 1,
BRW_HW_REG_TYPE_UW = 2,
BRW_HW_REG_TYPE_W = 3,
BRW_HW_REG_TYPE_F = 7,
GFX8_HW_REG_TYPE_UQ = 8,
GFX8_HW_REG_TYPE_Q = 9,
BRW_HW_REG_TYPE_UB = 4,
BRW_HW_REG_TYPE_B = 5,
GFX7_HW_REG_TYPE_DF = 6,
GFX8_HW_REG_TYPE_HF = 10,
GFX11_HW_REG_TYPE_UD = 0,
GFX11_HW_REG_TYPE_D = 1,
GFX11_HW_REG_TYPE_UW = 2,
GFX11_HW_REG_TYPE_W = 3,
GFX11_HW_REG_TYPE_UB = 4,
GFX11_HW_REG_TYPE_B = 5,
GFX11_HW_REG_TYPE_UQ = 6,
GFX11_HW_REG_TYPE_Q = 7,
GFX11_HW_REG_TYPE_HF = 8,
GFX11_HW_REG_TYPE_F = 9,
GFX11_HW_REG_TYPE_DF = 10,
GFX11_HW_REG_TYPE_NF = 11,
};
enum hw_imm_type {
BRW_HW_IMM_TYPE_UD = 0,
BRW_HW_IMM_TYPE_D = 1,
BRW_HW_IMM_TYPE_UW = 2,
BRW_HW_IMM_TYPE_W = 3,
BRW_HW_IMM_TYPE_F = 7,
GFX8_HW_IMM_TYPE_UQ = 8,
GFX8_HW_IMM_TYPE_Q = 9,
BRW_HW_IMM_TYPE_UV = 4,
BRW_HW_IMM_TYPE_VF = 5,
BRW_HW_IMM_TYPE_V = 6,
GFX8_HW_IMM_TYPE_DF = 10,
GFX8_HW_IMM_TYPE_HF = 11,
GFX11_HW_IMM_TYPE_UD = 0,
GFX11_HW_IMM_TYPE_D = 1,
GFX11_HW_IMM_TYPE_UW = 2,
GFX11_HW_IMM_TYPE_W = 3,
GFX11_HW_IMM_TYPE_UV = 4,
GFX11_HW_IMM_TYPE_V = 5,
GFX11_HW_IMM_TYPE_UQ = 6,
GFX11_HW_IMM_TYPE_Q = 7,
GFX11_HW_IMM_TYPE_HF = 8,
GFX11_HW_IMM_TYPE_F = 9,
GFX11_HW_IMM_TYPE_DF = 10,
GFX11_HW_IMM_TYPE_VF = 11,
};
#define GFX12_HW_REG_TYPE_UINT(n) (n)
#define GFX12_HW_REG_TYPE_SINT(n) (0x4 | (n))
#define GFX12_HW_REG_TYPE_FLOAT(n) (0x8 | (n))
static const struct hw_type {
enum hw_reg_type reg_type;
enum hw_imm_type imm_type;
} gfx4_hw_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
[BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F },
[BRW_REGISTER_TYPE_VF] = { INVALID, BRW_HW_IMM_TYPE_VF },
[BRW_REGISTER_TYPE_D] = { BRW_HW_REG_TYPE_D, BRW_HW_IMM_TYPE_D },
[BRW_REGISTER_TYPE_UD] = { BRW_HW_REG_TYPE_UD, BRW_HW_IMM_TYPE_UD },
[BRW_REGISTER_TYPE_W] = { BRW_HW_REG_TYPE_W, BRW_HW_IMM_TYPE_W },
[BRW_REGISTER_TYPE_UW] = { BRW_HW_REG_TYPE_UW, BRW_HW_IMM_TYPE_UW },
[BRW_REGISTER_TYPE_B] = { BRW_HW_REG_TYPE_B, INVALID },
[BRW_REGISTER_TYPE_UB] = { BRW_HW_REG_TYPE_UB, INVALID },
[BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V },
}, gfx6_hw_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
[BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F },
[BRW_REGISTER_TYPE_VF] = { INVALID, BRW_HW_IMM_TYPE_VF },
[BRW_REGISTER_TYPE_D] = { BRW_HW_REG_TYPE_D, BRW_HW_IMM_TYPE_D },
[BRW_REGISTER_TYPE_UD] = { BRW_HW_REG_TYPE_UD, BRW_HW_IMM_TYPE_UD },
[BRW_REGISTER_TYPE_W] = { BRW_HW_REG_TYPE_W, BRW_HW_IMM_TYPE_W },
[BRW_REGISTER_TYPE_UW] = { BRW_HW_REG_TYPE_UW, BRW_HW_IMM_TYPE_UW },
[BRW_REGISTER_TYPE_B] = { BRW_HW_REG_TYPE_B, INVALID },
[BRW_REGISTER_TYPE_UB] = { BRW_HW_REG_TYPE_UB, INVALID },
[BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V },
[BRW_REGISTER_TYPE_UV] = { INVALID, BRW_HW_IMM_TYPE_UV },
}, gfx7_hw_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
[BRW_REGISTER_TYPE_DF] = { GFX7_HW_REG_TYPE_DF, INVALID },
[BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F },
[BRW_REGISTER_TYPE_VF] = { INVALID, BRW_HW_IMM_TYPE_VF },
[BRW_REGISTER_TYPE_D] = { BRW_HW_REG_TYPE_D, BRW_HW_IMM_TYPE_D },
[BRW_REGISTER_TYPE_UD] = { BRW_HW_REG_TYPE_UD, BRW_HW_IMM_TYPE_UD },
[BRW_REGISTER_TYPE_W] = { BRW_HW_REG_TYPE_W, BRW_HW_IMM_TYPE_W },
[BRW_REGISTER_TYPE_UW] = { BRW_HW_REG_TYPE_UW, BRW_HW_IMM_TYPE_UW },
[BRW_REGISTER_TYPE_B] = { BRW_HW_REG_TYPE_B, INVALID },
[BRW_REGISTER_TYPE_UB] = { BRW_HW_REG_TYPE_UB, INVALID },
[BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V },
[BRW_REGISTER_TYPE_UV] = { INVALID, BRW_HW_IMM_TYPE_UV },
}, gfx8_hw_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
[BRW_REGISTER_TYPE_DF] = { GFX7_HW_REG_TYPE_DF, GFX8_HW_IMM_TYPE_DF },
[BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F },
[BRW_REGISTER_TYPE_HF] = { GFX8_HW_REG_TYPE_HF, GFX8_HW_IMM_TYPE_HF },
[BRW_REGISTER_TYPE_VF] = { INVALID, BRW_HW_IMM_TYPE_VF },
[BRW_REGISTER_TYPE_Q] = { GFX8_HW_REG_TYPE_Q, GFX8_HW_IMM_TYPE_Q },
[BRW_REGISTER_TYPE_UQ] = { GFX8_HW_REG_TYPE_UQ, GFX8_HW_IMM_TYPE_UQ },
[BRW_REGISTER_TYPE_D] = { BRW_HW_REG_TYPE_D, BRW_HW_IMM_TYPE_D },
[BRW_REGISTER_TYPE_UD] = { BRW_HW_REG_TYPE_UD, BRW_HW_IMM_TYPE_UD },
[BRW_REGISTER_TYPE_W] = { BRW_HW_REG_TYPE_W, BRW_HW_IMM_TYPE_W },
[BRW_REGISTER_TYPE_UW] = { BRW_HW_REG_TYPE_UW, BRW_HW_IMM_TYPE_UW },
[BRW_REGISTER_TYPE_B] = { BRW_HW_REG_TYPE_B, INVALID },
[BRW_REGISTER_TYPE_UB] = { BRW_HW_REG_TYPE_UB, INVALID },
[BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V },
[BRW_REGISTER_TYPE_UV] = { INVALID, BRW_HW_IMM_TYPE_UV },
}, gfx11_hw_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
[BRW_REGISTER_TYPE_NF] = { GFX11_HW_REG_TYPE_NF, INVALID },
[BRW_REGISTER_TYPE_F] = { GFX11_HW_REG_TYPE_F, GFX11_HW_IMM_TYPE_F },
[BRW_REGISTER_TYPE_HF] = { GFX11_HW_REG_TYPE_HF, GFX11_HW_IMM_TYPE_HF },
[BRW_REGISTER_TYPE_VF] = { INVALID, GFX11_HW_IMM_TYPE_VF },
[BRW_REGISTER_TYPE_D] = { GFX11_HW_REG_TYPE_D, GFX11_HW_IMM_TYPE_D },
[BRW_REGISTER_TYPE_UD] = { GFX11_HW_REG_TYPE_UD, GFX11_HW_IMM_TYPE_UD },
[BRW_REGISTER_TYPE_W] = { GFX11_HW_REG_TYPE_W, GFX11_HW_IMM_TYPE_W },
[BRW_REGISTER_TYPE_UW] = { GFX11_HW_REG_TYPE_UW, GFX11_HW_IMM_TYPE_UW },
[BRW_REGISTER_TYPE_B] = { GFX11_HW_REG_TYPE_B, INVALID },
[BRW_REGISTER_TYPE_UB] = { GFX11_HW_REG_TYPE_UB, INVALID },
[BRW_REGISTER_TYPE_V] = { INVALID, GFX11_HW_IMM_TYPE_V },
[BRW_REGISTER_TYPE_UV] = { INVALID, GFX11_HW_IMM_TYPE_UV },
}, gfx12_hw_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
[BRW_REGISTER_TYPE_F] = { GFX12_HW_REG_TYPE_FLOAT(2), GFX12_HW_REG_TYPE_FLOAT(2) },
[BRW_REGISTER_TYPE_HF] = { GFX12_HW_REG_TYPE_FLOAT(1), GFX12_HW_REG_TYPE_FLOAT(1) },
[BRW_REGISTER_TYPE_VF] = { INVALID, GFX12_HW_REG_TYPE_FLOAT(0) },
[BRW_REGISTER_TYPE_D] = { GFX12_HW_REG_TYPE_SINT(2), GFX12_HW_REG_TYPE_SINT(2) },
[BRW_REGISTER_TYPE_UD] = { GFX12_HW_REG_TYPE_UINT(2), GFX12_HW_REG_TYPE_UINT(2) },
[BRW_REGISTER_TYPE_W] = { GFX12_HW_REG_TYPE_SINT(1), GFX12_HW_REG_TYPE_SINT(1) },
[BRW_REGISTER_TYPE_UW] = { GFX12_HW_REG_TYPE_UINT(1), GFX12_HW_REG_TYPE_UINT(1) },
[BRW_REGISTER_TYPE_B] = { GFX12_HW_REG_TYPE_SINT(0), INVALID },
[BRW_REGISTER_TYPE_UB] = { GFX12_HW_REG_TYPE_UINT(0), INVALID },
[BRW_REGISTER_TYPE_V] = { INVALID, GFX12_HW_REG_TYPE_SINT(0) },
[BRW_REGISTER_TYPE_UV] = { INVALID, GFX12_HW_REG_TYPE_UINT(0) },
}, gfx125_hw_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
[BRW_REGISTER_TYPE_DF] = { GFX12_HW_REG_TYPE_FLOAT(3), GFX12_HW_REG_TYPE_FLOAT(3) },
[BRW_REGISTER_TYPE_F] = { GFX12_HW_REG_TYPE_FLOAT(2), GFX12_HW_REG_TYPE_FLOAT(2) },
[BRW_REGISTER_TYPE_HF] = { GFX12_HW_REG_TYPE_FLOAT(1), GFX12_HW_REG_TYPE_FLOAT(1) },
[BRW_REGISTER_TYPE_VF] = { INVALID, GFX12_HW_REG_TYPE_FLOAT(0) },
[BRW_REGISTER_TYPE_Q] = { GFX12_HW_REG_TYPE_SINT(3), GFX12_HW_REG_TYPE_SINT(3) },
[BRW_REGISTER_TYPE_UQ] = { GFX12_HW_REG_TYPE_UINT(3), GFX12_HW_REG_TYPE_UINT(3) },
[BRW_REGISTER_TYPE_D] = { GFX12_HW_REG_TYPE_SINT(2), GFX12_HW_REG_TYPE_SINT(2) },
[BRW_REGISTER_TYPE_UD] = { GFX12_HW_REG_TYPE_UINT(2), GFX12_HW_REG_TYPE_UINT(2) },
[BRW_REGISTER_TYPE_W] = { GFX12_HW_REG_TYPE_SINT(1), GFX12_HW_REG_TYPE_SINT(1) },
[BRW_REGISTER_TYPE_UW] = { GFX12_HW_REG_TYPE_UINT(1), GFX12_HW_REG_TYPE_UINT(1) },
[BRW_REGISTER_TYPE_B] = { GFX12_HW_REG_TYPE_SINT(0), INVALID },
[BRW_REGISTER_TYPE_UB] = { GFX12_HW_REG_TYPE_UINT(0), INVALID },
[BRW_REGISTER_TYPE_V] = { INVALID, GFX12_HW_REG_TYPE_SINT(0) },
[BRW_REGISTER_TYPE_UV] = { INVALID, GFX12_HW_REG_TYPE_UINT(0) },
};
/* SNB adds 3-src instructions (MAD and LRP) that only operate on floats, so
* the types were implied. IVB adds BFE and BFI2 that operate on doublewords
* and unsigned doublewords, so a new field is also available in the da3src
* struct (part of struct brw_instruction.bits1 in brw_structs.h) to select
* dst and shared-src types.
*
* CNL adds support for 3-src instructions in align1 mode, and with it support
* for most register types.
*/
enum hw_3src_reg_type {
GFX7_3SRC_TYPE_F = 0,
GFX7_3SRC_TYPE_D = 1,
GFX7_3SRC_TYPE_UD = 2,
GFX7_3SRC_TYPE_DF = 3,
GFX8_3SRC_TYPE_HF = 4,
/** When ExecutionDatatype is 1: @{ */
GFX10_ALIGN1_3SRC_REG_TYPE_HF = 0b000,
GFX10_ALIGN1_3SRC_REG_TYPE_F = 0b001,
GFX10_ALIGN1_3SRC_REG_TYPE_DF = 0b010,
GFX11_ALIGN1_3SRC_REG_TYPE_NF = 0b011,
/** @} */
/** When ExecutionDatatype is 0: @{ */
GFX10_ALIGN1_3SRC_REG_TYPE_UD = 0b000,
GFX10_ALIGN1_3SRC_REG_TYPE_D = 0b001,
GFX10_ALIGN1_3SRC_REG_TYPE_UW = 0b010,
GFX10_ALIGN1_3SRC_REG_TYPE_W = 0b011,
GFX10_ALIGN1_3SRC_REG_TYPE_UB = 0b100,
GFX10_ALIGN1_3SRC_REG_TYPE_B = 0b101,
/** @} */
};
static const struct hw_3src_type {
enum hw_3src_reg_type reg_type;
enum gfx10_align1_3src_exec_type exec_type;
} gfx6_hw_3src_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
[BRW_REGISTER_TYPE_F] = { GFX7_3SRC_TYPE_F },
}, gfx7_hw_3src_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
[BRW_REGISTER_TYPE_F] = { GFX7_3SRC_TYPE_F },
[BRW_REGISTER_TYPE_D] = { GFX7_3SRC_TYPE_D },
[BRW_REGISTER_TYPE_UD] = { GFX7_3SRC_TYPE_UD },
[BRW_REGISTER_TYPE_DF] = { GFX7_3SRC_TYPE_DF },
}, gfx8_hw_3src_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
[BRW_REGISTER_TYPE_F] = { GFX7_3SRC_TYPE_F },
[BRW_REGISTER_TYPE_D] = { GFX7_3SRC_TYPE_D },
[BRW_REGISTER_TYPE_UD] = { GFX7_3SRC_TYPE_UD },
[BRW_REGISTER_TYPE_DF] = { GFX7_3SRC_TYPE_DF },
[BRW_REGISTER_TYPE_HF] = { GFX8_3SRC_TYPE_HF },
}, gfx10_hw_3src_align1_type[] = {
#define E(x) BRW_ALIGN1_3SRC_EXEC_TYPE_##x
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
[BRW_REGISTER_TYPE_DF] = { GFX10_ALIGN1_3SRC_REG_TYPE_DF, E(FLOAT) },
[BRW_REGISTER_TYPE_F] = { GFX10_ALIGN1_3SRC_REG_TYPE_F, E(FLOAT) },
[BRW_REGISTER_TYPE_HF] = { GFX10_ALIGN1_3SRC_REG_TYPE_HF, E(FLOAT) },
[BRW_REGISTER_TYPE_D] = { GFX10_ALIGN1_3SRC_REG_TYPE_D, E(INT) },
[BRW_REGISTER_TYPE_UD] = { GFX10_ALIGN1_3SRC_REG_TYPE_UD, E(INT) },
[BRW_REGISTER_TYPE_W] = { GFX10_ALIGN1_3SRC_REG_TYPE_W, E(INT) },
[BRW_REGISTER_TYPE_UW] = { GFX10_ALIGN1_3SRC_REG_TYPE_UW, E(INT) },
[BRW_REGISTER_TYPE_B] = { GFX10_ALIGN1_3SRC_REG_TYPE_B, E(INT) },
[BRW_REGISTER_TYPE_UB] = { GFX10_ALIGN1_3SRC_REG_TYPE_UB, E(INT) },
}, gfx11_hw_3src_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
[BRW_REGISTER_TYPE_NF] = { GFX11_ALIGN1_3SRC_REG_TYPE_NF, E(FLOAT) },
[BRW_REGISTER_TYPE_F] = { GFX10_ALIGN1_3SRC_REG_TYPE_F, E(FLOAT) },
[BRW_REGISTER_TYPE_HF] = { GFX10_ALIGN1_3SRC_REG_TYPE_HF, E(FLOAT) },
[BRW_REGISTER_TYPE_D] = { GFX10_ALIGN1_3SRC_REG_TYPE_D, E(INT) },
[BRW_REGISTER_TYPE_UD] = { GFX10_ALIGN1_3SRC_REG_TYPE_UD, E(INT) },
[BRW_REGISTER_TYPE_W] = { GFX10_ALIGN1_3SRC_REG_TYPE_W, E(INT) },
[BRW_REGISTER_TYPE_UW] = { GFX10_ALIGN1_3SRC_REG_TYPE_UW, E(INT) },
[BRW_REGISTER_TYPE_B] = { GFX10_ALIGN1_3SRC_REG_TYPE_B, E(INT) },
[BRW_REGISTER_TYPE_UB] = { GFX10_ALIGN1_3SRC_REG_TYPE_UB, E(INT) },
}, gfx12_hw_3src_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
[BRW_REGISTER_TYPE_F] = { GFX12_HW_REG_TYPE_UINT(2), E(FLOAT), },
[BRW_REGISTER_TYPE_HF] = { GFX12_HW_REG_TYPE_UINT(1), E(FLOAT), },
[BRW_REGISTER_TYPE_D] = { GFX12_HW_REG_TYPE_SINT(2), E(INT), },
[BRW_REGISTER_TYPE_UD] = { GFX12_HW_REG_TYPE_UINT(2), E(INT), },
[BRW_REGISTER_TYPE_W] = { GFX12_HW_REG_TYPE_SINT(1), E(INT), },
[BRW_REGISTER_TYPE_UW] = { GFX12_HW_REG_TYPE_UINT(1), E(INT), },
[BRW_REGISTER_TYPE_B] = { GFX12_HW_REG_TYPE_SINT(0), E(INT), },
[BRW_REGISTER_TYPE_UB] = { GFX12_HW_REG_TYPE_UINT(0), E(INT), },
}, gfx125_hw_3src_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
[BRW_REGISTER_TYPE_DF] = { GFX12_HW_REG_TYPE_UINT(3), E(FLOAT), },
[BRW_REGISTER_TYPE_F] = { GFX12_HW_REG_TYPE_UINT(2), E(FLOAT), },
[BRW_REGISTER_TYPE_HF] = { GFX12_HW_REG_TYPE_UINT(1), E(FLOAT), },
[BRW_REGISTER_TYPE_Q] = { GFX12_HW_REG_TYPE_SINT(3), E(INT), },
[BRW_REGISTER_TYPE_UQ] = { GFX12_HW_REG_TYPE_UINT(3), E(INT), },
[BRW_REGISTER_TYPE_D] = { GFX12_HW_REG_TYPE_SINT(2), E(INT), },
[BRW_REGISTER_TYPE_UD] = { GFX12_HW_REG_TYPE_UINT(2), E(INT), },
[BRW_REGISTER_TYPE_W] = { GFX12_HW_REG_TYPE_SINT(1), E(INT), },
[BRW_REGISTER_TYPE_UW] = { GFX12_HW_REG_TYPE_UINT(1), E(INT), },
[BRW_REGISTER_TYPE_B] = { GFX12_HW_REG_TYPE_SINT(0), E(INT), },
[BRW_REGISTER_TYPE_UB] = { GFX12_HW_REG_TYPE_UINT(0), E(INT), },
#undef E
};
/**
* Convert a brw_reg_type enumeration value into the hardware representation.
*
* The hardware encoding may depend on whether the value is an immediate.
*/
unsigned
brw_reg_type_to_hw_type(const struct intel_device_info *devinfo,
enum brw_reg_file file,
enum brw_reg_type type)
{
const struct hw_type *table;
if (devinfo->verx10 >= 125) {
assert(type < ARRAY_SIZE(gfx125_hw_type));
table = gfx125_hw_type;
} else if (devinfo->ver >= 12) {
assert(type < ARRAY_SIZE(gfx12_hw_type));
table = gfx12_hw_type;
} else if (devinfo->ver >= 11) {
assert(type < ARRAY_SIZE(gfx11_hw_type));
table = gfx11_hw_type;
} else if (devinfo->ver >= 8) {
assert(type < ARRAY_SIZE(gfx8_hw_type));
table = gfx8_hw_type;
} else if (devinfo->ver >= 7) {
assert(type < ARRAY_SIZE(gfx7_hw_type));
table = gfx7_hw_type;
} else if (devinfo->ver >= 6) {
assert(type < ARRAY_SIZE(gfx6_hw_type));
table = gfx6_hw_type;
} else {
assert(type < ARRAY_SIZE(gfx4_hw_type));
table = gfx4_hw_type;
}
if (file == BRW_IMMEDIATE_VALUE) {
assert(table[type].imm_type != (enum hw_imm_type)INVALID);
return table[type].imm_type;
} else {
assert(table[type].reg_type != (enum hw_reg_type)INVALID);
return table[type].reg_type;
}
}
/**
* Convert the hardware representation into a brw_reg_type enumeration value.
*
* The hardware encoding may depend on whether the value is an immediate.
*/
enum brw_reg_type
brw_hw_type_to_reg_type(const struct intel_device_info *devinfo,
enum brw_reg_file file, unsigned hw_type)
{
const struct hw_type *table;
if (devinfo->verx10 >= 125) {
table = gfx125_hw_type;
} else if (devinfo->ver >= 12) {
table = gfx12_hw_type;
} else if (devinfo->ver >= 11) {
table = gfx11_hw_type;
} else if (devinfo->ver >= 8) {
table = gfx8_hw_type;
} else if (devinfo->ver >= 7) {
table = gfx7_hw_type;
} else if (devinfo->ver >= 6) {
table = gfx6_hw_type;
} else {
table = gfx4_hw_type;
}
if (file == BRW_IMMEDIATE_VALUE) {
for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
if (table[i].imm_type == (enum hw_imm_type)hw_type) {
return i;
}
}
} else {
for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
if (table[i].reg_type == (enum hw_reg_type)hw_type) {
return i;
}
}
}
return INVALID_REG_TYPE;
}
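/* A hedged sanity check (hypothetical test code, not part of this file):
 * for every type that has a valid encoding on a given platform, the two
 * conversions above should round-trip.
 *
 *    for (enum brw_reg_type t = 0; t <= BRW_REGISTER_TYPE_LAST; t++) {
 *       unsigned hw = brw_reg_type_to_hw_type(devinfo, file, t);
 *       assert(brw_hw_type_to_reg_type(devinfo, file, hw) == t);
 *    }
 *
 * (skipping types whose encoding is INVALID, since brw_reg_type_to_hw_type
 * asserts on those).
 */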
/**
* Convert a brw_reg_type enumeration value into the hardware representation
* for a 3-src align16 instruction
*/
unsigned
brw_reg_type_to_a16_hw_3src_type(const struct intel_device_info *devinfo,
enum brw_reg_type type)
{
const struct hw_3src_type *table;
if (devinfo->ver >= 8) {
assert(type < ARRAY_SIZE(gfx8_hw_3src_type));
table = gfx8_hw_3src_type;
} else if (devinfo->ver >= 7) {
assert(type < ARRAY_SIZE(gfx7_hw_3src_type));
table = gfx7_hw_3src_type;
} else {
assert(type < ARRAY_SIZE(gfx6_hw_3src_type));
table = gfx6_hw_3src_type;
}
assert(table[type].reg_type != (enum hw_3src_reg_type)INVALID);
return table[type].reg_type;
}
/**
* Convert a brw_reg_type enumeration value into the hardware representation
* for a 3-src align1 instruction
*/
unsigned
brw_reg_type_to_a1_hw_3src_type(const struct intel_device_info *devinfo,
enum brw_reg_type type)
{
if (devinfo->verx10 >= 125) {
assert(type < ARRAY_SIZE(gfx125_hw_3src_type));
return gfx125_hw_3src_type[type].reg_type;
} else if (devinfo->ver >= 12) {
assert(type < ARRAY_SIZE(gfx12_hw_3src_type));
return gfx12_hw_3src_type[type].reg_type;
} else if (devinfo->ver >= 11) {
assert(type < ARRAY_SIZE(gfx11_hw_3src_type));
return gfx11_hw_3src_type[type].reg_type;
} else {
assert(type < ARRAY_SIZE(gfx10_hw_3src_align1_type));
return gfx10_hw_3src_align1_type[type].reg_type;
}
}
/**
* Convert the hardware representation for a 3-src align16 instruction into a
* brw_reg_type enumeration value.
*/
enum brw_reg_type
brw_a16_hw_3src_type_to_reg_type(const struct intel_device_info *devinfo,
unsigned hw_type)
{
const struct hw_3src_type *table = NULL;
if (devinfo->ver >= 8) {
table = gfx8_hw_3src_type;
} else if (devinfo->ver >= 7) {
table = gfx7_hw_3src_type;
} else if (devinfo->ver >= 6) {
table = gfx6_hw_3src_type;
}
for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
if (table[i].reg_type == hw_type) {
return i;
}
}
return INVALID_REG_TYPE;
}
/**
* Convert the hardware representation for a 3-src align1 instruction into a
* brw_reg_type enumeration value.
*/
enum brw_reg_type
brw_a1_hw_3src_type_to_reg_type(const struct intel_device_info *devinfo,
unsigned hw_type, unsigned exec_type)
{
const struct hw_3src_type *table =
(devinfo->verx10 >= 125 ? gfx125_hw_3src_type :
devinfo->ver >= 12 ? gfx12_hw_3src_type :
devinfo->ver >= 11 ? gfx11_hw_3src_type :
gfx10_hw_3src_align1_type);
for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
if (table[i].reg_type == hw_type &&
table[i].exec_type == exec_type) {
return i;
}
}
return INVALID_REG_TYPE;
}
/**
* Return the element size given a register type.
*/
unsigned
brw_reg_type_to_size(enum brw_reg_type type)
{
static const unsigned type_size[] = {
[BRW_REGISTER_TYPE_NF] = 8,
[BRW_REGISTER_TYPE_DF] = 8,
[BRW_REGISTER_TYPE_F] = 4,
[BRW_REGISTER_TYPE_HF] = 2,
[BRW_REGISTER_TYPE_VF] = 4,
[BRW_REGISTER_TYPE_Q] = 8,
[BRW_REGISTER_TYPE_UQ] = 8,
[BRW_REGISTER_TYPE_D] = 4,
[BRW_REGISTER_TYPE_UD] = 4,
[BRW_REGISTER_TYPE_W] = 2,
[BRW_REGISTER_TYPE_UW] = 2,
[BRW_REGISTER_TYPE_B] = 1,
[BRW_REGISTER_TYPE_UB] = 1,
[BRW_REGISTER_TYPE_V] = 2,
[BRW_REGISTER_TYPE_UV] = 2,
};
if (type >= ARRAY_SIZE(type_size))
return -1;
return type_size[type];
}
/**
* Converts a BRW_REGISTER_TYPE_* enum to a short string (F, UD, and so on).
*
* This is different than reg_encoding from brw_disasm.c in that it operates
* on the abstract enum values, rather than the generation-specific encoding.
*/
const char *
brw_reg_type_to_letters(enum brw_reg_type type)
{
static const char letters[][3] = {
[BRW_REGISTER_TYPE_NF] = "NF",
[BRW_REGISTER_TYPE_DF] = "DF",
[BRW_REGISTER_TYPE_F] = "F",
[BRW_REGISTER_TYPE_HF] = "HF",
[BRW_REGISTER_TYPE_VF] = "VF",
[BRW_REGISTER_TYPE_Q] = "Q",
[BRW_REGISTER_TYPE_UQ] = "UQ",
[BRW_REGISTER_TYPE_D] = "D",
[BRW_REGISTER_TYPE_UD] = "UD",
[BRW_REGISTER_TYPE_W] = "W",
[BRW_REGISTER_TYPE_UW] = "UW",
[BRW_REGISTER_TYPE_B] = "B",
[BRW_REGISTER_TYPE_UB] = "UB",
[BRW_REGISTER_TYPE_V] = "V",
[BRW_REGISTER_TYPE_UV] = "UV",
};
if (type >= ARRAY_SIZE(letters))
   return "INVALID";
return letters[type];
}

View file

@ -0,0 +1,209 @@
/*
* Copyright © 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_REG_TYPE_H
#define BRW_REG_TYPE_H
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifdef HAVE_FUNC_ATTRIBUTE_PURE
#define ATTRIBUTE_PURE __attribute__((__pure__))
#else
#define ATTRIBUTE_PURE
#endif
enum brw_reg_file;
struct intel_device_info;
/*
* The ordering has been chosen so that no enum value is the same as a
* compatible hardware encoding.
*/
enum PACKED brw_reg_type {
/** Floating-point types: @{ */
BRW_REGISTER_TYPE_NF, /* >64-bit (accumulator-only) native float (gfx11+) */
BRW_REGISTER_TYPE_DF, /* 64-bit float (double float) */
BRW_REGISTER_TYPE_F, /* 32-bit float */
BRW_REGISTER_TYPE_HF, /* 16-bit float (half float) */
BRW_REGISTER_TYPE_VF, /* 32-bit vector of 4 8-bit floats */
/** @} */
/** Integer types: @{ */
BRW_REGISTER_TYPE_Q, /* 64-bit signed integer (quad word) */
BRW_REGISTER_TYPE_UQ, /* 64-bit unsigned integer (quad word) */
BRW_REGISTER_TYPE_D, /* 32-bit signed integer (double word) */
BRW_REGISTER_TYPE_UD, /* 32-bit unsigned integer (double word) */
BRW_REGISTER_TYPE_W, /* 16-bit signed integer (word) */
BRW_REGISTER_TYPE_UW, /* 16-bit unsigned integer (word) */
BRW_REGISTER_TYPE_B, /* 8-bit signed integer (byte) */
BRW_REGISTER_TYPE_UB, /* 8-bit unsigned integer (byte) */
BRW_REGISTER_TYPE_V, /* vector of 8 signed 4-bit integers (treated as W) */
BRW_REGISTER_TYPE_UV, /* vector of 8 unsigned 4-bit integers (treated as UW) */
/** @} */
BRW_REGISTER_TYPE_LAST = BRW_REGISTER_TYPE_UV
};
static inline bool
brw_reg_type_is_floating_point(enum brw_reg_type type)
{
switch (type) {
case BRW_REGISTER_TYPE_NF:
case BRW_REGISTER_TYPE_DF:
case BRW_REGISTER_TYPE_F:
case BRW_REGISTER_TYPE_HF:
return true;
default:
return false;
}
}
static inline bool
brw_reg_type_is_integer(enum brw_reg_type type)
{
switch (type) {
case BRW_REGISTER_TYPE_Q:
case BRW_REGISTER_TYPE_UQ:
case BRW_REGISTER_TYPE_D:
case BRW_REGISTER_TYPE_UD:
case BRW_REGISTER_TYPE_W:
case BRW_REGISTER_TYPE_UW:
case BRW_REGISTER_TYPE_B:
case BRW_REGISTER_TYPE_UB:
return true;
default:
return false;
}
}
static inline bool
brw_reg_type_is_unsigned_integer(enum brw_reg_type tp)
{
return tp == BRW_REGISTER_TYPE_UB ||
tp == BRW_REGISTER_TYPE_UW ||
tp == BRW_REGISTER_TYPE_UD ||
tp == BRW_REGISTER_TYPE_UQ;
}
/*
* Returns a type based on a reference_type (word, float, half-float) and a
* given bit_size.
*/
static inline enum brw_reg_type
brw_reg_type_from_bit_size(unsigned bit_size,
enum brw_reg_type reference_type)
{
switch(reference_type) {
case BRW_REGISTER_TYPE_HF:
case BRW_REGISTER_TYPE_F:
case BRW_REGISTER_TYPE_DF:
switch(bit_size) {
case 16:
return BRW_REGISTER_TYPE_HF;
case 32:
return BRW_REGISTER_TYPE_F;
case 64:
return BRW_REGISTER_TYPE_DF;
default:
unreachable("Invalid bit size");
}
case BRW_REGISTER_TYPE_B:
case BRW_REGISTER_TYPE_W:
case BRW_REGISTER_TYPE_D:
case BRW_REGISTER_TYPE_Q:
switch(bit_size) {
case 8:
return BRW_REGISTER_TYPE_B;
case 16:
return BRW_REGISTER_TYPE_W;
case 32:
return BRW_REGISTER_TYPE_D;
case 64:
return BRW_REGISTER_TYPE_Q;
default:
unreachable("Invalid bit size");
}
case BRW_REGISTER_TYPE_UB:
case BRW_REGISTER_TYPE_UW:
case BRW_REGISTER_TYPE_UD:
case BRW_REGISTER_TYPE_UQ:
switch(bit_size) {
case 8:
return BRW_REGISTER_TYPE_UB;
case 16:
return BRW_REGISTER_TYPE_UW;
case 32:
return BRW_REGISTER_TYPE_UD;
case 64:
return BRW_REGISTER_TYPE_UQ;
default:
unreachable("Invalid bit size");
}
default:
unreachable("Unknown type");
}
}
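/* Illustrative helper (added; not part of the original header, and the name
 * is hypothetical): the intended use of brw_reg_type_from_bit_size is to
 * resize a type within its class, e.g. (F, 16) -> HF, (D, 64) -> Q,
 * (UD, 8) -> UB.
 */
static inline enum brw_reg_type
brw_reg_type_with_size(enum brw_reg_type type, unsigned bit_size)
{
   return brw_reg_type_from_bit_size(bit_size, type);
}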
#define INVALID_REG_TYPE ((enum brw_reg_type)-1)
#define INVALID_HW_REG_TYPE ((unsigned)-1)
unsigned
brw_reg_type_to_hw_type(const struct intel_device_info *devinfo,
enum brw_reg_file file, enum brw_reg_type type);
enum brw_reg_type ATTRIBUTE_PURE
brw_hw_type_to_reg_type(const struct intel_device_info *devinfo,
enum brw_reg_file file, unsigned hw_type);
unsigned
brw_reg_type_to_a16_hw_3src_type(const struct intel_device_info *devinfo,
enum brw_reg_type type);
unsigned
brw_reg_type_to_a1_hw_3src_type(const struct intel_device_info *devinfo,
enum brw_reg_type type);
enum brw_reg_type
brw_a16_hw_3src_type_to_reg_type(const struct intel_device_info *devinfo,
unsigned hw_type);
enum brw_reg_type
brw_a1_hw_3src_type_to_reg_type(const struct intel_device_info *devinfo,
unsigned hw_type, unsigned exec_type);
unsigned
brw_reg_type_to_size(enum brw_reg_type type);
const char *
brw_reg_type_to_letters(enum brw_reg_type type);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -0,0 +1,292 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_RT_H
#define BRW_RT_H
#include <stdint.h>
#include "compiler/shader_enums.h"
#include "util/macros.h"
#ifdef __cplusplus
extern "C" {
#endif
/** Vulkan defines shaderGroupHandleSize = 32 */
#define BRW_RT_SBT_HANDLE_SIZE 32
/** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
#define BRW_RT_DISPATCH_GLOBALS_SIZE 80
/** Offset after the RT dispatch globals at which "push" constants live */
#define BRW_RT_PUSH_CONST_OFFSET 128
/** Stride of the resume SBT */
#define BRW_BTD_RESUME_SBT_STRIDE 8
/* Vulkan always uses exactly two levels of BVH: world and object. At the API
* level, these are referred to as top and bottom.
*/
enum brw_rt_bvh_level {
BRW_RT_BVH_LEVEL_WORLD = 0,
BRW_RT_BVH_LEVEL_OBJECT = 1,
};
#define BRW_RT_MAX_BVH_LEVELS 2
enum brw_rt_bvh_node_type {
BRW_RT_BVH_NODE_TYPE_INTERNAL = 0,
BRW_RT_BVH_NODE_TYPE_INSTANCE = 1,
BRW_RT_BVH_NODE_TYPE_PROCEDURAL = 3,
BRW_RT_BVH_NODE_TYPE_QUAD = 4,
};
/** HitKind values returned for triangle geometry
*
* This enum must match the SPIR-V enum.
*/
enum brw_rt_hit_kind {
BRW_RT_HIT_KIND_FRONT_FACE = 0xfe,
BRW_RT_HIT_KIND_BACK_FACE = 0xff,
};
/** Ray flags
*
* This enum must match the SPIR-V RayFlags enum.
*/
enum brw_rt_ray_flags {
BRW_RT_RAY_FLAG_FORCE_OPAQUE = 0x01,
BRW_RT_RAY_FLAG_FORCE_NON_OPAQUE = 0x02,
BRW_RT_RAY_FLAG_TERMINATE_ON_FIRST_HIT = 0x04,
BRW_RT_RAY_FLAG_SKIP_CLOSEST_HIT_SHADER = 0x08,
BRW_RT_RAY_FLAG_CULL_BACK_FACING_TRIANGLES = 0x10,
BRW_RT_RAY_FLAG_CULL_FRONT_FACING_TRIANGLES = 0x20,
BRW_RT_RAY_FLAG_CULL_OPAQUE = 0x40,
BRW_RT_RAY_FLAG_CULL_NON_OPAQUE = 0x80,
BRW_RT_RAY_FLAG_SKIP_TRIANGLES = 0x100,
BRW_RT_RAY_FLAG_SKIP_AABBS = 0x200,
};
struct brw_rt_scratch_layout {
/** Number of stack IDs per DSS */
uint32_t stack_ids_per_dss;
/** Start offset (in bytes) of the hardware MemRay stack */
uint32_t ray_stack_start;
/** Stride (in bytes) of the hardware MemRay stack */
uint32_t ray_stack_stride;
/** Start offset (in bytes) of the SW stacks */
uint64_t sw_stack_start;
/** Size (in bytes) of the SW stack for a single shader invocation */
uint32_t sw_stack_size;
/** Total size (in bytes) of the RT scratch memory area */
uint64_t total_size;
};
/** Parameters passed to the raygen trampoline shader
*
 * This struct is carefully constructed to be 32B and must be passed to the
 * raygen trampoline shader as inline constant data.
*/
struct brw_rt_raygen_trampoline_params {
/** The GPU address of the RT_DISPATCH_GLOBALS */
uint64_t rt_disp_globals_addr;
/** The GPU address of the BINDLESS_SHADER_RECORD for the raygen shader */
uint64_t raygen_bsr_addr;
/** 1 if this is an indirect dispatch, 0 otherwise */
uint8_t is_indirect;
/** The integer log2 of the local group size
*
* Ray-tracing shaders don't have a concept of local vs. global workgroup
* size. They only have a single 3D launch size. The raygen trampoline
* shader is always dispatched with a local workgroup size equal to the
* SIMD width but the shape of the local workgroup is determined at
* dispatch time based on the shape of the launch and passed to the
* trampoline via this field. (There's no sense having a Z dimension on
* the local workgroup if the launch is 2D.)
*
* We use the integer log2 of the size because there's no point in
* non-power-of-two sizes and shifts are cheaper than division.
*/
uint8_t local_group_size_log2[3];
uint32_t pad[3];
};
/** Size of the "hot zone" in bytes
*
* The hot zone is a SW-defined data structure which is a single uvec4
 * containing two pieces of information:
*
* - hotzone.x: Stack offset (in bytes)
*
* This is the offset (in bytes) into the per-thread scratch space at which
* the current shader's stack starts. This is incremented by the calling
* shader prior to any shader call type instructions and gets decremented
* by the resume shader as part of completing the return operation.
*
*
* - hotzone.yzw: The launch ID associated with the current thread
*
* Inside a bindless shader, the only information we have is the DSS ID
* from the hardware EU and a per-DSS stack ID. In particular, the three-
* dimensional launch ID is lost the moment we leave the raygen trampoline.
*/
#define BRW_RT_SIZEOF_HOTZONE 16
/* From the BSpec "Address Computation for Memory Based Data Structures:
* Ray and TraversalStack (Async Ray Tracing)":
*
* sizeof(Ray) = 64B, sizeof(HitInfo) = 32B, sizeof(TravStack) = 32B.
*/
#define BRW_RT_SIZEOF_RAY 64
#define BRW_RT_SIZEOF_HIT_INFO 32
#define BRW_RT_SIZEOF_TRAV_STACK 32
/* From the BSpec:
*
* syncStackSize = (maxBVHLevels % 2 == 1) ?
* (sizeof(HitInfo) * 2 +
* (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels + 32B) :
* (sizeof(HitInfo) * 2 +
* (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels);
*
* The select is just to align to 64B.
*/
#define BRW_RT_SIZEOF_RAY_QUERY \
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
(BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
(BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))
#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
(BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)
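/* Worked numbers (added for illustration): with BRW_RT_MAX_BVH_LEVELS == 2
 * the odd-level padding term drops out, so both query sizes evaluate to
 * 2 * 32 + 2 * (64 + 32) = 256B, which is already 64B-aligned.
 */
_Static_assert(BRW_RT_SIZEOF_RAY_QUERY == 256, "2*32 + 2*(64+32)");
_Static_assert(BRW_RT_SIZEOF_SHADOW_RAY_QUERY == 256, "2*32 + 2*(64+32)");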
#define BRW_RT_SIZEOF_HW_STACK \
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
BRW_RT_SIZEOF_TRAV_STACK * BRW_RT_MAX_BVH_LEVELS)
/* This is a mesa-defined region for hit attribute data */
#define BRW_RT_SIZEOF_HIT_ATTRIB_DATA 64
#define BRW_RT_OFFSETOF_HIT_ATTRIB_DATA BRW_RT_SIZEOF_HW_STACK
#define BRW_RT_ASYNC_STACK_STRIDE \
ALIGN_POT(BRW_RT_OFFSETOF_HIT_ATTRIB_DATA + \
BRW_RT_SIZEOF_HIT_ATTRIB_DATA, 64)
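/* Worked numbers (added for illustration): 256B of HW stack plus 64B of hit
 * attribute data is 320B, already a multiple of 64, so the ALIGN_POT is a
 * no-op here.
 */
_Static_assert(BRW_RT_ASYNC_STACK_STRIDE == 320, "256 + 64, 64B-aligned");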
static inline void
brw_rt_compute_scratch_layout(struct brw_rt_scratch_layout *layout,
const struct intel_device_info *devinfo,
uint32_t stack_ids_per_dss,
uint32_t sw_stack_size)
{
layout->stack_ids_per_dss = stack_ids_per_dss;
const uint32_t dss_count = intel_device_info_dual_subslice_id_bound(devinfo);
const uint32_t num_stack_ids = dss_count * stack_ids_per_dss;
uint64_t size = 0;
/* The first thing in our scratch area is an array of "hot zones" which
* store the stack offset as well as the launch IDs for each active
* invocation.
*/
size += BRW_RT_SIZEOF_HOTZONE * num_stack_ids;
/* Next, we place the HW ray stacks */
assert(size % 64 == 0); /* Cache-line aligned */
assert(size < UINT32_MAX);
layout->ray_stack_start = size;
layout->ray_stack_stride = BRW_RT_ASYNC_STACK_STRIDE;
size += num_stack_ids * layout->ray_stack_stride;
/* Finally, we place the SW stacks for the individual ray-tracing shader
* invocations. We align these to 64B to ensure that we don't have any
* shared cache lines which could hurt performance.
*/
assert(size % 64 == 0);
layout->sw_stack_start = size;
layout->sw_stack_size = ALIGN(sw_stack_size, 64);
/* Currently it's always the case that sw_stack_size is a power of
* two, but power-of-two SW stack sizes are prone to causing
* collisions in the hashing function used by the L3 to map memory
* addresses to banks, which can cause stack accesses from most
* DSSes to bottleneck on a single L3 bank. Fix it by padding the
* SW stack by a single cacheline if it was a power of two.
*/
if (layout->sw_stack_size > 64 &&
util_is_power_of_two_nonzero(layout->sw_stack_size))
layout->sw_stack_size += 64;
size += num_stack_ids * layout->sw_stack_size;
layout->total_size = size;
}
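/* Usage sketch (illustrative; the parameter values are made up): size an RT
 * scratch buffer for 2048 stack IDs per DSS and an 8KB SW stack per
 * invocation.
 */
static inline uint64_t
brw_rt_example_scratch_size(const struct intel_device_info *devinfo)
{
   struct brw_rt_scratch_layout layout;
   brw_rt_compute_scratch_layout(&layout, devinfo,
                                 2048 /* stack_ids_per_dss */,
                                 8192 /* sw_stack_size */);
   return layout.total_size;
}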
static inline uint32_t
brw_rt_ray_queries_hw_stacks_size(const struct intel_device_info *devinfo)
{
/* The maximum slice/subslice/EU ID can be computed from max_scratch_ids,
 * which accounts for all threads.
 */
uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */
return max_eu_id * max_simd_size * BRW_RT_SIZEOF_RAY_QUERY;
}
static inline uint32_t
brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
{
/* The maximum slice/subslice/EU ID can be computed from max_scratch_ids,
 * which accounts for all threads.
 */
uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */
return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
}
static inline uint32_t
brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
uint32_t ray_queries)
{
/* Don't bother with a shadow stack if we only have a single query; in that
 * case we can write directly to the HW buffer.
 */
return (ray_queries > 1 ? ray_queries : 0) * brw_rt_ray_queries_shadow_stack_size(devinfo) +
ray_queries * 4; /* Ctrl + Level data */
}
#ifdef __cplusplus
}
#endif
#endif /* BRW_RT_H */

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -0,0 +1,196 @@
/*
* Copyright © 2010 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_SHADER_H
#define BRW_SHADER_H
#include <stdint.h>
#include "brw_cfg.h"
#include "brw_compiler.h"
#include "compiler/nir/nir.h"
#ifdef __cplusplus
#include "brw_ir_analysis.h"
#include "brw_ir_allocator.h"
enum instruction_scheduler_mode {
SCHEDULE_PRE,
SCHEDULE_PRE_NON_LIFO,
SCHEDULE_PRE_LIFO,
SCHEDULE_POST,
SCHEDULE_NONE,
};
#define UBO_START ((1 << 16) - 4)
struct backend_shader {
protected:
backend_shader(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const nir_shader *shader,
struct brw_stage_prog_data *stage_prog_data,
bool debug_enabled);
public:
virtual ~backend_shader();
const struct brw_compiler *compiler;
void *log_data; /* Passed to compiler->*_log functions */
const struct intel_device_info * const devinfo;
const nir_shader *nir;
struct brw_stage_prog_data * const stage_prog_data;
/** ralloc context for temporary data used during compile */
void *mem_ctx;
/**
* List of either fs_inst or vec4_instruction (inheriting from
* backend_instruction)
*/
exec_list instructions;
cfg_t *cfg;
brw_analysis<brw::idom_tree, backend_shader> idom_analysis;
gl_shader_stage stage;
bool debug_enabled;
brw::simple_allocator alloc;
virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const = 0;
virtual void dump_instructions_to_file(FILE *file) const;
/* Convenience functions based on the above. */
void dump_instruction(const backend_instruction *inst, FILE *file = stderr) const {
dump_instruction_to_file(inst, file);
}
void dump_instructions(const char *name = nullptr) const;
void calculate_cfg();
virtual void invalidate_analysis(brw::analysis_dependency_class c);
};
#else
struct backend_shader;
#endif /* __cplusplus */
enum brw_reg_type brw_type_for_base_type(const struct glsl_type *type);
uint32_t brw_math_function(enum opcode op);
const char *brw_instruction_name(const struct brw_isa_info *isa,
enum opcode op);
bool brw_saturate_immediate(enum brw_reg_type type, struct brw_reg *reg);
bool brw_negate_immediate(enum brw_reg_type type, struct brw_reg *reg);
bool brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg);
bool opt_predicated_break(struct backend_shader *s);
#ifdef __cplusplus
extern "C" {
#endif
/* brw_fs_reg_allocate.cpp */
void brw_fs_alloc_reg_sets(struct brw_compiler *compiler);
/* brw_vec4_reg_allocate.cpp */
void brw_vec4_alloc_reg_set(struct brw_compiler *compiler);
/* brw_disasm.c */
extern const char *const conditional_modifier[16];
extern const char *const pred_ctrl_align16[16];
/* Per-thread scratch space is a power-of-two multiple of 1KB. */
static inline unsigned
brw_get_scratch_size(int size)
{
return MAX2(1024, util_next_power_of_two(size));
}
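/* Examples (added for illustration): brw_get_scratch_size(5000) rounds up to
 * 8192, while brw_get_scratch_size(12) hits the 1KB floor and returns 1024.
 */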
static inline nir_variable_mode
brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
gl_shader_stage stage)
{
const struct intel_device_info *devinfo = compiler->devinfo;
const bool is_scalar = compiler->scalar_stage[stage];
nir_variable_mode indirect_mask = (nir_variable_mode) 0;
switch (stage) {
case MESA_SHADER_VERTEX:
case MESA_SHADER_FRAGMENT:
indirect_mask |= nir_var_shader_in;
break;
case MESA_SHADER_GEOMETRY:
if (!is_scalar)
indirect_mask |= nir_var_shader_in;
break;
default:
/* Everything else can handle indirect inputs */
break;
}
if (is_scalar && stage != MESA_SHADER_TESS_CTRL &&
stage != MESA_SHADER_TASK &&
stage != MESA_SHADER_MESH)
indirect_mask |= nir_var_shader_out;
/* On HSW+, we allow indirects in scalar shaders. They get implemented
* using nir_lower_vars_to_explicit_types and nir_lower_explicit_io in
* brw_postprocess_nir.
*
* We haven't plumbed through the indirect scratch messages on gfx6 or
* earlier so doing indirects via scratch doesn't work there. On gfx7 and
* earlier the scratch space size is limited to 12kB. If we allowed
* indirects as scratch all the time, we may easily exceed this limit
* without having any fallback.
*/
if (is_scalar && devinfo->verx10 <= 70)
indirect_mask |= nir_var_function_temp;
return indirect_mask;
}
bool brw_texture_offset(const nir_tex_instr *tex, unsigned src,
uint32_t *offset_bits);
/**
* Scratch data used when compiling a GLSL geometry shader.
*/
struct brw_gs_compile
{
struct brw_gs_prog_key key;
struct intel_vue_map input_vue_map;
unsigned control_data_bits_per_vertex;
unsigned control_data_header_size_bits;
};
#ifdef __cplusplus
}
#endif
#endif /* BRW_SHADER_H */

View file

@ -0,0 +1,268 @@
/*
* Copyright © 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_private.h"
#include "compiler/shader_info.h"
#include "intel/dev/intel_debug.h"
#include "intel/dev/intel_device_info.h"
#include "util/ralloc.h"
unsigned
brw_required_dispatch_width(const struct shader_info *info)
{
if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
assert(gl_shader_stage_uses_workgroup(info->stage));
/* These enum values are expressly chosen to be equal to the subgroup
* size that they require.
*/
return (unsigned)info->subgroup_size;
} else {
return 0;
}
}
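/* Added checks (illustrative): the comment above relies on the
 * SUBGROUP_SIZE_REQUIRE_* enum values being numerically equal to the widths
 * they require.
 */
static_assert((int)SUBGROUP_SIZE_REQUIRE_8 == 8);
static_assert((int)SUBGROUP_SIZE_REQUIRE_16 == 16);
static_assert((int)SUBGROUP_SIZE_REQUIRE_32 == 32);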
static inline bool
test_bit(unsigned mask, unsigned bit) {
return mask & (1u << bit);
}
namespace {
struct brw_cs_prog_data *
get_cs_prog_data(brw_simd_selection_state &state)
{
if (std::holds_alternative<struct brw_cs_prog_data *>(state.prog_data))
return std::get<struct brw_cs_prog_data *>(state.prog_data);
else
return nullptr;
}
struct brw_stage_prog_data *
get_prog_data(brw_simd_selection_state &state)
{
if (std::holds_alternative<struct brw_cs_prog_data *>(state.prog_data))
return &std::get<struct brw_cs_prog_data *>(state.prog_data)->base;
else if (std::holds_alternative<struct brw_bs_prog_data *>(state.prog_data))
return &std::get<struct brw_bs_prog_data *>(state.prog_data)->base;
else
return nullptr;
}
}
bool
brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd)
{
assert(simd < SIMD_COUNT);
assert(!state.compiled[simd]);
const auto cs_prog_data = get_cs_prog_data(state);
const auto prog_data = get_prog_data(state);
const unsigned width = 8u << simd;
/* For shaders with variable size workgroup, in most cases we can compile
* all the variants (exceptions are bindless dispatch & ray queries), since
* the choice will happen only at dispatch time.
*/
const bool workgroup_size_variable = cs_prog_data && cs_prog_data->local_size[0] == 0;
if (!workgroup_size_variable) {
if (state.spilled[simd]) {
state.error[simd] = "Would spill";
return false;
}
if (state.required_width && state.required_width != width) {
state.error[simd] = "Different than required dispatch width";
return false;
}
if (cs_prog_data) {
const unsigned workgroup_size = cs_prog_data->local_size[0] *
cs_prog_data->local_size[1] *
cs_prog_data->local_size[2];
unsigned max_threads = state.devinfo->max_cs_workgroup_threads;
const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0;
if (simd > min_simd && state.compiled[simd - 1] &&
workgroup_size <= (width / 2)) {
state.error[simd] = "Workgroup size already fits in smaller SIMD";
return false;
}
if (DIV_ROUND_UP(workgroup_size, width) > max_threads) {
state.error[simd] = "Would need more than max_threads to fit all invocations";
return false;
}
}
/* SIMD32 is only enabled when it is actually needed, unless forced.
*
* TODO: Use performance_analysis and drop this rule.
*/
if (width == 32 && state.devinfo->ver < 20) {
if (!INTEL_DEBUG(DEBUG_DO32) && (state.compiled[0] || state.compiled[1])) {
state.error[simd] = "SIMD32 not required (use INTEL_DEBUG=do32 to force)";
return false;
}
}
}
if (width == 8 && state.devinfo->ver >= 20) {
state.error[simd] = "SIMD8 not supported on Xe2+";
return false;
}
if (width == 32 && cs_prog_data && cs_prog_data->base.ray_queries > 0) {
state.error[simd] = "Ray queries not supported";
return false;
}
if (width == 32 && cs_prog_data && cs_prog_data->uses_btd_stack_ids) {
state.error[simd] = "Bindless shader calls not supported";
return false;
}
uint64_t start;
switch (prog_data->stage) {
case MESA_SHADER_COMPUTE:
start = DEBUG_CS_SIMD8;
break;
case MESA_SHADER_TASK:
start = DEBUG_TS_SIMD8;
break;
case MESA_SHADER_MESH:
start = DEBUG_MS_SIMD8;
break;
case MESA_SHADER_RAYGEN:
case MESA_SHADER_ANY_HIT:
case MESA_SHADER_CLOSEST_HIT:
case MESA_SHADER_MISS:
case MESA_SHADER_INTERSECTION:
case MESA_SHADER_CALLABLE:
start = DEBUG_RT_SIMD8;
break;
default:
unreachable("unknown shader stage in brw_simd_should_compile");
}
const bool env_skip[] = {
(intel_simd & (start << 0)) == 0,
(intel_simd & (start << 1)) == 0,
(intel_simd & (start << 2)) == 0,
};
static_assert(ARRAY_SIZE(env_skip) == SIMD_COUNT);
if (unlikely(env_skip[simd])) {
state.error[simd] = "Disabled by INTEL_DEBUG environment variable";
return false;
}
return true;
}
void
brw_simd_mark_compiled(brw_simd_selection_state &state, unsigned simd, bool spilled)
{
assert(simd < SIMD_COUNT);
assert(!state.compiled[simd]);
auto cs_prog_data = get_cs_prog_data(state);
state.compiled[simd] = true;
if (cs_prog_data)
cs_prog_data->prog_mask |= 1u << simd;
/* If a SIMD spilled, all the larger ones would spill too. */
if (spilled) {
for (unsigned i = simd; i < SIMD_COUNT; i++) {
state.spilled[i] = true;
if (cs_prog_data)
cs_prog_data->prog_spilled |= 1u << i;
}
}
}
int
brw_simd_select(const struct brw_simd_selection_state &state)
{
for (int i = SIMD_COUNT - 1; i >= 0; i--) {
if (state.compiled[i] && !state.spilled[i])
return i;
}
for (int i = SIMD_COUNT - 1; i >= 0; i--) {
if (state.compiled[i])
return i;
}
return -1;
}
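/* Usage sketch (illustrative only; not part of the original file): the
 * compile loop this API is built around -- test each width, compile, mark
 * the result, then pick the widest viable variant. The compile step itself
 * is elided.
 */
static inline int
example_compile_and_select(brw_simd_selection_state &state)
{
   for (unsigned simd = 0; simd < SIMD_COUNT; simd++) {
      if (brw_simd_should_compile(state, simd)) {
         const bool spilled = false; /* would come from the real compile */
         brw_simd_mark_compiled(state, simd, spilled);
      }
   }
   return brw_simd_select(state);
}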
int
brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
const struct brw_cs_prog_data *prog_data,
const unsigned *sizes)
{
if (!sizes || (prog_data->local_size[0] == sizes[0] &&
prog_data->local_size[1] == sizes[1] &&
prog_data->local_size[2] == sizes[2])) {
brw_simd_selection_state simd_state{
.prog_data = const_cast<struct brw_cs_prog_data *>(prog_data),
};
/* Propagate the prog_data information back to the simd_state,
* so we can use select() directly.
*/
for (int i = 0; i < SIMD_COUNT; i++) {
simd_state.compiled[i] = test_bit(prog_data->prog_mask, i);
simd_state.spilled[i] = test_bit(prog_data->prog_spilled, i);
}
return brw_simd_select(simd_state);
}
struct brw_cs_prog_data cloned = *prog_data;
for (unsigned i = 0; i < 3; i++)
cloned.local_size[i] = sizes[i];
cloned.prog_mask = 0;
cloned.prog_spilled = 0;
brw_simd_selection_state simd_state{
.devinfo = devinfo,
.prog_data = &cloned,
};
for (unsigned simd = 0; simd < SIMD_COUNT; simd++) {
/* We are not recompiling, so use original results of prog_mask and
* prog_spilled as they will already contain all possible compilations.
*/
if (brw_simd_should_compile(simd_state, simd) &&
test_bit(prog_data->prog_mask, simd)) {
brw_simd_mark_compiled(simd_state, simd, test_bit(prog_data->prog_spilled, simd));
}
}
return brw_simd_select(simd_state);
}
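/* Usage sketch (illustrative; 64x1x1 is a made-up dispatch size): at dispatch
 * time, a driver with a variable-workgroup-size shader picks the variant for
 * the actual dimensions like this.
 */
static inline int
example_select_for_dispatch(const struct intel_device_info *devinfo,
                            const struct brw_cs_prog_data *prog_data)
{
   const unsigned sizes[3] = { 64, 1, 1 };
   return brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
}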

Some files were not shown because too many files have changed in this diff