diff --git a/src/intel/compiler/elk/brw_asm.h b/src/intel/compiler/elk/brw_asm.h new file mode 100644 index 00000000000..d6d9ce47b03 --- /dev/null +++ b/src/intel/compiler/elk/brw_asm.h @@ -0,0 +1,122 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef BRW_ASM_H +#define BRW_ASM_H + +#include +#include +#include + +#include "compiler/brw_reg.h" +#include "compiler/brw_reg_type.h" +#include "compiler/brw_eu_defines.h" +#include "compiler/brw_inst.h" +#include "compiler/brw_eu.h" +#include "dev/intel_device_info.h" +#include "util/list.h" + +/* glibc < 2.27 defines OVERFLOW in /usr/include/math.h. 
*/ +#undef OVERFLOW + +int yyparse(void); +int yylex(void); +char *lex_text(void); + +extern struct brw_codegen *p; +extern int errors; +extern char *input_filename; + +extern struct list_head instr_labels; +extern struct list_head target_labels; + +struct condition { + unsigned cond_modifier:4; + unsigned flag_reg_nr:1; + unsigned flag_subreg_nr:1; +}; + +struct predicate { + unsigned pred_control:4; + unsigned pred_inv:1; + unsigned flag_reg_nr:1; + unsigned flag_subreg_nr:1; +}; + +enum instoption_type { + INSTOPTION_FLAG, + INSTOPTION_DEP_INFO, +}; + +struct instoption { + enum instoption_type type; + union { + unsigned uint_value; + struct tgl_swsb depinfo_value; + }; +}; + +struct options { + unsigned access_mode:1; + unsigned compression_control:2; + unsigned thread_control:2; + unsigned no_dd_check:1; // Dependency control + unsigned no_dd_clear:1; // Dependency control + unsigned mask_control:1; + unsigned debug_control:1; + unsigned acc_wr_control:1; + unsigned end_of_thread:1; + unsigned compaction:1; + unsigned qtr_ctrl:2; + unsigned nib_ctrl:1; + unsigned is_compr:1; + struct tgl_swsb depinfo; +}; + +struct msgdesc { + unsigned ex_bso:1; + unsigned src1_len:5; +}; + +enum instr_label_type { + INSTR_LABEL_JIP, + INSTR_LABEL_UIP, +}; + +struct instr_label { + struct list_head link; + + char *name; + int offset; + enum instr_label_type type; +}; + +struct target_label { + struct list_head link; + + char *name; + int offset; +}; + +#endif /* BRW_ASM_H */ diff --git a/src/intel/compiler/elk/brw_asm_tool.c b/src/intel/compiler/elk/brw_asm_tool.c new file mode 100644 index 00000000000..6e4a5fce52d --- /dev/null +++ b/src/intel/compiler/elk/brw_asm_tool.c @@ -0,0 +1,385 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the 
rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include "brw_asm.h" +#include "intel/compiler/brw_disasm_info.h" + +enum opt_output_type { + OPT_OUTPUT_HEX, + OPT_OUTPUT_C_LITERAL, + OPT_OUTPUT_BIN, +}; + +extern FILE *yyin; +struct brw_codegen *p; +static enum opt_output_type output_type = OPT_OUTPUT_BIN; +char *input_filename = NULL; +int errors; + +struct list_head instr_labels; +struct list_head target_labels; + +static void +print_help(const char *progname, FILE *file) +{ + fprintf(file, + "Usage: %s [OPTION] inputfile\n" + "Assemble i965 instructions from input file.\n\n" + " -h, --help display this help and exit\n" + " -t, --type=OUTPUT_TYPE OUTPUT_TYPE can be 'bin' (default if omitted),\n" + " 'c_literal', or 'hex'\n" + " -o, --output specify output file\n" + " --compact print compacted instructions\n" + " -g, --gen=platform assemble instructions for given \n" + " platform (3 letter platform name)\n" + "Example:\n" + " i965_asm -g kbl input.asm -t hex -o output\n", + progname); +} + +static uint32_t +get_dword(const brw_inst *inst, int idx) +{ + uint32_t dword; + memcpy(&dword, (char *)inst + 4 * idx, 
sizeof(dword)); + return dword; +} + +static void +print_instruction(FILE *output, bool compact, const brw_inst *instruction) +{ + int byte_limit; + + byte_limit = (compact == true) ? 8 : 16; + + switch (output_type) { + case OPT_OUTPUT_HEX: { + fprintf(output, "%02x", ((unsigned char *)instruction)[0]); + + for (unsigned i = 1; i < byte_limit; i++) { + fprintf(output, " %02x", ((unsigned char *)instruction)[i]); + } + break; + } + case OPT_OUTPUT_C_LITERAL: { + fprintf(output, "\t0x%08x,", get_dword(instruction, 0)); + + for (unsigned i = 1; i < byte_limit / 4; i++) + fprintf(output, " 0x%08x,", get_dword(instruction, i)); + + break; + } + case OPT_OUTPUT_BIN: + fwrite(instruction, 1, byte_limit, output); + break; + } + + if (output_type != OPT_OUTPUT_BIN) { + fprintf(output, "\n"); + } +} + +static struct intel_device_info * +i965_disasm_init(uint16_t pci_id) +{ + struct intel_device_info *devinfo; + + devinfo = malloc(sizeof *devinfo); + if (devinfo == NULL) + return NULL; + + if (!intel_get_device_info_from_pci_id(pci_id, devinfo)) { + fprintf(stderr, "can't find device information: pci_id=0x%x\n", + pci_id); + free(devinfo); + return NULL; + } + + return devinfo; +} + +static bool +i965_postprocess_labels() +{ + if (p->devinfo->ver < 6) { + return true; + } + + void *store = p->store; + + struct target_label *tlabel; + struct instr_label *ilabel, *s; + + const unsigned to_bytes_scale = brw_jump_scale(p->devinfo); + + LIST_FOR_EACH_ENTRY(tlabel, &target_labels, link) { + LIST_FOR_EACH_ENTRY_SAFE(ilabel, s, &instr_labels, link) { + if (!strcmp(tlabel->name, ilabel->name)) { + brw_inst *inst = store + ilabel->offset; + + int relative_offset = (tlabel->offset - ilabel->offset) / sizeof(brw_inst); + relative_offset *= to_bytes_scale; + + unsigned opcode = brw_inst_opcode(p->isa, inst); + + if (ilabel->type == INSTR_LABEL_JIP) { + switch (opcode) { + case BRW_OPCODE_IF: + case BRW_OPCODE_ELSE: + case BRW_OPCODE_ENDIF: + case BRW_OPCODE_WHILE: + if (p->devinfo->ver 
>= 7) { + brw_inst_set_jip(p->devinfo, inst, relative_offset); + } else if (p->devinfo->ver == 6) { + brw_inst_set_gfx6_jump_count(p->devinfo, inst, relative_offset); + } + break; + case BRW_OPCODE_BREAK: + case BRW_OPCODE_HALT: + case BRW_OPCODE_CONTINUE: + brw_inst_set_jip(p->devinfo, inst, relative_offset); + break; + default: + fprintf(stderr, "Unknown opcode %d with JIP label\n", opcode); + return false; + } + } else { + switch (opcode) { + case BRW_OPCODE_IF: + case BRW_OPCODE_ELSE: + if (p->devinfo->ver > 7) { + brw_inst_set_uip(p->devinfo, inst, relative_offset); + } else if (p->devinfo->ver == 7) { + brw_inst_set_uip(p->devinfo, inst, relative_offset); + } else if (p->devinfo->ver == 6) { + // Nothing + } + break; + case BRW_OPCODE_WHILE: + case BRW_OPCODE_ENDIF: + fprintf(stderr, "WHILE/ENDIF cannot have UIP offset\n"); + return false; + case BRW_OPCODE_BREAK: + case BRW_OPCODE_CONTINUE: + case BRW_OPCODE_HALT: + brw_inst_set_uip(p->devinfo, inst, relative_offset); + break; + default: + fprintf(stderr, "Unknown opcode %d with UIP label\n", opcode); + return false; + } + } + + list_del(&ilabel->link); + } + } + } + + LIST_FOR_EACH_ENTRY(ilabel, &instr_labels, link) { + fprintf(stderr, "Unknown label '%s'\n", ilabel->name); + } + + return list_is_empty(&instr_labels); +} + +int main(int argc, char **argv) +{ + char *output_file = NULL; + char c; + FILE *output = stdout; + bool help = false, compact = false; + void *store; + uint64_t pci_id = 0; + int offset = 0, err; + int start_offset = 0; + struct disasm_info *disasm_info; + struct intel_device_info *devinfo = NULL; + int result = EXIT_FAILURE; + list_inithead(&instr_labels); + list_inithead(&target_labels); + + const struct option i965_asm_opts[] = { + { "help", no_argument, (int *) &help, true }, + { "type", required_argument, NULL, 't' }, + { "gen", required_argument, NULL, 'g' }, + { "output", required_argument, NULL, 'o' }, + { "compact", no_argument, (int *) &compact, true }, + { NULL, 0, NULL, 0 } 
+ }; + + while ((c = getopt_long(argc, argv, ":t:g:o:h", i965_asm_opts, NULL)) != -1) { + switch (c) { + case 'g': { + const int id = intel_device_name_to_pci_device_id(optarg); + if (id < 0) { + fprintf(stderr, "can't parse gen: '%s', expected 3 letter " + "platform name\n", optarg); + goto end; + } else { + pci_id = id; + } + break; + } + case 'h': + help = true; + print_help(argv[0], stderr); + goto end; + case 't': { + if (strcmp(optarg, "hex") == 0) { + output_type = OPT_OUTPUT_HEX; + } else if (strcmp(optarg, "c_literal") == 0) { + output_type = OPT_OUTPUT_C_LITERAL; + } else if (strcmp(optarg, "bin") == 0) { + output_type = OPT_OUTPUT_BIN; + } else { + fprintf(stderr, "invalid value for --type: %s\n", optarg); + goto end; + } + break; + } + case 'o': + output_file = strdup(optarg); + break; + case 0: + break; + case ':': + fprintf(stderr, "%s: option `-%c' requires an argument\n", + argv[0], optopt); + goto end; + case '?': + default: + fprintf(stderr, "%s: option `-%c' is invalid: ignored\n", + argv[0], optopt); + goto end; + } + } + + if (help || !pci_id) { + print_help(argv[0], stderr); + goto end; + } + + if (!argv[optind]) { + fprintf(stderr, "Please specify input file\n"); + goto end; + } + + input_filename = strdup(argv[optind]); + yyin = fopen(input_filename, "r"); + if (!yyin) { + fprintf(stderr, "Unable to read input file : %s\n", + input_filename); + goto end; + } + + if (output_file) { + output = fopen(output_file, "w"); + if (!output) { + fprintf(stderr, "Couldn't open output file\n"); + goto end; + } + } + + devinfo = i965_disasm_init(pci_id); + if (!devinfo) { + fprintf(stderr, "Unable to allocate memory for " + "intel_device_info struct instance.\n"); + goto end; + } + + struct brw_isa_info isa; + brw_init_isa_info(&isa, devinfo); + + p = rzalloc(NULL, struct brw_codegen); + brw_init_codegen(&isa, p, p); + p->automatic_exec_sizes = false; + + err = yyparse(); + if (err || errors) + goto end; + + if (!i965_postprocess_labels()) + goto end; + + 
store = p->store; + + disasm_info = disasm_initialize(p->isa, NULL); + if (!disasm_info) { + fprintf(stderr, "Unable to initialize disasm_info struct instance\n"); + goto end; + } + + if (output_type == OPT_OUTPUT_C_LITERAL) + fprintf(output, "{\n"); + + brw_validate_instructions(p->isa, p->store, 0, + p->next_insn_offset, disasm_info); + + const int nr_insn = (p->next_insn_offset - start_offset) / 16; + + if (compact) + brw_compact_instructions(p, start_offset, disasm_info); + + for (int i = 0; i < nr_insn; i++) { + const brw_inst *insn = store + offset; + bool compacted = false; + + if (compact && brw_inst_cmpt_control(p->devinfo, insn)) { + offset += 8; + compacted = true; + } else { + offset += 16; + } + + print_instruction(output, compacted, insn); + } + + ralloc_free(disasm_info); + + if (output_type == OPT_OUTPUT_C_LITERAL) + fprintf(output, "}"); + + result = EXIT_SUCCESS; + goto end; + +end: + free(input_filename); + free(output_file); + + if (yyin) + fclose(yyin); + + if (output) + fclose(output); + + if (p) + ralloc_free(p); + + if (devinfo) + free(devinfo); + + exit(result); +} diff --git a/src/intel/compiler/elk/brw_cfg.cpp b/src/intel/compiler/elk/brw_cfg.cpp new file mode 100644 index 00000000000..01cb42635c1 --- /dev/null +++ b/src/intel/compiler/elk/brw_cfg.cpp @@ -0,0 +1,833 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt + * + */ + +#include "brw_cfg.h" +#include "util/u_dynarray.h" +#include "brw_shader.h" + +/** @file brw_cfg.cpp + * + * Walks the shader instructions generated and creates a set of basic + * blocks with successor/predecessor edges connecting them. + */ + +using namespace brw; + +static bblock_t * +pop_stack(exec_list *list) +{ + bblock_link *link = (bblock_link *)list->get_tail(); + bblock_t *block = link->block; + link->link.remove(); + + return block; +} + +static exec_node * +link(void *mem_ctx, bblock_t *block, enum bblock_link_kind kind) +{ + bblock_link *l = new(mem_ctx) bblock_link(block, kind); + return &l->link; +} + +void +push_stack(exec_list *list, void *mem_ctx, bblock_t *block) +{ + /* The kind of the link is immaterial, but we need to provide one since + * this is (ab)using the edge data structure in order to implement a stack. 
+ */ + list->push_tail(link(mem_ctx, block, bblock_link_logical)); +} + +bblock_t::bblock_t(cfg_t *cfg) : + cfg(cfg), start_ip(0), end_ip(0), end_ip_delta(0), num(0) +{ + instructions.make_empty(); + parents.make_empty(); + children.make_empty(); +} + +void +bblock_t::add_successor(void *mem_ctx, bblock_t *successor, + enum bblock_link_kind kind) +{ + successor->parents.push_tail(::link(mem_ctx, this, kind)); + children.push_tail(::link(mem_ctx, successor, kind)); +} + +bool +bblock_t::is_predecessor_of(const bblock_t *block, + enum bblock_link_kind kind) const +{ + foreach_list_typed_safe (bblock_link, parent, link, &block->parents) { + if (parent->block == this && parent->kind <= kind) { + return true; + } + } + + return false; +} + +bool +bblock_t::is_successor_of(const bblock_t *block, + enum bblock_link_kind kind) const +{ + foreach_list_typed_safe (bblock_link, child, link, &block->children) { + if (child->block == this && child->kind <= kind) { + return true; + } + } + + return false; +} + +static bool +ends_block(const backend_instruction *inst) +{ + enum opcode op = inst->opcode; + + return op == BRW_OPCODE_IF || + op == BRW_OPCODE_ELSE || + op == BRW_OPCODE_CONTINUE || + op == BRW_OPCODE_BREAK || + op == BRW_OPCODE_DO || + op == BRW_OPCODE_WHILE; +} + +static bool +starts_block(const backend_instruction *inst) +{ + enum opcode op = inst->opcode; + + return op == BRW_OPCODE_DO || + op == BRW_OPCODE_ENDIF; +} + +bool +bblock_t::can_combine_with(const bblock_t *that) const +{ + if ((const bblock_t *)this->link.next != that) + return false; + + if (ends_block(this->end()) || + starts_block(that->start())) + return false; + + return true; +} + +void +bblock_t::combine_with(bblock_t *that) +{ + assert(this->can_combine_with(that)); + foreach_list_typed (bblock_link, link, link, &that->parents) { + assert(link->block == this); + } + + this->end_ip = that->end_ip; + this->instructions.append_list(&that->instructions); + + this->cfg->remove_block(that); +} + +void 
+bblock_t::dump(FILE *file) const +{ + const backend_shader *s = this->cfg->s; + + int ip = this->start_ip; + foreach_inst_in_block(backend_instruction, inst, this) { + fprintf(file, "%5d: ", ip); + s->dump_instruction(inst, file); + ip++; + } +} + +void +bblock_t::unlink_list(exec_list *list) +{ + assert(list == &parents || list == &children); + const bool remove_parent = list == &children; + + foreach_list_typed_safe(bblock_link, link, link, list) { + /* Also break the links from the other block back to this block. */ + exec_list *sub_list = remove_parent ? &link->block->parents : &link->block->children; + + foreach_list_typed_safe(bblock_link, sub_link, link, sub_list) { + if (sub_link->block == this) { + sub_link->link.remove(); + ralloc_free(sub_link); + } + } + + link->link.remove(); + ralloc_free(link); + } +} + +cfg_t::cfg_t(const backend_shader *s, exec_list *instructions) : + s(s) +{ + mem_ctx = ralloc_context(NULL); + block_list.make_empty(); + blocks = NULL; + num_blocks = 0; + + bblock_t *cur = NULL; + int ip = 0; + + bblock_t *entry = new_block(); + bblock_t *cur_if = NULL; /**< BB ending with IF. */ + bblock_t *cur_else = NULL; /**< BB ending with ELSE. */ + bblock_t *cur_do = NULL; /**< BB starting with DO. */ + bblock_t *cur_while = NULL; /**< BB immediately following WHILE. */ + exec_list if_stack, else_stack, do_stack, while_stack; + bblock_t *next; + + set_next_block(&cur, entry, ip); + + foreach_in_list_safe(backend_instruction, inst, instructions) { + /* set_next_block wants the post-incremented ip */ + ip++; + + inst->exec_node::remove(); + + switch (inst->opcode) { + case BRW_OPCODE_IF: + cur->instructions.push_tail(inst); + + /* Push our information onto a stack so we can recover from + * nested ifs. + */ + push_stack(&if_stack, mem_ctx, cur_if); + push_stack(&else_stack, mem_ctx, cur_else); + + cur_if = cur; + cur_else = NULL; + + /* Set up our immediately following block, full of "then" + * instructions. 
+ */ + next = new_block(); + cur_if->add_successor(mem_ctx, next, bblock_link_logical); + + set_next_block(&cur, next, ip); + break; + + case BRW_OPCODE_ELSE: + cur->instructions.push_tail(inst); + + cur_else = cur; + + next = new_block(); + assert(cur_if != NULL); + cur_if->add_successor(mem_ctx, next, bblock_link_logical); + cur_else->add_successor(mem_ctx, next, bblock_link_physical); + + set_next_block(&cur, next, ip); + break; + + case BRW_OPCODE_ENDIF: { + bblock_t *cur_endif; + + if (cur->instructions.is_empty()) { + /* New block was just created; use it. */ + cur_endif = cur; + } else { + cur_endif = new_block(); + + cur->add_successor(mem_ctx, cur_endif, bblock_link_logical); + + set_next_block(&cur, cur_endif, ip - 1); + } + + cur->instructions.push_tail(inst); + + if (cur_else) { + cur_else->add_successor(mem_ctx, cur_endif, bblock_link_logical); + } else { + assert(cur_if != NULL); + cur_if->add_successor(mem_ctx, cur_endif, bblock_link_logical); + } + + assert(cur_if->end()->opcode == BRW_OPCODE_IF); + assert(!cur_else || cur_else->end()->opcode == BRW_OPCODE_ELSE); + + /* Pop the stack so we're in the previous if/else/endif */ + cur_if = pop_stack(&if_stack); + cur_else = pop_stack(&else_stack); + break; + } + case BRW_OPCODE_DO: + /* Push our information onto a stack so we can recover from + * nested loops. + */ + push_stack(&do_stack, mem_ctx, cur_do); + push_stack(&while_stack, mem_ctx, cur_while); + + /* Set up the block just after the while. Don't know when exactly + * it will start, yet. + */ + cur_while = new_block(); + + if (cur->instructions.is_empty()) { + /* New block was just created; use it. 
*/ + cur_do = cur; + } else { + cur_do = new_block(); + + cur->add_successor(mem_ctx, cur_do, bblock_link_logical); + + set_next_block(&cur, cur_do, ip - 1); + } + + cur->instructions.push_tail(inst); + + /* Represent divergent execution of the loop as a pair of alternative + * edges coming out of the DO instruction: For any physical iteration + * of the loop a given logical thread can either start off enabled + * (which is represented as the "next" successor), or disabled (if it + * has reached a non-uniform exit of the loop during a previous + * iteration, which is represented as the "cur_while" successor). + * + * The disabled edge will be taken by the logical thread anytime we + * arrive at the DO instruction through a back-edge coming from a + * conditional exit of the loop where divergent control flow started. + * + * This guarantees that there is a control-flow path from any + * divergence point of the loop into the convergence point + * (immediately past the WHILE instruction) such that it overlaps the + * whole IP region of divergent control flow (potentially the whole + * loop) *and* doesn't imply the execution of any instructions part + * of the loop (since the corresponding execution mask bit will be + * disabled for a diverging thread). + * + * This way we make sure that any variables that are live throughout + * the region of divergence for an inactive logical thread are also + * considered to interfere with any other variables assigned by + * active logical threads within the same physical region of the + * program, since otherwise we would risk cross-channel data + * corruption. 
+ */ + next = new_block(); + cur->add_successor(mem_ctx, next, bblock_link_logical); + cur->add_successor(mem_ctx, cur_while, bblock_link_physical); + set_next_block(&cur, next, ip); + break; + + case BRW_OPCODE_CONTINUE: + cur->instructions.push_tail(inst); + + /* A conditional CONTINUE may start a region of divergent control + * flow until the start of the next loop iteration (*not* until the + * end of the loop which is why the successor is not the top-level + * divergence point at cur_do). The live interval of any variable + * extending through a CONTINUE edge is guaranteed to overlap the + * whole region of divergent execution, because any variable live-out + * at the CONTINUE instruction will also be live-in at the top of the + * loop, and therefore also live-out at the bottom-most point of the + * loop which is reachable from the top (since a control flow path + * exists from a definition of the variable through this CONTINUE + * instruction, the top of the loop, the (reachable) bottom of the + * loop, the top of the loop again, into a use of the variable). + */ + assert(cur_do != NULL); + cur->add_successor(mem_ctx, cur_do->next(), bblock_link_logical); + + next = new_block(); + if (inst->predicate) + cur->add_successor(mem_ctx, next, bblock_link_logical); + else + cur->add_successor(mem_ctx, next, bblock_link_physical); + + set_next_block(&cur, next, ip); + break; + + case BRW_OPCODE_BREAK: + cur->instructions.push_tail(inst); + + /* A conditional BREAK instruction may start a region of divergent + * control flow until the end of the loop if the condition is + * non-uniform, in which case the loop will execute additional + * iterations with the present channel disabled. We model this as a + * control flow path from the divergence point to the convergence + * point that overlaps the whole IP range of the loop and skips over + * the execution of any other instructions part of the loop. + * + * See the DO case for additional explanation. 
+ */ + assert(cur_do != NULL); + cur->add_successor(mem_ctx, cur_do, bblock_link_physical); + cur->add_successor(mem_ctx, cur_while, bblock_link_logical); + + next = new_block(); + if (inst->predicate) + cur->add_successor(mem_ctx, next, bblock_link_logical); + else + cur->add_successor(mem_ctx, next, bblock_link_physical); + + set_next_block(&cur, next, ip); + break; + + case BRW_OPCODE_WHILE: + cur->instructions.push_tail(inst); + + assert(cur_do != NULL && cur_while != NULL); + + /* A conditional WHILE instruction may start a region of divergent + * control flow until the end of the loop, just like the BREAK + * instruction. See the BREAK case for more details. OTOH an + * unconditional WHILE instruction is non-divergent (just like an + * unconditional CONTINUE), and will necessarily lead to the + * execution of an additional iteration of the loop for all enabled + * channels, so we may skip over the divergence point at the top of + * the loop to keep the CFG as unambiguous as possible. + */ + if (inst->predicate) { + cur->add_successor(mem_ctx, cur_do, bblock_link_logical); + } else { + cur->add_successor(mem_ctx, cur_do->next(), bblock_link_logical); + } + + set_next_block(&cur, cur_while, ip); + + /* Pop the stack so we're in the previous loop */ + cur_do = pop_stack(&do_stack); + cur_while = pop_stack(&while_stack); + break; + + default: + cur->instructions.push_tail(inst); + break; + } + } + + cur->end_ip = ip - 1; + + make_block_array(); +} + +cfg_t::~cfg_t() +{ + ralloc_free(mem_ctx); +} + +void +cfg_t::remove_block(bblock_t *block) +{ + foreach_list_typed_safe (bblock_link, predecessor, link, &block->parents) { + /* cfg_t::validate checks that predecessor and successor lists are well + * formed, so it is known that the loop here would find exactly one + * block. Set old_link_kind to silence "variable used but not set" + * warnings. 
+ */ + bblock_link_kind old_link_kind = bblock_link_logical; + + /* Remove block from all of its predecessors' successor lists. */ + foreach_list_typed_safe (bblock_link, successor, link, + &predecessor->block->children) { + if (block == successor->block) { + old_link_kind = successor->kind; + successor->link.remove(); + ralloc_free(successor); + break; + } + } + + /* Add removed-block's successors to its predecessors' successor lists. */ + foreach_list_typed (bblock_link, successor, link, &block->children) { + bool need_to_link = true; + bblock_link_kind new_link_kind = MAX2(old_link_kind, successor->kind); + + foreach_list_typed_safe (bblock_link, child, link, &predecessor->block->children) { + /* There is already a link between the two blocks. If the links + * are the same kind or the link is logical, do nothing. If the + * existing link is physical and the proposed new link is logical, + * promote the existing link to logical. + * + * This is accomplished by taking the minimum of the existing link + * kind and the proposed link kind. + */ + if (child->block == successor->block) { + child->kind = MIN2(child->kind, new_link_kind); + need_to_link = false; + break; + } + } + + if (need_to_link) { + predecessor->block->children.push_tail(link(mem_ctx, + successor->block, + new_link_kind)); + } + } + } + + foreach_list_typed_safe (bblock_link, successor, link, &block->children) { + /* cfg_t::validate checks that predecessor and successor lists are well + * formed, so it is known that the loop here would find exactly one + * block. Set old_link_kind to silence "variable used but not set" + * warnings. + */ + bblock_link_kind old_link_kind = bblock_link_logical; + + /* Remove block from all of its childrens' parents lists. 
*/ + foreach_list_typed_safe (bblock_link, predecessor, link, + &successor->block->parents) { + if (block == predecessor->block) { + old_link_kind = predecessor->kind; + predecessor->link.remove(); + ralloc_free(predecessor); + } + } + + /* Add removed-block's predecessors to its successors' predecessor lists. */ + foreach_list_typed (bblock_link, predecessor, link, &block->parents) { + bool need_to_link = true; + bblock_link_kind new_link_kind = MAX2(old_link_kind, predecessor->kind); + + foreach_list_typed_safe (bblock_link, parent, link, &successor->block->parents) { + /* There is already a link between the two blocks. If the links + * are the same kind or the link is logical, do nothing. If the + * existing link is physical and the proposed new link is logical, + * promote the existing link to logical. + * + * This is accomplished by taking the minimum of the existing link + * kind and the proposed link kind. + */ + if (parent->block == predecessor->block) { + parent->kind = MIN2(parent->kind, new_link_kind); + need_to_link = false; + break; + } + } + + if (need_to_link) { + successor->block->parents.push_tail(link(mem_ctx, + predecessor->block, + new_link_kind)); + } + } + } + + block->link.remove(); + + for (int b = block->num; b < this->num_blocks - 1; b++) { + this->blocks[b] = this->blocks[b + 1]; + this->blocks[b]->num = b; + } + + this->blocks[this->num_blocks - 1]->num = this->num_blocks - 2; + this->num_blocks--; +} + +bblock_t * +cfg_t::new_block() +{ + bblock_t *block = new(mem_ctx) bblock_t(this); + + return block; +} + +void +cfg_t::set_next_block(bblock_t **cur, bblock_t *block, int ip) +{ + if (*cur) { + (*cur)->end_ip = ip - 1; + } + + block->start_ip = ip; + block->num = num_blocks++; + block_list.push_tail(&block->link); + *cur = block; +} + +void +cfg_t::make_block_array() +{ + blocks = ralloc_array(mem_ctx, bblock_t *, num_blocks); + + int i = 0; + foreach_block (block, this) { + blocks[i++] = block; + } + assert(i == num_blocks); +} + 
+namespace { + +struct link_desc { + char kind; + int num; +}; + +int +compare_link_desc(const void *a, const void *b) +{ + const link_desc *la = (const link_desc *)a; + const link_desc *lb = (const link_desc *)b; + + return la->num < lb->num ? -1 : + la->num > lb->num ? +1 : + la->kind < lb->kind ? -1 : + la->kind > lb->kind ? +1 : + 0; +} + +void +sort_links(util_dynarray *scratch, exec_list *list) +{ + util_dynarray_clear(scratch); + foreach_list_typed(bblock_link, link, link, list) { + link_desc l; + l.kind = link->kind == bblock_link_logical ? '-' : '~'; + l.num = link->block->num; + util_dynarray_append(scratch, link_desc, l); + } + qsort(scratch->data, util_dynarray_num_elements(scratch, link_desc), + sizeof(link_desc), compare_link_desc); +} + +} /* namespace */ + +void +cfg_t::dump(FILE *file) +{ + const idom_tree *idom = (s ? &s->idom_analysis.require() : NULL); + + /* Temporary storage to sort the lists of blocks. This normalizes the + * output, making it possible to use it for certain tests. + */ + util_dynarray scratch; + util_dynarray_init(&scratch, NULL); + + foreach_block (block, this) { + if (idom && idom->parent(block)) + fprintf(file, "START B%d IDOM(B%d)", block->num, + idom->parent(block)->num); + else + fprintf(file, "START B%d IDOM(none)", block->num); + + sort_links(&scratch, &block->parents); + util_dynarray_foreach(&scratch, link_desc, l) + fprintf(file, " <%cB%d", l->kind, l->num); + fprintf(file, "\n"); + + if (s != NULL) + block->dump(file); + fprintf(file, "END B%d", block->num); + + sort_links(&scratch, &block->children); + util_dynarray_foreach(&scratch, link_desc, l) + fprintf(file, " %c>B%d", l->kind, l->num); + fprintf(file, "\n"); + } + + util_dynarray_fini(&scratch); +} + +/* Calculates the immediate dominator of each block, according to "A Simple, + * Fast Dominance Algorithm" by Keith D. Cooper, Timothy J. Harvey, and Ken + * Kennedy. 
+ * + * The authors claim that for control flow graphs of sizes normally encountered + * (less than 1000 nodes) that this algorithm is significantly faster than + * others like Lengauer-Tarjan. + */ +idom_tree::idom_tree(const backend_shader *s) : + num_parents(s->cfg->num_blocks), + parents(new bblock_t *[num_parents]()) +{ + bool changed; + + parents[0] = s->cfg->blocks[0]; + + do { + changed = false; + + foreach_block(block, s->cfg) { + if (block->num == 0) + continue; + + bblock_t *new_idom = NULL; + foreach_list_typed(bblock_link, parent_link, link, &block->parents) { + if (parent(parent_link->block)) { + new_idom = (new_idom ? intersect(new_idom, parent_link->block) : + parent_link->block); + } + } + + if (parent(block) != new_idom) { + parents[block->num] = new_idom; + changed = true; + } + } + } while (changed); +} + +idom_tree::~idom_tree() +{ + delete[] parents; +} + +bblock_t * +idom_tree::intersect(bblock_t *b1, bblock_t *b2) const +{ + /* Note, the comparisons here are the opposite of what the paper says + * because we index blocks from beginning -> end (i.e. reverse post-order) + * instead of post-order like they assume. 
+ */ + while (b1->num != b2->num) { + while (b1->num > b2->num) + b1 = parent(b1); + while (b2->num > b1->num) + b2 = parent(b2); + } + assert(b1); + return b1; +} + +void +idom_tree::dump() const +{ + printf("digraph DominanceTree {\n"); + for (unsigned i = 0; i < num_parents; i++) + printf("\t%d -> %d\n", parents[i]->num, i); + printf("}\n"); +} + +void +cfg_t::dump_cfg() +{ + printf("digraph CFG {\n"); + for (int b = 0; b < num_blocks; b++) { + bblock_t *block = this->blocks[b]; + + foreach_list_typed_safe (bblock_link, child, link, &block->children) { + printf("\t%d -> %d\n", b, child->block->num); + } + } + printf("}\n"); +} + +#define cfgv_assert(assertion) \ + { \ + if (!(assertion)) { \ + fprintf(stderr, "ASSERT: CFG validation in %s failed!\n", stage_abbrev); \ + fprintf(stderr, "%s:%d: '%s' failed\n", __FILE__, __LINE__, #assertion); \ + abort(); \ + } \ + } + +#ifndef NDEBUG +void +cfg_t::validate(const char *stage_abbrev) +{ + foreach_block(block, this) { + foreach_list_typed(bblock_link, successor, link, &block->children) { + /* Each successor of a block must have one predecessor link back to + * the block. + */ + bool successor_links_back_to_predecessor = false; + bblock_t *succ_block = successor->block; + + foreach_list_typed(bblock_link, predecessor, link, &succ_block->parents) { + if (predecessor->block == block) { + cfgv_assert(!successor_links_back_to_predecessor); + cfgv_assert(successor->kind == predecessor->kind); + successor_links_back_to_predecessor = true; + } + } + + cfgv_assert(successor_links_back_to_predecessor); + + /* Each successor block must appear only once in the list of + * successors. + */ + foreach_list_typed_from(bblock_link, later_successor, link, + &block->children, successor->link.next) { + cfgv_assert(successor->block != later_successor->block); + } + } + + foreach_list_typed(bblock_link, predecessor, link, &block->parents) { + /* Each predecessor of a block must have one successor link back to + * the block. 
+ */ + bool predecessor_links_back_to_successor = false; + bblock_t *pred_block = predecessor->block; + + foreach_list_typed(bblock_link, successor, link, &pred_block->children) { + if (successor->block == block) { + cfgv_assert(!predecessor_links_back_to_successor); + cfgv_assert(successor->kind == predecessor->kind); + predecessor_links_back_to_successor = true; + } + } + + cfgv_assert(predecessor_links_back_to_successor); + + /* Each precessor block must appear only once in the list of + * precessors. + */ + foreach_list_typed_from(bblock_link, later_precessor, link, + &block->parents, predecessor->link.next) { + cfgv_assert(predecessor->block != later_precessor->block); + } + } + + backend_instruction *first_inst = block->start(); + if (first_inst->opcode == BRW_OPCODE_DO) { + /* DO instructions both begin and end a block, so the DO instruction + * must be the only instruction in the block. + */ + cfgv_assert(exec_list_is_singular(&block->instructions)); + + /* A block starting with DO should have exactly two successors. One + * is a physical link to the block starting after the WHILE + * instruction. The other is a logical link to the block starting the + * body of the loop. 
+ */ + bblock_t *physical_block = nullptr; + bblock_t *logical_block = nullptr; + + foreach_list_typed(bblock_link, child, link, &block->children) { + if (child->kind == bblock_link_physical) { + cfgv_assert(physical_block == nullptr); + physical_block = child->block; + } else { + cfgv_assert(logical_block == nullptr); + logical_block = child->block; + } + } + + cfgv_assert(logical_block != nullptr); + cfgv_assert(physical_block != nullptr); + } + } +} +#endif diff --git a/src/intel/compiler/elk/brw_cfg.h b/src/intel/compiler/elk/brw_cfg.h new file mode 100644 index 00000000000..7784ab43784 --- /dev/null +++ b/src/intel/compiler/elk/brw_cfg.h @@ -0,0 +1,532 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * Authors: + * Eric Anholt + * + */ + +#ifndef BRW_CFG_H +#define BRW_CFG_H + +#include "brw_ir.h" +#ifdef __cplusplus +#include "brw_ir_analysis.h" +#endif + +struct bblock_t; + +/** + * CFG edge types. + * + * A logical edge represents a potential control flow path of the original + * scalar program, while a physical edge represents a control flow path that + * may not have existed in the original program but was introduced during + * vectorization in order to implement divergent control flow of different + * shader invocations within the same SIMD thread. + * + * All logical edges in the CFG are considered to be physical edges but not + * the other way around -- I.e. the logical CFG is a subset of the physical + * one. + */ +enum bblock_link_kind { + bblock_link_logical = 0, + bblock_link_physical +}; + +struct bblock_link { +#ifdef __cplusplus + DECLARE_RALLOC_CXX_OPERATORS(bblock_link) + + bblock_link(bblock_t *block, enum bblock_link_kind kind) + : block(block), kind(kind) + { + } +#endif + + struct exec_node link; + struct bblock_t *block; + + /* Type of this CFG edge. Because bblock_link_logical also implies + * bblock_link_physical, the proper way to test for membership of edge 'l' + * in CFG kind 'k' is 'l.kind <= k'. 
+ */ + enum bblock_link_kind kind; +}; + +struct backend_shader; +struct cfg_t; + +struct bblock_t { +#ifdef __cplusplus + DECLARE_RALLOC_CXX_OPERATORS(bblock_t) + + explicit bblock_t(cfg_t *cfg); + + void add_successor(void *mem_ctx, bblock_t *successor, + enum bblock_link_kind kind); + bool is_predecessor_of(const bblock_t *block, + enum bblock_link_kind kind) const; + bool is_successor_of(const bblock_t *block, + enum bblock_link_kind kind) const; + bool can_combine_with(const bblock_t *that) const; + void combine_with(bblock_t *that); + void dump(FILE *file = stderr) const; + + backend_instruction *start(); + const backend_instruction *start() const; + backend_instruction *end(); + const backend_instruction *end() const; + + bblock_t *next(); + const bblock_t *next() const; + bblock_t *prev(); + const bblock_t *prev() const; + + bool starts_with_control_flow() const; + bool ends_with_control_flow() const; + + backend_instruction *first_non_control_flow_inst(); + backend_instruction *last_non_control_flow_inst(); + +private: + /** + * \sa unlink_parents, unlink_children + */ + void unlink_list(exec_list *); + +public: + void unlink_parents() + { + unlink_list(&parents); + } + + void unlink_children() + { + unlink_list(&children); + } +#endif + + struct exec_node link; + struct cfg_t *cfg; + + int start_ip; + int end_ip; + + /** + * Change in end_ip since the last time IPs of later blocks were updated. 
+ */ + int end_ip_delta; + + struct exec_list instructions; + struct exec_list parents; + struct exec_list children; + int num; +}; + +static inline struct backend_instruction * +bblock_start(struct bblock_t *block) +{ + return (struct backend_instruction *)exec_list_get_head(&block->instructions); +} + +static inline const struct backend_instruction * +bblock_start_const(const struct bblock_t *block) +{ + return (const struct backend_instruction *)exec_list_get_head_const(&block->instructions); +} + +static inline struct backend_instruction * +bblock_end(struct bblock_t *block) +{ + return (struct backend_instruction *)exec_list_get_tail(&block->instructions); +} + +static inline const struct backend_instruction * +bblock_end_const(const struct bblock_t *block) +{ + return (const struct backend_instruction *)exec_list_get_tail_const(&block->instructions); +} + +static inline struct bblock_t * +bblock_next(struct bblock_t *block) +{ + if (exec_node_is_tail_sentinel(block->link.next)) + return NULL; + + return (struct bblock_t *)block->link.next; +} + +static inline const struct bblock_t * +bblock_next_const(const struct bblock_t *block) +{ + if (exec_node_is_tail_sentinel(block->link.next)) + return NULL; + + return (const struct bblock_t *)block->link.next; +} + +static inline struct bblock_t * +bblock_prev(struct bblock_t *block) +{ + if (exec_node_is_head_sentinel(block->link.prev)) + return NULL; + + return (struct bblock_t *)block->link.prev; +} + +static inline const struct bblock_t * +bblock_prev_const(const struct bblock_t *block) +{ + if (exec_node_is_head_sentinel(block->link.prev)) + return NULL; + + return (const struct bblock_t *)block->link.prev; +} + +static inline bool +bblock_starts_with_control_flow(const struct bblock_t *block) +{ + enum opcode op = bblock_start_const(block)->opcode; + return op == BRW_OPCODE_DO || op == BRW_OPCODE_ENDIF; +} + +static inline bool +bblock_ends_with_control_flow(const struct bblock_t *block) +{ + enum opcode op = 
bblock_end_const(block)->opcode; + return op == BRW_OPCODE_IF || + op == BRW_OPCODE_ELSE || + op == BRW_OPCODE_WHILE || + op == BRW_OPCODE_BREAK || + op == BRW_OPCODE_CONTINUE; +} + +static inline struct backend_instruction * +bblock_first_non_control_flow_inst(struct bblock_t *block) +{ + struct backend_instruction *inst = bblock_start(block); + if (bblock_starts_with_control_flow(block)) +#ifdef __cplusplus + inst = (struct backend_instruction *)inst->next; +#else + inst = (struct backend_instruction *)inst->link.next; +#endif + return inst; +} + +static inline struct backend_instruction * +bblock_last_non_control_flow_inst(struct bblock_t *block) +{ + struct backend_instruction *inst = bblock_end(block); + if (bblock_ends_with_control_flow(block)) +#ifdef __cplusplus + inst = (struct backend_instruction *)inst->prev; +#else + inst = (struct backend_instruction *)inst->link.prev; +#endif + return inst; +} + +#ifdef __cplusplus +inline backend_instruction * +bblock_t::start() +{ + return bblock_start(this); +} + +inline const backend_instruction * +bblock_t::start() const +{ + return bblock_start_const(this); +} + +inline backend_instruction * +bblock_t::end() +{ + return bblock_end(this); +} + +inline const backend_instruction * +bblock_t::end() const +{ + return bblock_end_const(this); +} + +inline bblock_t * +bblock_t::next() +{ + return bblock_next(this); +} + +inline const bblock_t * +bblock_t::next() const +{ + return bblock_next_const(this); +} + +inline bblock_t * +bblock_t::prev() +{ + return bblock_prev(this); +} + +inline const bblock_t * +bblock_t::prev() const +{ + return bblock_prev_const(this); +} + +inline bool +bblock_t::starts_with_control_flow() const +{ + return bblock_starts_with_control_flow(this); +} + +inline bool +bblock_t::ends_with_control_flow() const +{ + return bblock_ends_with_control_flow(this); +} + +inline backend_instruction * +bblock_t::first_non_control_flow_inst() +{ + return bblock_first_non_control_flow_inst(this); +} + 
+inline backend_instruction * +bblock_t::last_non_control_flow_inst() +{ + return bblock_last_non_control_flow_inst(this); +} +#endif + +struct cfg_t { +#ifdef __cplusplus + DECLARE_RALLOC_CXX_OPERATORS(cfg_t) + + cfg_t(const backend_shader *s, exec_list *instructions); + ~cfg_t(); + + void remove_block(bblock_t *block); + + bblock_t *first_block(); + const bblock_t *first_block() const; + bblock_t *last_block(); + const bblock_t *last_block() const; + + bblock_t *new_block(); + void set_next_block(bblock_t **cur, bblock_t *block, int ip); + void make_block_array(); + + void dump(FILE *file = stderr); + void dump_cfg(); + +#ifdef NDEBUG + void validate(UNUSED const char *stage_abbrev) { } +#else + void validate(const char *stage_abbrev); +#endif + + /** + * Propagate bblock_t::end_ip_delta data through the CFG. + */ + inline void adjust_block_ips(); + +#endif + const struct backend_shader *s; + void *mem_ctx; + + /** Ordered list (by ip) of basic blocks */ + struct exec_list block_list; + struct bblock_t **blocks; + int num_blocks; +}; + +static inline struct bblock_t * +cfg_first_block(struct cfg_t *cfg) +{ + return (struct bblock_t *)exec_list_get_head(&cfg->block_list); +} + +static inline const struct bblock_t * +cfg_first_block_const(const struct cfg_t *cfg) +{ + return (const struct bblock_t *)exec_list_get_head_const(&cfg->block_list); +} + +static inline struct bblock_t * +cfg_last_block(struct cfg_t *cfg) +{ + return (struct bblock_t *)exec_list_get_tail(&cfg->block_list); +} + +static inline const struct bblock_t * +cfg_last_block_const(const struct cfg_t *cfg) +{ + return (const struct bblock_t *)exec_list_get_tail_const(&cfg->block_list); +} + +#ifdef __cplusplus +inline bblock_t * +cfg_t::first_block() +{ + return cfg_first_block(this); +} + +const inline bblock_t * +cfg_t::first_block() const +{ + return cfg_first_block_const(this); +} + +inline bblock_t * +cfg_t::last_block() +{ + return cfg_last_block(this); +} + +const inline bblock_t * 
+cfg_t::last_block() const +{ + return cfg_last_block_const(this); +} +#endif + +/* Note that this is implemented with a double for loop -- break will + * break from the inner loop only! + */ +#define foreach_block_and_inst(__block, __type, __inst, __cfg) \ + foreach_block (__block, __cfg) \ + foreach_inst_in_block (__type, __inst, __block) + +/* Note that this is implemented with a double for loop -- break will + * break from the inner loop only! + */ +#define foreach_block_and_inst_safe(__block, __type, __inst, __cfg) \ + foreach_block_safe (__block, __cfg) \ + foreach_inst_in_block_safe (__type, __inst, __block) + +#define foreach_block(__block, __cfg) \ + foreach_list_typed (bblock_t, __block, link, &(__cfg)->block_list) + +#define foreach_block_reverse(__block, __cfg) \ + foreach_list_typed_reverse (bblock_t, __block, link, &(__cfg)->block_list) + +#define foreach_block_safe(__block, __cfg) \ + foreach_list_typed_safe (bblock_t, __block, link, &(__cfg)->block_list) + +#define foreach_block_reverse_safe(__block, __cfg) \ + foreach_list_typed_reverse_safe (bblock_t, __block, link, &(__cfg)->block_list) + +#define foreach_inst_in_block(__type, __inst, __block) \ + foreach_in_list(__type, __inst, &(__block)->instructions) + +#define foreach_inst_in_block_safe(__type, __inst, __block) \ + for (__type *__inst = (__type *)__block->instructions.head_sentinel.next, \ + *__next = (__type *)__inst->next; \ + __next != NULL; \ + __inst = __next, \ + __next = (__type *)__next->next) + +#define foreach_inst_in_block_reverse(__type, __inst, __block) \ + foreach_in_list_reverse(__type, __inst, &(__block)->instructions) + +#define foreach_inst_in_block_reverse_safe(__type, __inst, __block) \ + foreach_in_list_reverse_safe(__type, __inst, &(__block)->instructions) + +#define foreach_inst_in_block_starting_from(__type, __scan_inst, __inst) \ + for (__type *__scan_inst = (__type *)__inst->next; \ + !__scan_inst->is_tail_sentinel(); \ + __scan_inst = (__type *)__scan_inst->next) + 
+#define foreach_inst_in_block_reverse_starting_from(__type, __scan_inst, __inst) \ + for (__type *__scan_inst = (__type *)__inst->prev; \ + !__scan_inst->is_head_sentinel(); \ + __scan_inst = (__type *)__scan_inst->prev) + +#ifdef __cplusplus +inline void +cfg_t::adjust_block_ips() +{ + int delta = 0; + + foreach_block(block, this) { + block->start_ip += delta; + block->end_ip += delta; + + delta += block->end_ip_delta; + + block->end_ip_delta = 0; + } +} + +namespace brw { + /** + * Immediate dominator tree analysis of a shader. + */ + struct idom_tree { + idom_tree(const backend_shader *s); + ~idom_tree(); + + bool + validate(const backend_shader *) const + { + /* FINISHME */ + return true; + } + + analysis_dependency_class + dependency_class() const + { + return DEPENDENCY_BLOCKS; + } + + const bblock_t * + parent(const bblock_t *b) const + { + assert(unsigned(b->num) < num_parents); + return parents[b->num]; + } + + bblock_t * + parent(bblock_t *b) const + { + assert(unsigned(b->num) < num_parents); + return parents[b->num]; + } + + bblock_t * + intersect(bblock_t *b1, bblock_t *b2) const; + + void + dump() const; + + private: + unsigned num_parents; + bblock_t **parents; + }; +} +#endif + +#endif /* BRW_CFG_H */ diff --git a/src/intel/compiler/elk/brw_clip.h b/src/intel/compiler/elk/brw_clip.h new file mode 100644 index 00000000000..4ca89455963 --- /dev/null +++ b/src/intel/compiler/elk/brw_clip.h @@ -0,0 +1,163 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#ifndef BRW_CLIP_H +#define BRW_CLIP_H + +#include "brw_compiler.h" +#include "brw_eu.h" + +/* Initial 3 verts, plus at most 6 additional verts from intersections + * with fixed planes, plus at most 8 additional verts from intersections + * with user clip planes + */ +#define MAX_VERTS (3+6+8) + +#define PRIM_MASK (0x1f) + +struct brw_clip_compile { + struct brw_codegen func; + struct brw_clip_prog_key key; + struct brw_clip_prog_data prog_data; + + struct { + struct brw_reg R0; + struct brw_reg vertex[MAX_VERTS]; + + struct brw_reg t; + struct brw_reg t0, t1; + struct brw_reg dp0, dp1; + + struct brw_reg dpPrev; + struct brw_reg dp; + struct brw_reg loopcount; + struct brw_reg nr_verts; + struct brw_reg planemask; + + struct brw_reg inlist; + struct brw_reg outlist; + struct brw_reg freelist; + + struct brw_reg dir; + struct brw_reg tmp0, tmp1; + struct brw_reg offset; + + struct brw_reg fixed_planes; + struct brw_reg plane_equation; + + struct brw_reg ff_sync; + + /* Bitmask indicating which coordinate attribute should be used for + * comparison to each clipping plane. A 0 indicates that VARYING_SLOT_POS + * should be used, because it's one of the fixed +/- x/y/z planes that + * constitute the bounds of the view volume. A 1 indicates that + * VARYING_SLOT_CLIP_VERTEX should be used (if available) since it's a user- + * defined clipping plane. + */ + struct brw_reg vertex_src_mask; + + /* Offset into the vertex of the current plane's clipdistance value */ + struct brw_reg clipdistance_offset; + } reg; + + /* Number of registers storing VUE data */ + GLuint nr_regs; + + GLuint first_tmp; + GLuint last_tmp; + + bool need_direction; + + struct intel_vue_map vue_map; +}; + +/** + * True if the given varying is one of the outputs of the vertex shader. 
+ */ +static inline bool brw_clip_have_varying(struct brw_clip_compile *c, + GLuint varying) +{ + return (c->key.attrs & BITFIELD64_BIT(varying)) ? 1 : 0; +} + +/* Points are only culled, so no need for a clip routine, however it + * works out easier to have a dummy one. + */ +void brw_emit_unfilled_clip( struct brw_clip_compile *c ); +void brw_emit_tri_clip( struct brw_clip_compile *c ); +void brw_emit_line_clip( struct brw_clip_compile *c ); +void brw_emit_point_clip( struct brw_clip_compile *c ); + +/* brw_clip_tri.c, for use by the unfilled clip routine: + */ +void brw_clip_tri_init_vertices( struct brw_clip_compile *c ); +void brw_clip_tri_flat_shade( struct brw_clip_compile *c ); +void brw_clip_tri( struct brw_clip_compile *c ); +void brw_clip_tri_emit_polygon( struct brw_clip_compile *c ); +void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, + GLuint nr_verts ); + + +/* Utils: + */ + +void brw_clip_interp_vertex( struct brw_clip_compile *c, + struct brw_indirect dest_ptr, + struct brw_indirect v0_ptr, /* from */ + struct brw_indirect v1_ptr, /* to */ + struct brw_reg t0, + bool force_edgeflag ); + +void brw_clip_init_planes( struct brw_clip_compile *c ); + +void brw_clip_emit_vue(struct brw_clip_compile *c, + struct brw_indirect vert, + enum brw_urb_write_flags flags, + GLuint header); + +void brw_clip_kill_thread(struct brw_clip_compile *c); + +struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c ); +struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c ); + +void brw_clip_copy_flatshaded_attributes( struct brw_clip_compile *c, + GLuint to, GLuint from ); + +void brw_clip_init_clipmask( struct brw_clip_compile *c ); + +struct brw_reg get_tmp( struct brw_clip_compile *c ); + +void brw_clip_project_position(struct brw_clip_compile *c, + struct brw_reg pos ); +void brw_clip_ff_sync(struct brw_clip_compile *c); +void brw_clip_init_ff_sync(struct brw_clip_compile *c); + +#endif diff --git a/src/intel/compiler/elk/brw_clip_line.c 
b/src/intel/compiler/elk/brw_clip_line.c new file mode 100644 index 00000000000..b71173277d9 --- /dev/null +++ b/src/intel/compiler/elk/brw_clip_line.c @@ -0,0 +1,303 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include "brw_clip.h" +#include "brw_prim.h" + +static void brw_clip_line_alloc_regs( struct brw_clip_compile *c ) +{ + const struct intel_device_info *devinfo = c->func.devinfo; + GLuint i = 0,j; + + /* Register usage is static, precompute here: + */ + c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; + + if (c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec4_grf(i, 0); + i += (6 + c->key.nr_userclip + 1) / 2; + + c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2; + } + else + c->prog_data.curb_read_length = 0; + + + /* Payload vertices plus space for more generated vertices: + */ + for (j = 0; j < 4; j++) { + c->reg.vertex[j] = brw_vec4_grf(i, 0); + i += c->nr_regs; + } + + c->reg.t = brw_vec1_grf(i, 0); + c->reg.t0 = brw_vec1_grf(i, 1); + c->reg.t1 = brw_vec1_grf(i, 2); + c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD); + c->reg.plane_equation = brw_vec4_grf(i, 4); + i++; + + c->reg.dp0 = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */ + c->reg.dp1 = brw_vec1_grf(i, 4); + i++; + + if (!c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec8_grf(i, 0); + i++; + } + + c->reg.vertex_src_mask = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD); + c->reg.clipdistance_offset = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_W); + i++; + + if (devinfo->ver == 5) { + c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD); + i++; + } + + c->first_tmp = i; + c->last_tmp = i; + + c->prog_data.urb_read_length = c->nr_regs; /* ? 
*/ + c->prog_data.total_grf = i; +} + + +/* Line clipping, more or less following the following algorithm: + * + * for (p=0;p t1) t1 = t; + * } else { + * GLfloat t = dp0 / (dp0 - dp1); + * if (t > t0) t0 = t; + * } + * + * if (t0 + t1 >= 1.0) + * return; + * } + * } + * + * interp( ctx, newvtx0, vtx0, vtx1, t0 ); + * interp( ctx, newvtx1, vtx1, vtx0, t1 ); + * + */ +static void clip_and_emit_line( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_indirect vtx0 = brw_indirect(0, 0); + struct brw_indirect vtx1 = brw_indirect(1, 0); + struct brw_indirect newvtx0 = brw_indirect(2, 0); + struct brw_indirect newvtx1 = brw_indirect(3, 0); + struct brw_indirect plane_ptr = brw_indirect(4, 0); + struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD); + GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS); + GLint clipdist0_offset = c->key.nr_userclip + ? brw_varying_to_offset(&c->vue_map, VARYING_SLOT_CLIP_DIST0) + : 0; + + brw_MOV(p, get_addr_reg(vtx0), brw_address(c->reg.vertex[0])); + brw_MOV(p, get_addr_reg(vtx1), brw_address(c->reg.vertex[1])); + brw_MOV(p, get_addr_reg(newvtx0), brw_address(c->reg.vertex[2])); + brw_MOV(p, get_addr_reg(newvtx1), brw_address(c->reg.vertex[3])); + brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c)); + + /* Note: init t0, t1 together: + */ + brw_MOV(p, vec2(c->reg.t0), brw_imm_f(0)); + + brw_clip_init_planes(c); + brw_clip_init_clipmask(c); + + /* -ve rhw workaround */ + if (p->devinfo->has_negative_rhw_bug) { + brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), + brw_imm_ud(1<<20)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f)); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + } + + /* Set the initial vertex source mask: The first 6 planes are the bounds + * of the view volume; the next 8 planes are the user clipping 
planes. + */ + brw_MOV(p, c->reg.vertex_src_mask, brw_imm_ud(0x3fc0)); + + /* Set the initial clipdistance offset to be 6 floats before gl_ClipDistance[0]. + * We'll increment 6 times before we start hitting actual user clipping. */ + brw_MOV(p, c->reg.clipdistance_offset, brw_imm_d(clipdist0_offset - 6*sizeof(float))); + + brw_DO(p, BRW_EXECUTE_1); + { + /* if (planemask & 1) + */ + brw_AND(p, v1_null_ud, c->reg.planemask, brw_imm_ud(1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + + brw_IF(p, BRW_EXECUTE_1); + { + brw_AND(p, v1_null_ud, c->reg.vertex_src_mask, brw_imm_ud(1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_IF(p, BRW_EXECUTE_1); + { + /* user clip distance: just fetch the correct float from each vertex */ + struct brw_indirect temp_ptr = brw_indirect(7, 0); + brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx0), c->reg.clipdistance_offset); + brw_MOV(p, c->reg.dp0, deref_1f(temp_ptr, 0)); + brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx1), c->reg.clipdistance_offset); + brw_MOV(p, c->reg.dp1, deref_1f(temp_ptr, 0)); + } + brw_ELSE(p); + { + /* fixed plane: fetch the hpos, dp4 against the plane. */ + if (c->key.nr_userclip) + brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0)); + else + brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0)); + + brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, hpos_offset), c->reg.plane_equation); + brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, hpos_offset), c->reg.plane_equation); + } + brw_ENDIF(p); + + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, vec1(c->reg.dp1), brw_imm_f(0.0f)); + + brw_IF(p, BRW_EXECUTE_1); + { + /* + * Both can be negative on GM965/G965 due to RHW workaround + * if so, this object should be rejected. 
+ */ + if (p->devinfo->has_negative_rhw_bug) { + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0)); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p); + } + + brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1); + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 ); + brw_MOV(p, c->reg.t1, c->reg.t); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, + BRW_PREDICATE_NORMAL); + } + brw_ELSE(p); + { + /* Coming back in. We know that both cannot be negative + * because the line would have been culled in that case. + */ + + /* If both are positive, do nothing */ + /* Only on GM965/G965 */ + if (p->devinfo->has_negative_rhw_bug) { + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0)); + brw_IF(p, BRW_EXECUTE_1); + } + + { + brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0); + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 ); + brw_MOV(p, c->reg.t0, c->reg.t); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, + BRW_PREDICATE_NORMAL); + } + + if (p->devinfo->has_negative_rhw_bug) { + brw_ENDIF(p); + } + } + brw_ENDIF(p); + } + brw_ENDIF(p); + + /* plane_ptr++; + */ + brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c)); + + /* while (planemask>>=1) != 0 + */ + brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_SHR(p, c->reg.vertex_src_mask, c->reg.vertex_src_mask, brw_imm_ud(1)); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + brw_ADD(p, c->reg.clipdistance_offset, c->reg.clipdistance_offset, brw_imm_w(sizeof(float))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, 
BRW_PREDICATE_NORMAL); + } + brw_WHILE(p); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + + brw_ADD(p, c->reg.t, c->reg.t0, c->reg.t1); + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.t, brw_imm_f(1.0)); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_interp_vertex(c, newvtx0, vtx0, vtx1, c->reg.t0, false); + brw_clip_interp_vertex(c, newvtx1, vtx1, vtx0, c->reg.t1, false); + + brw_clip_emit_vue(c, newvtx0, BRW_URB_WRITE_ALLOCATE_COMPLETE, + (_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_START); + brw_clip_emit_vue(c, newvtx1, BRW_URB_WRITE_EOT_COMPLETE, + (_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END); + } + brw_ENDIF(p); + brw_clip_kill_thread(c); +} + + + +void brw_emit_line_clip( struct brw_clip_compile *c ) +{ + brw_clip_line_alloc_regs(c); + brw_clip_init_ff_sync(c); + + if (c->key.contains_flat_varying) { + if (c->key.pv_first) + brw_clip_copy_flatshaded_attributes(c, 1, 0); + else + brw_clip_copy_flatshaded_attributes(c, 0, 1); + } + + clip_and_emit_line(c); +} diff --git a/src/intel/compiler/elk/brw_clip_point.c b/src/intel/compiler/elk/brw_clip_point.c new file mode 100644 index 00000000000..1cfb5f23357 --- /dev/null +++ b/src/intel/compiler/elk/brw_clip_point.c @@ -0,0 +1,45 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include "brw_clip.h" + + +/* Point clipping, nothing to do? + */ +void brw_emit_point_clip( struct brw_clip_compile *c ) +{ + /* Send an empty message to kill the thread: + */ + brw_clip_tri_alloc_regs(c, 0); + brw_clip_init_ff_sync(c); + + brw_clip_kill_thread(c); +} diff --git a/src/intel/compiler/elk/brw_clip_tri.c b/src/intel/compiler/elk/brw_clip_tri.c new file mode 100644 index 00000000000..a5bc2b85c12 --- /dev/null +++ b/src/intel/compiler/elk/brw_clip_tri.c @@ -0,0 +1,659 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include "brw_clip.h" +#include "brw_prim.h" + +static void release_tmps( struct brw_clip_compile *c ) +{ + c->last_tmp = c->first_tmp; +} + + +void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, + GLuint nr_verts ) +{ + const struct intel_device_info *devinfo = c->func.devinfo; + GLuint i = 0,j; + + /* Register usage is static, precompute here: + */ + c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; + + if (c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec4_grf(i, 0); + i += (6 + c->key.nr_userclip + 1) / 2; + + c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2; + } + else + c->prog_data.curb_read_length = 0; + + + /* Payload vertices plus space for more generated vertices: + */ + for (j = 0; j < nr_verts; j++) { + c->reg.vertex[j] = brw_vec4_grf(i, 0); + i += c->nr_regs; + } + + if (c->vue_map.num_slots % 2 && nr_verts > 0) { + /* The VUE has an odd number of slots so the last register is only half + * used. Fill the second half with zero. 
+ */ + for (j = 0; j < 3; j++) { + GLuint delta = brw_vue_slot_to_offset(c->vue_map.num_slots); + + brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0)); + } + } + + c->reg.t = brw_vec1_grf(i, 0); + c->reg.loopcount = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_D); + c->reg.nr_verts = retype(brw_vec1_grf(i, 2), BRW_REGISTER_TYPE_UD); + c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD); + c->reg.plane_equation = brw_vec4_grf(i, 4); + i++; + + c->reg.dpPrev = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */ + c->reg.dp = brw_vec1_grf(i, 4); + i++; + + c->reg.inlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0); + i++; + + c->reg.outlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0); + i++; + + c->reg.freelist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0); + i++; + + if (!c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec8_grf(i, 0); + i++; + } + + if (c->key.do_unfilled) { + c->reg.dir = brw_vec4_grf(i, 0); + c->reg.offset = brw_vec4_grf(i, 4); + i++; + c->reg.tmp0 = brw_vec4_grf(i, 0); + c->reg.tmp1 = brw_vec4_grf(i, 4); + i++; + } + + c->reg.vertex_src_mask = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD); + c->reg.clipdistance_offset = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_W); + i++; + + if (devinfo->ver == 5) { + c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD); + i++; + } + + c->first_tmp = i; + c->last_tmp = i; + + c->prog_data.urb_read_length = c->nr_regs; /* ? */ + c->prog_data.total_grf = i; +} + + + +void brw_clip_tri_init_vertices( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */ + + /* Initial list of indices for incoming vertices: + */ + brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_EQ, + tmp0, + brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE)); + + /* XXX: Is there an easier way to do this? 
Need to reverse every + * second tristrip element: Can ignore sometimes? + */ + brw_IF(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[1]) ); + brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[0]) ); + if (c->need_direction) + brw_MOV(p, c->reg.dir, brw_imm_f(-1)); + } + brw_ELSE(p); + { + brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[0]) ); + brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[1]) ); + if (c->need_direction) + brw_MOV(p, c->reg.dir, brw_imm_f(1)); + } + brw_ENDIF(p); + + brw_MOV(p, get_element(c->reg.inlist, 2), brw_address(c->reg.vertex[2]) ); + brw_MOV(p, brw_vec8_grf(c->reg.outlist.nr, 0), brw_imm_f(0)); + brw_MOV(p, c->reg.nr_verts, brw_imm_ud(3)); +} + + + +void brw_clip_tri_flat_shade( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */ + + brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_EQ, + tmp0, + brw_imm_ud(_3DPRIM_POLYGON)); + + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_copy_flatshaded_attributes(c, 1, 0); + brw_clip_copy_flatshaded_attributes(c, 2, 0); + } + brw_ELSE(p); + { + if (c->key.pv_first) { + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_EQ, + tmp0, + brw_imm_ud(_3DPRIM_TRIFAN)); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_copy_flatshaded_attributes(c, 0, 1); + brw_clip_copy_flatshaded_attributes(c, 2, 1); + } + brw_ELSE(p); + { + brw_clip_copy_flatshaded_attributes(c, 1, 0); + brw_clip_copy_flatshaded_attributes(c, 2, 0); + } + brw_ENDIF(p); + } + else { + brw_clip_copy_flatshaded_attributes(c, 0, 2); + brw_clip_copy_flatshaded_attributes(c, 1, 2); + } + } + brw_ENDIF(p); +} + + +/** + * Loads the clip distance for a vertex into `dst`, and ends with + * a comparison of it to zero with the condition `cond`. 
+ * + * - If using a fixed plane, the distance is dot(hpos, plane). + * - If using a user clip plane, the distance is directly available in the vertex. + */ +static inline void +load_clip_distance(struct brw_clip_compile *c, struct brw_indirect vtx, + struct brw_reg dst, GLuint hpos_offset, int cond) +{ + struct brw_codegen *p = &c->func; + + dst = vec4(dst); + brw_AND(p, vec1(brw_null_reg()), c->reg.vertex_src_mask, brw_imm_ud(1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_IF(p, BRW_EXECUTE_1); + { + struct brw_indirect temp_ptr = brw_indirect(7, 0); + brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx), c->reg.clipdistance_offset); + brw_MOV(p, vec1(dst), deref_1f(temp_ptr, 0)); + } + brw_ELSE(p); + { + brw_MOV(p, dst, deref_4f(vtx, hpos_offset)); + brw_DP4(p, dst, dst, c->reg.plane_equation); + } + brw_ENDIF(p); + + brw_CMP(p, brw_null_reg(), cond, vec1(dst), brw_imm_f(0.0f)); +} + + +/* Use mesa's clipping algorithms, translated to GFX4 assembly. + */ +void brw_clip_tri( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_indirect vtx = brw_indirect(0, 0); + struct brw_indirect vtxPrev = brw_indirect(1, 0); + struct brw_indirect vtxOut = brw_indirect(2, 0); + struct brw_indirect plane_ptr = brw_indirect(3, 0); + struct brw_indirect inlist_ptr = brw_indirect(4, 0); + struct brw_indirect outlist_ptr = brw_indirect(5, 0); + struct brw_indirect freelist_ptr = brw_indirect(6, 0); + GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS); + GLint clipdist0_offset = c->key.nr_userclip + ? 
brw_varying_to_offset(&c->vue_map, VARYING_SLOT_CLIP_DIST0) + : 0; + + brw_MOV(p, get_addr_reg(vtxPrev), brw_address(c->reg.vertex[2]) ); + brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c)); + brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist)); + brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist)); + + brw_MOV(p, get_addr_reg(freelist_ptr), brw_address(c->reg.vertex[3]) ); + + /* Set the initial vertex source mask: The first 6 planes are the bounds + * of the view volume; the next 8 planes are the user clipping planes. + */ + brw_MOV(p, c->reg.vertex_src_mask, brw_imm_ud(0x3fc0)); + + /* Set the initial clipdistance offset to be 6 floats before gl_ClipDistance[0]. + * We'll increment 6 times before we start hitting actual user clipping. */ + brw_MOV(p, c->reg.clipdistance_offset, brw_imm_d(clipdist0_offset - 6*sizeof(float))); + + brw_DO(p, BRW_EXECUTE_1); + { + /* if (planemask & 1) + */ + brw_AND(p, vec1(brw_null_reg()), c->reg.planemask, brw_imm_ud(1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + + brw_IF(p, BRW_EXECUTE_1); + { + /* vtxOut = freelist_ptr++ + */ + brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(freelist_ptr) ); + brw_ADD(p, get_addr_reg(freelist_ptr), get_addr_reg(freelist_ptr), brw_imm_uw(c->nr_regs * REG_SIZE)); + + if (c->key.nr_userclip) + brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0)); + else + brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0)); + + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, c->reg.nr_verts, brw_imm_ud(0)); + + brw_DO(p, BRW_EXECUTE_1); + { + /* vtx = *input_ptr; + */ + brw_MOV(p, get_addr_reg(vtx), deref_1uw(inlist_ptr, 0)); + + load_clip_distance(c, vtxPrev, c->reg.dpPrev, hpos_offset, BRW_CONDITIONAL_L); + /* (prev < 0.0f) */ + brw_IF(p, BRW_EXECUTE_1); + { + load_clip_distance(c, vtx, c->reg.dp, hpos_offset, BRW_CONDITIONAL_GE); + /* IS_POSITIVE(next) + */ + brw_IF(p, BRW_EXECUTE_1); + { + + /* Coming back in. 
+ */ + brw_ADD(p, c->reg.t, c->reg.dpPrev, negate(c->reg.dp)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dpPrev); + + /* If (vtxOut == 0) vtxOut = vtxPrev + */ + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) ); + brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtxPrev)); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, + BRW_PREDICATE_NORMAL); + + brw_clip_interp_vertex(c, vtxOut, vtxPrev, vtx, c->reg.t, false); + + /* *outlist_ptr++ = vtxOut; + * nr_verts++; + * vtxOut = 0; + */ + brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut)); + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short))); + brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1)); + brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) ); + } + brw_ENDIF(p); + + } + brw_ELSE(p); + { + /* *outlist_ptr++ = vtxPrev; + * nr_verts++; + */ + brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxPrev)); + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short))); + brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1)); + + load_clip_distance(c, vtx, c->reg.dp, hpos_offset, BRW_CONDITIONAL_L); + /* (next < 0.0f) + */ + brw_IF(p, BRW_EXECUTE_1); + { + /* Going out of bounds. Avoid division by zero as we + * know dp != dpPrev from DIFFERENT_SIGNS, above. 
+ */ + brw_ADD(p, c->reg.t, c->reg.dp, negate(c->reg.dpPrev)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp); + + /* If (vtxOut == 0) vtxOut = vtx + */ + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) ); + brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtx)); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, + BRW_PREDICATE_NORMAL); + + brw_clip_interp_vertex(c, vtxOut, vtx, vtxPrev, c->reg.t, true); + + /* *outlist_ptr++ = vtxOut; + * nr_verts++; + * vtxOut = 0; + */ + brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut)); + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short))); + brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1)); + brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) ); + } + brw_ENDIF(p); + } + brw_ENDIF(p); + + /* vtxPrev = vtx; + * inlist_ptr++; + */ + brw_MOV(p, get_addr_reg(vtxPrev), get_addr_reg(vtx)); + brw_ADD(p, get_addr_reg(inlist_ptr), get_addr_reg(inlist_ptr), brw_imm_uw(sizeof(short))); + + /* while (--loopcount != 0) + */ + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + } + brw_WHILE(p); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + + /* vtxPrev = *(outlist_ptr-1) OR: outlist[nr_verts-1] + * inlist = outlist + * inlist_ptr = &inlist[0] + * outlist_ptr = &outlist[0] + */ + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_w(-2)); + brw_MOV(p, get_addr_reg(vtxPrev), deref_1uw(outlist_ptr, 0)); + brw_MOV(p, brw_vec8_grf(c->reg.inlist.nr, 0), brw_vec8_grf(c->reg.outlist.nr, 0)); + brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist)); + brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist)); + } + brw_ENDIF(p); + + /* plane_ptr++; + */ + brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c)); + + /* 
nr_verts >= 3 + */ + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_GE, + c->reg.nr_verts, + brw_imm_ud(3)); + brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL); + + /* && (planemask>>=1) != 0 + */ + brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_SHR(p, c->reg.vertex_src_mask, c->reg.vertex_src_mask, brw_imm_ud(1)); + brw_ADD(p, c->reg.clipdistance_offset, c->reg.clipdistance_offset, brw_imm_w(sizeof(float))); + } + brw_WHILE(p); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); +} + + + +void brw_clip_tri_emit_polygon(struct brw_clip_compile *c) +{ + struct brw_codegen *p = &c->func; + + /* for (loopcount = nr_verts-2; loopcount > 0; loopcount--) + */ + brw_ADD(p, + c->reg.loopcount, + c->reg.nr_verts, + brw_imm_d(-2)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_G); + + brw_IF(p, BRW_EXECUTE_1); + { + struct brw_indirect v0 = brw_indirect(0, 0); + struct brw_indirect vptr = brw_indirect(1, 0); + + brw_MOV(p, get_addr_reg(vptr), brw_address(c->reg.inlist)); + brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0)); + + brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE, + ((_3DPRIM_TRIFAN << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_START)); + + brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2)); + brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0)); + + brw_DO(p, BRW_EXECUTE_1); + { + brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE, + (_3DPRIM_TRIFAN << URB_WRITE_PRIM_TYPE_SHIFT)); + + brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2)); + brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0)); + + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + } + brw_WHILE(p); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + + brw_clip_emit_vue(c, v0, 
BRW_URB_WRITE_EOT_COMPLETE, + ((_3DPRIM_TRIFAN << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END)); + } + brw_ENDIF(p); +} + +static void do_clip_tri( struct brw_clip_compile *c ) +{ + brw_clip_init_planes(c); + + brw_clip_tri(c); +} + + +static void maybe_do_clip_tri( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0)); + brw_IF(p, BRW_EXECUTE_1); + { + do_clip_tri(c); + } + brw_ENDIF(p); +} + +static void brw_clip_test( struct brw_clip_compile *c ) +{ + struct brw_reg t = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + struct brw_reg t1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + struct brw_reg t2 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + struct brw_reg t3 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + + struct brw_reg v0 = get_tmp(c); + struct brw_reg v1 = get_tmp(c); + struct brw_reg v2 = get_tmp(c); + + struct brw_indirect vt0 = brw_indirect(0, 0); + struct brw_indirect vt1 = brw_indirect(1, 0); + struct brw_indirect vt2 = brw_indirect(2, 0); + + struct brw_codegen *p = &c->func; + struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */ + + GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_POS); + + brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0])); + brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1])); + brw_MOV(p, get_addr_reg(vt2), brw_address(c->reg.vertex[2])); + brw_MOV(p, v0, deref_4f(vt0, hpos_offset)); + brw_MOV(p, v1, deref_4f(vt1, hpos_offset)); + brw_MOV(p, v2, deref_4f(vt2, hpos_offset)); + brw_AND(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(~0x3f)); + + /* test nearz, xmin, ymin plane */ + /* clip.xyz < -clip.w */ + brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, negate(get_element(v0, 3))); + brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, negate(get_element(v1, 3))); + brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, negate(get_element(v2, 3))); + + /* All vertices are outside of a plane, rejected */ + brw_AND(p, 
t, t1, t2); + brw_AND(p, t, t, t3); + brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1)); + brw_OR(p, tmp0, tmp0, get_element(t, 2)); + brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + + /* some vertices are inside a plane, some are outside,need to clip */ + brw_XOR(p, t, t1, t2); + brw_XOR(p, t1, t2, t3); + brw_OR(p, t, t, t1); + brw_AND(p, t, t, brw_imm_ud(0x1)); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 0), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 1), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 2), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + + /* test farz, xmax, ymax plane */ + /* clip.xyz > clip.w */ + brw_CMP(p, t1, BRW_CONDITIONAL_G, v0, get_element(v0, 3)); + brw_CMP(p, t2, BRW_CONDITIONAL_G, v1, get_element(v1, 3)); + brw_CMP(p, t3, BRW_CONDITIONAL_G, v2, get_element(v2, 3)); + + /* All vertices are outside of a plane, rejected */ + brw_AND(p, t, t1, t2); + brw_AND(p, t, t, t3); + brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1)); + brw_OR(p, tmp0, tmp0, get_element(t, 2)); + brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p); + brw_set_default_predicate_control(p, 
BRW_PREDICATE_NONE); + + /* some vertices are inside a plane, some are outside,need to clip */ + brw_XOR(p, t, t1, t2); + brw_XOR(p, t1, t2, t3); + brw_OR(p, t, t, t1); + brw_AND(p, t, t, brw_imm_ud(0x1)); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 0), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 1), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 2), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + + release_tmps(c); +} + + +void brw_emit_tri_clip( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6); + brw_clip_tri_init_vertices(c); + brw_clip_init_clipmask(c); + brw_clip_init_ff_sync(c); + + /* if -ve rhw workaround bit is set, + do cliptest */ + if (p->devinfo->has_negative_rhw_bug) { + brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), + brw_imm_ud(1<<20)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_test(c); + } + brw_ENDIF(p); + } + /* Can't push into do_clip_tri because with polygon (or quad) + * flatshading, need to apply the flatshade here because we don't + * respect the PV when converting to trifan for emit: + */ + if (c->key.contains_flat_varying) + brw_clip_tri_flat_shade(c); + + if ((c->key.clip_mode == BRW_CLIP_MODE_NORMAL) || + (c->key.clip_mode == BRW_CLIP_MODE_KERNEL_CLIP)) + do_clip_tri(c); + else + maybe_do_clip_tri(c); + + brw_clip_tri_emit_polygon(c); + + /* Send an empty message to kill the 
thread: + */ + brw_clip_kill_thread(c); +} diff --git a/src/intel/compiler/elk/brw_clip_unfilled.c b/src/intel/compiler/elk/brw_clip_unfilled.c new file mode 100644 index 00000000000..c0e78acc7e8 --- /dev/null +++ b/src/intel/compiler/elk/brw_clip_unfilled.c @@ -0,0 +1,528 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include "brw_clip.h" +#include "brw_prim.h" + + +/* This is performed against the original triangles, so no indirection + * required: +BZZZT! 
+ */ +static void compute_tri_direction( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_reg e = c->reg.tmp0; + struct brw_reg f = c->reg.tmp1; + GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS); + struct brw_reg v0 = byte_offset(c->reg.vertex[0], hpos_offset); + struct brw_reg v1 = byte_offset(c->reg.vertex[1], hpos_offset); + struct brw_reg v2 = byte_offset(c->reg.vertex[2], hpos_offset); + + + struct brw_reg v0n = get_tmp(c); + struct brw_reg v1n = get_tmp(c); + struct brw_reg v2n = get_tmp(c); + + /* Convert to NDC. + * NOTE: We can't modify the original vertex coordinates, + * as it may impact further operations. + * So, we have to keep normalized coordinates in temp registers. + * + * TBD-KC + * Try to optimize unnecessary MOV's. + */ + brw_MOV(p, v0n, v0); + brw_MOV(p, v1n, v1); + brw_MOV(p, v2n, v2); + + brw_clip_project_position(c, v0n); + brw_clip_project_position(c, v1n); + brw_clip_project_position(c, v2n); + + /* Calculate the vectors of two edges of the triangle: + */ + brw_ADD(p, e, v0n, negate(v2n)); + brw_ADD(p, f, v1n, negate(v2n)); + + /* Take their crossproduct: + */ + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, BRW_SWIZZLE_YZXW), + brw_swizzle(f, BRW_SWIZZLE_ZXYW)); + brw_MAC(p, vec4(e), negate(brw_swizzle(e, BRW_SWIZZLE_ZXYW)), + brw_swizzle(f, BRW_SWIZZLE_YZXW)); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e)); +} + + +static void cull_direction( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + GLuint conditional; + + assert (!(c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL && + c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL)); + + if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL) + conditional = BRW_CONDITIONAL_GE; + else + conditional = BRW_CONDITIONAL_L; + + brw_CMP(p, + vec1(brw_null_reg()), + conditional, + get_element(c->reg.dir, 2), + brw_imm_f(0)); + + brw_IF(p, 
BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p); +} + + + +static void copy_bfc( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + GLuint conditional; + + /* Do we have any colors to copy? + */ + if (!(brw_clip_have_varying(c, VARYING_SLOT_COL0) && + brw_clip_have_varying(c, VARYING_SLOT_BFC0)) && + !(brw_clip_have_varying(c, VARYING_SLOT_COL1) && + brw_clip_have_varying(c, VARYING_SLOT_BFC1))) + return; + + /* In some weird degenerate cases we can end up testing the + * direction twice, once for culling and once for bfc copying. Oh + * well, that's what you get for setting weird GL state. + */ + if (c->key.copy_bfc_ccw) + conditional = BRW_CONDITIONAL_GE; + else + conditional = BRW_CONDITIONAL_L; + + brw_CMP(p, + vec1(brw_null_reg()), + conditional, + get_element(c->reg.dir, 2), + brw_imm_f(0)); + + brw_IF(p, BRW_EXECUTE_1); + { + GLuint i; + + for (i = 0; i < 3; i++) { + if (brw_clip_have_varying(c, VARYING_SLOT_COL0) && + brw_clip_have_varying(c, VARYING_SLOT_BFC0)) + brw_MOV(p, + byte_offset(c->reg.vertex[i], + brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_COL0)), + byte_offset(c->reg.vertex[i], + brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_BFC0))); + + if (brw_clip_have_varying(c, VARYING_SLOT_COL1) && + brw_clip_have_varying(c, VARYING_SLOT_BFC1)) + brw_MOV(p, + byte_offset(c->reg.vertex[i], + brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_COL1)), + byte_offset(c->reg.vertex[i], + brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_BFC1))); + } + } + brw_ENDIF(p); +} + + + + +/* + GLfloat iz = 1.0 / dir.z; + GLfloat ac = dir.x * iz; + GLfloat bc = dir.y * iz; + offset = ctx->Polygon.OffsetUnits * DEPTH_SCALE; + offset += MAX2( abs(ac), abs(bc) ) * ctx->Polygon.OffsetFactor; + if (ctx->Polygon.OffsetClamp && isfinite(ctx->Polygon.OffsetClamp)) { + if (ctx->Polygon.OffsetClamp < 0) + offset = MAX2( offset, ctx->Polygon.OffsetClamp ); + else + offset = MIN2( offset, ctx->Polygon.OffsetClamp ); + } + offset *= MRD; 
+*/ +static void compute_offset( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_reg off = c->reg.offset; + struct brw_reg dir = c->reg.dir; + + brw_math_invert(p, get_element(off, 2), get_element(dir, 2)); + brw_MUL(p, vec2(off), vec2(dir), get_element(off, 2)); + + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_GE, + brw_abs(get_element(off, 0)), + brw_abs(get_element(off, 1))); + + brw_SEL(p, vec1(off), + brw_abs(get_element(off, 0)), brw_abs(get_element(off, 1))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + + brw_MUL(p, vec1(off), vec1(off), brw_imm_f(c->key.offset_factor)); + brw_ADD(p, vec1(off), vec1(off), brw_imm_f(c->key.offset_units)); + if (c->key.offset_clamp && isfinite(c->key.offset_clamp)) { + brw_CMP(p, + vec1(brw_null_reg()), + c->key.offset_clamp < 0 ? BRW_CONDITIONAL_GE : BRW_CONDITIONAL_L, + vec1(off), + brw_imm_f(c->key.offset_clamp)); + brw_SEL(p, vec1(off), vec1(off), brw_imm_f(c->key.offset_clamp)); + } +} + + +static void merge_edgeflags( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_reg tmp0 = get_element_ud(c->reg.tmp0, 0); + + brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_EQ, + tmp0, + brw_imm_ud(_3DPRIM_POLYGON)); + + /* Get away with using reg.vertex because we know that this is not + * a _3DPRIM_TRISTRIP_REVERSE: + */ + brw_IF(p, BRW_EXECUTE_1); + { + brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<8)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_EQ); + brw_MOV(p, byte_offset(c->reg.vertex[0], + brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_EDGE)), + brw_imm_f(0)); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + + brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<9)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, 
BRW_CONDITIONAL_EQ); + brw_MOV(p, byte_offset(c->reg.vertex[2], + brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_EDGE)), + brw_imm_f(0)); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + } + brw_ENDIF(p); +} + + + +static void apply_one_offset( struct brw_clip_compile *c, + struct brw_indirect vert ) +{ + struct brw_codegen *p = &c->func; + GLuint ndc_offset = brw_varying_to_offset(&c->vue_map, + BRW_VARYING_SLOT_NDC); + struct brw_reg z = deref_1f(vert, ndc_offset + + 2 * type_sz(BRW_REGISTER_TYPE_F)); + + brw_ADD(p, z, z, vec1(c->reg.offset)); +} + + + +/*********************************************************************** + * Output clipped polygon as an unfilled primitive: + */ +static void emit_lines(struct brw_clip_compile *c, + bool do_offset) +{ + struct brw_codegen *p = &c->func; + struct brw_indirect v0 = brw_indirect(0, 0); + struct brw_indirect v1 = brw_indirect(1, 0); + struct brw_indirect v0ptr = brw_indirect(2, 0); + struct brw_indirect v1ptr = brw_indirect(3, 0); + + /* Need a separate loop for offset: + */ + if (do_offset) { + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist)); + + brw_DO(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0)); + brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2)); + + apply_one_offset(c, v0); + + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_G); + } + brw_WHILE(p); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + } + + /* v1ptr = &inlist[nr_verts] + * *v1ptr = v0 + */ + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist)); + brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v0ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW)); + brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v1ptr), retype(c->reg.nr_verts, 
BRW_REGISTER_TYPE_UW)); + brw_MOV(p, deref_1uw(v1ptr, 0), deref_1uw(v0ptr, 0)); + + brw_DO(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0)); + brw_MOV(p, get_addr_reg(v1), deref_1uw(v0ptr, 2)); + brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2)); + + /* draw edge if edgeflag != 0 */ + brw_CMP(p, + vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, + deref_1f(v0, brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_EDGE)), + brw_imm_f(0)); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE, + (_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_START); + brw_clip_emit_vue(c, v1, BRW_URB_WRITE_ALLOCATE_COMPLETE, + (_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END); + } + brw_ENDIF(p); + + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + } + brw_WHILE(p); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); +} + + + +static void emit_points(struct brw_clip_compile *c, + bool do_offset ) +{ + struct brw_codegen *p = &c->func; + + struct brw_indirect v0 = brw_indirect(0, 0); + struct brw_indirect v0ptr = brw_indirect(2, 0); + + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist)); + + brw_DO(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0)); + brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2)); + + /* draw if edgeflag != 0 + */ + brw_CMP(p, + vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, + deref_1f(v0, brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_EDGE)), + brw_imm_f(0)); + brw_IF(p, BRW_EXECUTE_1); + { + if (do_offset) + apply_one_offset(c, v0); + + brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE, + (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_START | URB_WRITE_PRIM_END); + } + brw_ENDIF(p); + + brw_ADD(p, 
c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + } + brw_WHILE(p); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); +} + + + + + + + +static void emit_primitives( struct brw_clip_compile *c, + GLuint mode, + bool do_offset ) +{ + switch (mode) { + case BRW_CLIP_FILL_MODE_FILL: + brw_clip_tri_emit_polygon(c); + break; + + case BRW_CLIP_FILL_MODE_LINE: + emit_lines(c, do_offset); + break; + + case BRW_CLIP_FILL_MODE_POINT: + emit_points(c, do_offset); + break; + + case BRW_CLIP_FILL_MODE_CULL: + unreachable("not reached"); + } +} + + + +static void emit_unfilled_primitives( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + + /* Direction culling has already been done. + */ + if (c->key.fill_ccw != c->key.fill_cw && + c->key.fill_ccw != BRW_CLIP_FILL_MODE_CULL && + c->key.fill_cw != BRW_CLIP_FILL_MODE_CULL) + { + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_GE, + get_element(c->reg.dir, 2), + brw_imm_f(0)); + + brw_IF(p, BRW_EXECUTE_1); + { + emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw); + } + brw_ELSE(p); + { + emit_primitives(c, c->key.fill_cw, c->key.offset_cw); + } + brw_ENDIF(p); + } + else if (c->key.fill_cw != BRW_CLIP_FILL_MODE_CULL) { + emit_primitives(c, c->key.fill_cw, c->key.offset_cw); + } + else if (c->key.fill_ccw != BRW_CLIP_FILL_MODE_CULL) { + emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw); + } +} + + + + +static void check_nr_verts( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.nr_verts, brw_imm_d(3)); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p); +} + + +void brw_emit_unfilled_clip( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + + c->need_direction = ((c->key.offset_ccw || c->key.offset_cw) || + (c->key.fill_ccw != c->key.fill_cw) || + c->key.fill_ccw == 
BRW_CLIP_FILL_MODE_CULL || + c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL || + c->key.copy_bfc_cw || + c->key.copy_bfc_ccw); + + brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6); + brw_clip_tri_init_vertices(c); + brw_clip_init_ff_sync(c); + + assert(brw_clip_have_varying(c, VARYING_SLOT_EDGE)); + + if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL && + c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL) { + brw_clip_kill_thread(c); + return; + } + + merge_edgeflags(c); + + /* Need to use the inlist indirection here: + */ + if (c->need_direction) + compute_tri_direction(c); + + if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL || + c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL) + cull_direction(c); + + if (c->key.offset_ccw || + c->key.offset_cw) + compute_offset(c); + + if (c->key.copy_bfc_ccw || + c->key.copy_bfc_cw) + copy_bfc(c); + + /* Need to do this whether we clip or not: + */ + if (c->key.contains_flat_varying) + brw_clip_tri_flat_shade(c); + + brw_clip_init_clipmask(c); + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0)); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_init_planes(c); + brw_clip_tri(c); + check_nr_verts(c); + } + brw_ENDIF(p); + + emit_unfilled_primitives(c); + brw_clip_kill_thread(c); +} diff --git a/src/intel/compiler/elk/brw_clip_util.c b/src/intel/compiler/elk/brw_clip_util.c new file mode 100644 index 00000000000..270a6dc3225 --- /dev/null +++ b/src/intel/compiler/elk/brw_clip_util.c @@ -0,0 +1,464 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include "brw_clip.h" + + +struct brw_reg get_tmp( struct brw_clip_compile *c ) +{ + struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0); + + if (++c->last_tmp > c->prog_data.total_grf) + c->prog_data.total_grf = c->last_tmp; + + return tmp; +} + +static void release_tmp( struct brw_clip_compile *c, struct brw_reg tmp ) +{ + if (tmp.nr == c->last_tmp-1) + c->last_tmp--; +} + + +static struct brw_reg make_plane_ud(GLuint x, GLuint y, GLuint z, GLuint w) +{ + return brw_imm_ud((w<<24) | (z<<16) | (y<<8) | x); +} + + +void brw_clip_init_planes( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + + if (!c->key.nr_userclip) { + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 0), make_plane_ud( 0, 0, 0xff, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 1), make_plane_ud( 0, 0, 1, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 2), make_plane_ud( 0, 0xff, 0, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 3), make_plane_ud( 0, 1, 0, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 4), make_plane_ud(0xff, 0, 0, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 5), make_plane_ud( 1, 0, 0, 1)); + } +} + + + +#define W 3 + +/* Project 'pos' to screen space (or back again), overwrite with results: + */ +void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos ) +{ + struct brw_codegen *p = &c->func; + + /* calc rhw + */ + brw_math_invert(p, get_element(pos, W), get_element(pos, W)); + + /* value.xyz *= value.rhw + */ + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_MUL(p, brw_writemask(pos, WRITEMASK_XYZ), pos, + brw_swizzle(pos, BRW_SWIZZLE_WWWW)); + brw_set_default_access_mode(p, BRW_ALIGN_1); +} + + +static void brw_clip_project_vertex( struct brw_clip_compile *c, + struct brw_indirect vert_addr ) +{ + struct brw_codegen *p = &c->func; + struct brw_reg tmp = get_tmp(c); + GLuint 
hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS); + GLuint ndc_offset = brw_varying_to_offset(&c->vue_map, + BRW_VARYING_SLOT_NDC); + + /* Fixup position. Extract from the original vertex and re-project + * to screen space: + */ + brw_MOV(p, tmp, deref_4f(vert_addr, hpos_offset)); + brw_clip_project_position(c, tmp); + brw_MOV(p, deref_4f(vert_addr, ndc_offset), tmp); + + release_tmp(c, tmp); +} + + + + +/* Interpolate between two vertices and put the result into a0.0. + * Increment a0.0 accordingly. + * + * Beware that dest_ptr can be equal to v0_ptr! + */ +void brw_clip_interp_vertex( struct brw_clip_compile *c, + struct brw_indirect dest_ptr, + struct brw_indirect v0_ptr, /* from */ + struct brw_indirect v1_ptr, /* to */ + struct brw_reg t0, + bool force_edgeflag) +{ + struct brw_codegen *p = &c->func; + struct brw_reg t_nopersp, v0_ndc_copy; + GLuint slot; + + /* Just copy the vertex header: + */ + /* + * After CLIP stage, only first 256 bits of the VUE are read + * back on Ironlake, so needn't change it + */ + brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1); + + + /* First handle the 3D and NDC interpolation, in case we + * need noperspective interpolation. Doing it early has no + * performance impact in any case. + */ + + /* Take a copy of the v0 NDC coordinates, in case dest == v0. 
*/ + if (c->key.contains_noperspective_varying) { + GLuint offset = brw_varying_to_offset(&c->vue_map, + BRW_VARYING_SLOT_NDC); + v0_ndc_copy = get_tmp(c); + brw_MOV(p, v0_ndc_copy, deref_4f(v0_ptr, offset)); + } + + /* Compute the new 3D position + * + * dest_hpos = v0_hpos * (1 - t0) + v1_hpos * t0 + */ + { + GLuint delta = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS); + struct brw_reg tmp = get_tmp(c); + brw_MUL(p, vec4(brw_null_reg()), deref_4f(v1_ptr, delta), t0); + brw_MAC(p, tmp, negate(deref_4f(v0_ptr, delta)), t0); + brw_ADD(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta), tmp); + release_tmp(c, tmp); + } + + /* Recreate the projected (NDC) coordinate in the new vertex header */ + brw_clip_project_vertex(c, dest_ptr); + + /* If we have noperspective attributes, + * we need to compute the screen-space t + */ + if (c->key.contains_noperspective_varying) { + GLuint delta = brw_varying_to_offset(&c->vue_map, + BRW_VARYING_SLOT_NDC); + struct brw_reg tmp = get_tmp(c); + t_nopersp = get_tmp(c); + + /* t_nopersp = vec4(v1.xy, dest.xy) */ + brw_MOV(p, t_nopersp, deref_4f(v1_ptr, delta)); + brw_MOV(p, tmp, deref_4f(dest_ptr, delta)); + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_MOV(p, + brw_writemask(t_nopersp, WRITEMASK_ZW), + brw_swizzle(tmp, BRW_SWIZZLE_XYXY)); + + /* t_nopersp = vec4(v1.xy, dest.xy) - v0.xyxy */ + brw_ADD(p, t_nopersp, t_nopersp, + negate(brw_swizzle(v0_ndc_copy, BRW_SWIZZLE_XYXY))); + + /* Add the absolute values of the X and Y deltas so that if + * the points aren't in the same place on the screen we get + * nonzero values to divide. 
+ * + * After that, we have vert1 - vert0 in t_nopersp.x and + * vertnew - vert0 in t_nopersp.y + * + * t_nopersp = vec2(|v1.x -v0.x| + |v1.y -v0.y|, + * |dest.x-v0.x| + |dest.y-v0.y|) + */ + brw_ADD(p, + brw_writemask(t_nopersp, WRITEMASK_XY), + brw_abs(brw_swizzle(t_nopersp, BRW_SWIZZLE_XZXZ)), + brw_abs(brw_swizzle(t_nopersp, BRW_SWIZZLE_YWYW))); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + /* If the points are in the same place, just substitute a + * value to avoid divide-by-zero + */ + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, + vec1(t_nopersp), + brw_imm_f(0)); + brw_IF(p, BRW_EXECUTE_1); + brw_MOV(p, t_nopersp, brw_imm_vf4(brw_float_to_vf(1.0), + brw_float_to_vf(0.0), + brw_float_to_vf(0.0), + brw_float_to_vf(0.0))); + brw_ENDIF(p); + + /* Now compute t_nopersp = t_nopersp.y/t_nopersp.x and broadcast it. */ + brw_math_invert(p, get_element(t_nopersp, 0), get_element(t_nopersp, 0)); + brw_MUL(p, vec1(t_nopersp), vec1(t_nopersp), + vec1(suboffset(t_nopersp, 1))); + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_MOV(p, t_nopersp, brw_swizzle(t_nopersp, BRW_SWIZZLE_XXXX)); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + release_tmp(c, tmp); + release_tmp(c, v0_ndc_copy); + } + + /* Now we can iterate over each attribute + * (could be done in pairs?) + */ + for (slot = 0; slot < c->vue_map.num_slots; slot++) { + int varying = c->vue_map.slot_to_varying[slot]; + GLuint delta = brw_vue_slot_to_offset(slot); + + /* HPOS, NDC already handled above */ + if (varying == VARYING_SLOT_POS || varying == BRW_VARYING_SLOT_NDC) + continue; + + + if (varying == VARYING_SLOT_EDGE) { + if (force_edgeflag) + brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1)); + else + brw_MOV(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta)); + } else if (varying == VARYING_SLOT_PSIZ) { + /* PSIZ doesn't need interpolation because it isn't used by the + * fragment shader. 
+ */ + } else if (varying < VARYING_SLOT_MAX) { + /* This is a true vertex result (and not a special value for the VUE + * header), so interpolate: + * + * New = attr0 + t*attr1 - t*attr0 + * + * Unless the attribute is flat shaded -- in which case just copy + * from one of the sources (doesn't matter which; already copied from pv) + */ + GLuint interp = c->key.interp_mode[slot]; + + if (interp != INTERP_MODE_FLAT) { + struct brw_reg tmp = get_tmp(c); + struct brw_reg t = + interp == INTERP_MODE_NOPERSPECTIVE ? t_nopersp : t0; + + brw_MUL(p, + vec4(brw_null_reg()), + deref_4f(v1_ptr, delta), + t); + + brw_MAC(p, + tmp, + negate(deref_4f(v0_ptr, delta)), + t); + + brw_ADD(p, + deref_4f(dest_ptr, delta), + deref_4f(v0_ptr, delta), + tmp); + + release_tmp(c, tmp); + } + else { + brw_MOV(p, + deref_4f(dest_ptr, delta), + deref_4f(v0_ptr, delta)); + } + } + } + + if (c->vue_map.num_slots % 2) { + GLuint delta = brw_vue_slot_to_offset(c->vue_map.num_slots); + + brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0)); + } + + if (c->key.contains_noperspective_varying) + release_tmp(c, t_nopersp); +} + +void brw_clip_emit_vue(struct brw_clip_compile *c, + struct brw_indirect vert, + enum brw_urb_write_flags flags, + GLuint header) +{ + struct brw_codegen *p = &c->func; + bool allocate = flags & BRW_URB_WRITE_ALLOCATE; + + brw_clip_ff_sync(c); + + /* Any URB entry that is allocated must subsequently be used or discarded, + * so it doesn't make sense to mark EOT and ALLOCATE at the same time. + */ + assert(!(allocate && (flags & BRW_URB_WRITE_EOT))); + + /* Copy the vertex from vertn into m1..mN+1: + */ + brw_copy_from_indirect(p, brw_message_reg(1), vert, c->nr_regs); + + /* Overwrite PrimType and PrimStart in the message header, for + * each vertex in turn: + */ + brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header)); + + + /* Send each vertex as a separate write to the urb. 
This + * is different to the concept in brw_sf_emit.c, where + * subsequent writes are used to build up a single urb + * entry. Each of these writes instantiates a separate + * urb entry - (I think... what about 'allocate'?) + */ + brw_urb_WRITE(p, + allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + 0, + c->reg.R0, + flags, + c->nr_regs + 1, /* msg length */ + allocate ? 1 : 0, /* response_length */ + 0, /* urb offset */ + BRW_URB_SWIZZLE_NONE); +} + + + +void brw_clip_kill_thread(struct brw_clip_compile *c) +{ + struct brw_codegen *p = &c->func; + + brw_clip_ff_sync(c); + /* Send an empty message to kill the thread and release any + * allocated urb entry: + */ + brw_urb_WRITE(p, + retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + 0, + c->reg.R0, + BRW_URB_WRITE_UNUSED | BRW_URB_WRITE_EOT_COMPLETE, + 1, /* msg len */ + 0, /* response len */ + 0, + BRW_URB_SWIZZLE_NONE); +} + + + + +struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c ) +{ + return brw_address(c->reg.fixed_planes); +} + + +struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c ) +{ + if (c->key.nr_userclip) { + return brw_imm_uw(16); + } + else { + return brw_imm_uw(4); + } +} + + +/* Distribute flatshaded attributes from provoking vertex prior to + * clipping. 
+ */ +void brw_clip_copy_flatshaded_attributes( struct brw_clip_compile *c, + GLuint to, GLuint from ) +{ + struct brw_codegen *p = &c->func; + + for (int i = 0; i < c->vue_map.num_slots; i++) { + if (c->key.interp_mode[i] == INTERP_MODE_FLAT) { + brw_MOV(p, + byte_offset(c->reg.vertex[to], brw_vue_slot_to_offset(i)), + byte_offset(c->reg.vertex[from], brw_vue_slot_to_offset(i))); + } + } +} + + + +void brw_clip_init_clipmask( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_reg incoming = get_element_ud(c->reg.R0, 2); + + /* Shift so that lowest outcode bit is rightmost: + */ + brw_SHR(p, c->reg.planemask, incoming, brw_imm_ud(26)); + + if (c->key.nr_userclip) { + struct brw_reg tmp = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UD); + + /* Rearrange userclip outcodes so that they come directly after + * the fixed plane bits. + */ + if (p->devinfo->ver == 5 || p->devinfo->verx10 == 45) + brw_AND(p, tmp, incoming, brw_imm_ud(0xff<<14)); + else + brw_AND(p, tmp, incoming, brw_imm_ud(0x3f<<14)); + + brw_SHR(p, tmp, tmp, brw_imm_ud(8)); + brw_OR(p, c->reg.planemask, c->reg.planemask, tmp); + + release_tmp(c, tmp); + } +} + +void brw_clip_ff_sync(struct brw_clip_compile *c) +{ + struct brw_codegen *p = &c->func; + + if (p->devinfo->ver == 5) { + brw_AND(p, brw_null_reg(), c->reg.ff_sync, brw_imm_ud(0x1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z); + brw_IF(p, BRW_EXECUTE_1); + { + brw_OR(p, c->reg.ff_sync, c->reg.ff_sync, brw_imm_ud(0x1)); + brw_ff_sync(p, + c->reg.R0, + 0, + c->reg.R0, + 1, /* allocate */ + 1, /* response length */ + 0 /* eot */); + } + brw_ENDIF(p); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + } +} + +void brw_clip_init_ff_sync(struct brw_clip_compile *c) +{ + struct brw_codegen *p = &c->func; + + if (p->devinfo->ver == 5) { + brw_MOV(p, c->reg.ff_sync, brw_imm_ud(0)); + } +} diff --git a/src/intel/compiler/elk/brw_compile_clip.c 
b/src/intel/compiler/elk/brw_compile_clip.c new file mode 100644 index 00000000000..25f476d4066 --- /dev/null +++ b/src/intel/compiler/elk/brw_compile_clip.c @@ -0,0 +1,97 @@ +/* + * Copyright © 2006 - 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_clip.h" +#include "brw_disasm.h" + +#include "dev/intel_debug.h" + +const unsigned * +brw_compile_clip(const struct brw_compiler *compiler, + void *mem_ctx, + const struct brw_clip_prog_key *key, + struct brw_clip_prog_data *prog_data, + struct intel_vue_map *vue_map, + unsigned *final_assembly_size) +{ + struct brw_clip_compile c; + memset(&c, 0, sizeof(c)); + + /* Begin the compilation: + */ + brw_init_codegen(&compiler->isa, &c.func, mem_ctx); + + c.func.single_program_flow = 1; + + c.key = *key; + c.vue_map = *vue_map; + + /* nr_regs is the number of registers filled by reading data from the VUE. 
+ * This program accesses the entire VUE, so nr_regs needs to be the size of + * the VUE (measured in pairs, since two slots are stored in each + * register). + */ + c.nr_regs = (c.vue_map.num_slots + 1)/2; + + c.prog_data.clip_mode = c.key.clip_mode; /* XXX */ + + /* For some reason the thread is spawned with only 4 channels + * unmasked. + */ + brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE); + + /* Would ideally have the option of producing a program which could + * do all three: + */ + switch (key->primitive) { + case MESA_PRIM_TRIANGLES: + if (key->do_unfilled) + brw_emit_unfilled_clip( &c ); + else + brw_emit_tri_clip( &c ); + break; + case MESA_PRIM_LINES: + brw_emit_line_clip( &c ); + break; + case MESA_PRIM_POINTS: + brw_emit_point_clip( &c ); + break; + default: + unreachable("not reached"); + } + + brw_compact_instructions(&c.func, 0, NULL); + + *prog_data = c.prog_data; + + const unsigned *program = brw_get_program(&c.func, final_assembly_size); + + if (INTEL_DEBUG(DEBUG_CLIP)) { + fprintf(stderr, "clip:\n"); + brw_disassemble_with_labels(&compiler->isa, + program, 0, *final_assembly_size, stderr); + fprintf(stderr, "\n"); + } + + return program; +} diff --git a/src/intel/compiler/elk/brw_compile_ff_gs.c b/src/intel/compiler/elk/brw_compile_ff_gs.c new file mode 100644 index 00000000000..200a1dd0415 --- /dev/null +++ b/src/intel/compiler/elk/brw_compile_ff_gs.c @@ -0,0 +1,662 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include "brw_compiler.h" +#include "brw_disasm.h" +#include "brw_eu.h" +#include "brw_prim.h" + +#include "dev/intel_debug.h" + +#define MAX_GS_VERTS (4) + +struct brw_ff_gs_compile { + struct brw_codegen func; + struct brw_ff_gs_prog_key key; + struct brw_ff_gs_prog_data *prog_data; + + struct { + struct brw_reg R0; + + /** + * Register holding streamed vertex buffer pointers -- see the Sandy + * Bridge PRM, volume 2 part 1, section 4.4.2 (GS Thread Payload + * [DevSNB]). These pointers are delivered in GRF 1. + */ + struct brw_reg SVBI; + + struct brw_reg vertex[MAX_GS_VERTS]; + struct brw_reg header; + struct brw_reg temp; + + /** + * Register holding destination indices for streamed buffer writes. + * Only used for SOL programs. 
+ */ + struct brw_reg destination_indices; + } reg; + + /* Number of registers used to store vertex data */ + GLuint nr_regs; + + struct intel_vue_map vue_map; +}; + +/** + * Allocate registers for GS. + * + * If sol_program is true, then: + * + * - The thread will be spawned with the "SVBI Payload Enable" bit set, so GRF + * 1 needs to be set aside to hold the streamed vertex buffer indices. + * + * - The thread will need to use the destination_indices register. + */ +static void brw_ff_gs_alloc_regs(struct brw_ff_gs_compile *c, + GLuint nr_verts, + bool sol_program) +{ + GLuint i = 0,j; + + /* Register usage is static, precompute here: + */ + c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; + + /* Streamed vertex buffer indices */ + if (sol_program) + c->reg.SVBI = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD); + + /* Payload vertices plus space for more generated vertices: + */ + for (j = 0; j < nr_verts; j++) { + c->reg.vertex[j] = brw_vec4_grf(i, 0); + i += c->nr_regs; + } + + c->reg.header = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD); + c->reg.temp = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD); + + if (sol_program) { + c->reg.destination_indices = + retype(brw_vec4_grf(i++, 0), BRW_REGISTER_TYPE_UD); + } + + c->prog_data->urb_read_length = c->nr_regs; + c->prog_data->total_grf = i; +} + + +/** + * Set up the initial value of c->reg.header register based on c->reg.R0. + * + * The following information is passed to the GS thread in R0, and needs to be + * included in the first URB_WRITE or FF_SYNC message sent by the GS: + * + * - DWORD 0 [31:0] handle info (Gen4 only) + * - DWORD 5 [7:0] FFTID + * - DWORD 6 [31:0] Debug info + * - DWORD 7 [31:0] Debug info + * + * This function sets up the above data by copying by copying the contents of + * R0 to the header register. 
+ */ +static void brw_ff_gs_initialize_header(struct brw_ff_gs_compile *c) +{ + struct brw_codegen *p = &c->func; + brw_MOV(p, c->reg.header, c->reg.R0); +} + +/** + * Overwrite DWORD 2 of c->reg.header with the given immediate unsigned value. + * + * In URB_WRITE messages, DWORD 2 contains the fields PrimType, PrimStart, + * PrimEnd, Increment CL_INVOCATIONS, and SONumPrimsWritten, many of which we + * need to be able to update on a per-vertex basis. + */ +static void brw_ff_gs_overwrite_header_dw2(struct brw_ff_gs_compile *c, + unsigned dw2) +{ + struct brw_codegen *p = &c->func; + brw_MOV(p, get_element_ud(c->reg.header, 2), brw_imm_ud(dw2)); +} + +/** + * Overwrite DWORD 2 of c->reg.header with the primitive type from c->reg.R0. + * + * When the thread is spawned, GRF 0 contains the primitive type in bits 4:0 + * of DWORD 2. URB_WRITE messages need the primitive type in bits 6:2 of + * DWORD 2. So this function extracts the primitive type field, bitshifts it + * appropriately, and stores it in c->reg.header. + */ +static void brw_ff_gs_overwrite_header_dw2_from_r0(struct brw_ff_gs_compile *c) +{ + struct brw_codegen *p = &c->func; + brw_AND(p, get_element_ud(c->reg.header, 2), get_element_ud(c->reg.R0, 2), + brw_imm_ud(0x1f)); + brw_SHL(p, get_element_ud(c->reg.header, 2), + get_element_ud(c->reg.header, 2), brw_imm_ud(2)); +} + +/** + * Apply an additive offset to DWORD 2 of c->reg.header. + * + * This is used to set/unset the "PrimStart" and "PrimEnd" flags appropriately + * for each vertex. + */ +static void brw_ff_gs_offset_header_dw2(struct brw_ff_gs_compile *c, + int offset) +{ + struct brw_codegen *p = &c->func; + brw_ADD(p, get_element_d(c->reg.header, 2), get_element_d(c->reg.header, 2), + brw_imm_d(offset)); +} + + +/** + * Emit a vertex using the URB_WRITE message. Use the contents of + * c->reg.header for the message header, and the registers starting at \c vert + * for the vertex data. 
+ * + * If \c last is true, then this is the last vertex, so no further URB space + * should be allocated, and this message should end the thread. + * + * If \c last is false, then a new URB entry will be allocated, and its handle + * will be stored in DWORD 0 of c->reg.header for use in the next URB_WRITE + * message. + */ +static void brw_ff_gs_emit_vue(struct brw_ff_gs_compile *c, + struct brw_reg vert, + bool last) +{ + struct brw_codegen *p = &c->func; + int write_offset = 0; + bool complete = false; + + do { + /* We can't write more than 14 registers at a time to the URB */ + int write_len = MIN2(c->nr_regs - write_offset, 14); + if (write_len == c->nr_regs - write_offset) + complete = true; + + /* Copy the vertex from vertn into m1..mN+1: + */ + brw_copy8(p, brw_message_reg(1), offset(vert, write_offset), write_len); + + /* Send the vertex data to the URB. If this is the last write for this + * vertex, then we mark it as complete, and either end the thread or + * allocate another vertex URB entry (depending whether this is the last + * vertex). + */ + enum brw_urb_write_flags flags; + if (!complete) + flags = BRW_URB_WRITE_NO_FLAGS; + else if (last) + flags = BRW_URB_WRITE_EOT_COMPLETE; + else + flags = BRW_URB_WRITE_ALLOCATE_COMPLETE; + brw_urb_WRITE(p, + (flags & BRW_URB_WRITE_ALLOCATE) ? c->reg.temp + : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + 0, + c->reg.header, + flags, + write_len + 1, /* msg length */ + (flags & BRW_URB_WRITE_ALLOCATE) ? 1 + : 0, /* response length */ + write_offset, /* urb offset */ + BRW_URB_SWIZZLE_NONE); + write_offset += write_len; + } while (!complete); + + if (!last) { + brw_MOV(p, get_element_ud(c->reg.header, 0), + get_element_ud(c->reg.temp, 0)); + } +} + +/** + * Send an FF_SYNC message to ensure that all previously spawned GS threads + * have finished sending primitives down the pipeline, and to allocate a URB + * entry for the first output vertex. Only needed on Ironlake+. 
+ * + * This function modifies c->reg.header: in DWORD 1, it stores num_prim (which + * is needed by the FF_SYNC message), and in DWORD 0, it stores the handle to + * the allocated URB entry (which will be needed by the URB_WRITE meesage that + * follows). + */ +static void brw_ff_gs_ff_sync(struct brw_ff_gs_compile *c, int num_prim) +{ + struct brw_codegen *p = &c->func; + + brw_MOV(p, get_element_ud(c->reg.header, 1), brw_imm_ud(num_prim)); + brw_ff_sync(p, + c->reg.temp, + 0, + c->reg.header, + 1, /* allocate */ + 1, /* response length */ + 0 /* eot */); + brw_MOV(p, get_element_ud(c->reg.header, 0), + get_element_ud(c->reg.temp, 0)); +} + + +static void +brw_ff_gs_quads(struct brw_ff_gs_compile *c, + const struct brw_ff_gs_prog_key *key) +{ + brw_ff_gs_alloc_regs(c, 4, false); + brw_ff_gs_initialize_header(c); + /* Use polygons for correct edgeflag behaviour. Note that vertex 3 + * is the PV for quads, but vertex 0 for polygons: + */ + if (c->func.devinfo->ver == 5) + brw_ff_gs_ff_sync(c, 1); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_START)); + if (key->pv_first) { + brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0); + brw_ff_gs_overwrite_header_dw2( + c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT); + brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0); + brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END)); + brw_ff_gs_emit_vue(c, c->reg.vertex[3], 1); + } + else { + brw_ff_gs_emit_vue(c, c->reg.vertex[3], 0); + brw_ff_gs_overwrite_header_dw2( + c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT); + brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0); + brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END)); + brw_ff_gs_emit_vue(c, c->reg.vertex[2], 1); + } +} + +static void +brw_ff_gs_quad_strip(struct brw_ff_gs_compile 
*c, + const struct brw_ff_gs_prog_key *key) +{ + brw_ff_gs_alloc_regs(c, 4, false); + brw_ff_gs_initialize_header(c); + + if (c->func.devinfo->ver == 5) + brw_ff_gs_ff_sync(c, 1); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_START)); + if (key->pv_first) { + brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0); + brw_ff_gs_overwrite_header_dw2( + c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT); + brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0); + brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END)); + brw_ff_gs_emit_vue(c, c->reg.vertex[3], 1); + } + else { + brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0); + brw_ff_gs_overwrite_header_dw2( + c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT); + brw_ff_gs_emit_vue(c, c->reg.vertex[3], 0); + brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END)); + brw_ff_gs_emit_vue(c, c->reg.vertex[1], 1); + } +} + +static void brw_ff_gs_lines(struct brw_ff_gs_compile *c) +{ + brw_ff_gs_alloc_regs(c, 2, false); + brw_ff_gs_initialize_header(c); + + if (c->func.devinfo->ver == 5) + brw_ff_gs_ff_sync(c, 1); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_START)); + brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END)); + brw_ff_gs_emit_vue(c, c->reg.vertex[1], 1); +} + +/** + * Generate the geometry shader program used on Gen6 to perform stream output + * (transform feedback). 
+ */ +static void +gfx6_sol_program(struct brw_ff_gs_compile *c, const struct brw_ff_gs_prog_key *key, + unsigned num_verts, bool check_edge_flags) +{ + struct brw_codegen *p = &c->func; + brw_inst *inst; + c->prog_data->svbi_postincrement_value = num_verts; + + brw_ff_gs_alloc_regs(c, num_verts, true); + brw_ff_gs_initialize_header(c); + + if (key->num_transform_feedback_bindings > 0) { + unsigned vertex, binding; + struct brw_reg destination_indices_uw = + vec8(retype(c->reg.destination_indices, BRW_REGISTER_TYPE_UW)); + + /* Note: since we use the binding table to keep track of buffer offsets + * and stride, the GS doesn't need to keep track of a separate pointer + * into each buffer; it uses a single pointer which increments by 1 for + * each vertex. So we use SVBI0 for this pointer, regardless of whether + * transform feedback is in interleaved or separate attribs mode. + * + * Make sure that the buffers have enough room for all the vertices. + */ + brw_ADD(p, get_element_ud(c->reg.temp, 0), + get_element_ud(c->reg.SVBI, 0), brw_imm_ud(num_verts)); + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, + get_element_ud(c->reg.temp, 0), + get_element_ud(c->reg.SVBI, 4)); + brw_IF(p, BRW_EXECUTE_1); + + /* Compute the destination indices to write to. Usually we use SVBI[0] + * + (0, 1, 2). However, for odd-numbered triangles in tristrips, the + * vertices come down the pipeline in reversed winding order, so we need + * to flip the order when writing to the transform feedback buffer. To + * ensure that flatshading accuracy is preserved, we need to write them + * in order SVBI[0] + (0, 2, 1) if we're using the first provoking + * vertex convention, and in order SVBI[0] + (1, 0, 2) if we're using + * the last provoking vertex convention. 
+ * + * Note: since brw_imm_v can only be used in instructions in + * packed-word execution mode, and SVBI is a double-word, we need to + * first move the appropriate immediate constant ((0, 1, 2), (0, 2, 1), + * or (1, 0, 2)) to the destination_indices register, and then add SVBI + * using a separate instruction. Also, since the immediate constant is + * expressed as packed words, and we need to load double-words into + * destination_indices, we need to intersperse zeros to fill the upper + * halves of each double-word. + */ + brw_MOV(p, destination_indices_uw, + brw_imm_v(0x00020100)); /* (0, 1, 2) */ + if (num_verts == 3) { + /* Get primitive type into temp register. */ + brw_AND(p, get_element_ud(c->reg.temp, 0), + get_element_ud(c->reg.R0, 2), brw_imm_ud(0x1f)); + + /* Test if primitive type is TRISTRIP_REVERSE. We need to do this as + * an 8-wide comparison so that the conditional MOV that follows + * moves all 8 words correctly. + */ + brw_CMP(p, vec8(brw_null_reg()), BRW_CONDITIONAL_EQ, + get_element_ud(c->reg.temp, 0), + brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE)); + + /* If so, then overwrite destination_indices_uw with the appropriate + * reordering. + */ + inst = brw_MOV(p, destination_indices_uw, + brw_imm_v(key->pv_first ? 0x00010200 /* (0, 2, 1) */ + : 0x00020001)); /* (1, 0, 2) */ + brw_inst_set_pred_control(p->devinfo, inst, BRW_PREDICATE_NORMAL); + } + + assert(c->reg.destination_indices.width == BRW_EXECUTE_4); + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_4); + brw_ADD(p, c->reg.destination_indices, + c->reg.destination_indices, get_element_ud(c->reg.SVBI, 0)); + brw_pop_insn_state(p); + /* For each vertex, generate code to output each varying using the + * appropriate binding table entry. 
+ */ + for (vertex = 0; vertex < num_verts; ++vertex) { + /* Set up the correct destination index for this vertex */ + brw_MOV(p, get_element_ud(c->reg.header, 5), + get_element_ud(c->reg.destination_indices, vertex)); + + for (binding = 0; binding < key->num_transform_feedback_bindings; + ++binding) { + unsigned char varying = + key->transform_feedback_bindings[binding]; + unsigned char slot = c->vue_map.varying_to_slot[varying]; + /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1: + * + * "Prior to End of Thread with a URB_WRITE, the kernel must + * ensure that all writes are complete by sending the final + * write as a committed write." + */ + bool final_write = + binding == key->num_transform_feedback_bindings - 1 && + vertex == num_verts - 1; + struct brw_reg vertex_slot = c->reg.vertex[vertex]; + vertex_slot.nr += slot / 2; + vertex_slot.subnr = (slot % 2) * 16; + /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w. */ + vertex_slot.swizzle = varying == VARYING_SLOT_PSIZ + ? BRW_SWIZZLE_WWWW : key->transform_feedback_swizzles[binding]; + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_4); + + brw_MOV(p, stride(c->reg.header, 4, 4, 1), + retype(vertex_slot, BRW_REGISTER_TYPE_UD)); + brw_pop_insn_state(p); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_svb_write(p, + final_write ? c->reg.temp : brw_null_reg(), /* dest */ + 1, /* msg_reg_nr */ + c->reg.header, /* src0 */ + BRW_GFX6_SOL_BINDING_START + binding, /* binding_table_index */ + final_write); /* send_commit_msg */ + } + } + brw_ENDIF(p); + + /* Now, reinitialize the header register from R0 to restore the parts of + * the register that we overwrote while streaming out transform feedback + * data. + */ + brw_ff_gs_initialize_header(c); + + /* Finally, wait for the write commit to occur so that we can proceed to + * other things safely. 
+ * + * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3: + * + * The write commit does not modify the destination register, but + * merely clears the dependency associated with the destination + * register. Thus, a simple “mov” instruction using the register as a + * source is sufficient to wait for the write commit to occur. + */ + brw_MOV(p, c->reg.temp, c->reg.temp); + } + + brw_ff_gs_ff_sync(c, 1); + + brw_ff_gs_overwrite_header_dw2_from_r0(c); + switch (num_verts) { + case 1: + brw_ff_gs_offset_header_dw2(c, + URB_WRITE_PRIM_START | URB_WRITE_PRIM_END); + brw_ff_gs_emit_vue(c, c->reg.vertex[0], true); + break; + case 2: + brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START); + brw_ff_gs_emit_vue(c, c->reg.vertex[0], false); + brw_ff_gs_offset_header_dw2(c, + URB_WRITE_PRIM_END - URB_WRITE_PRIM_START); + brw_ff_gs_emit_vue(c, c->reg.vertex[1], true); + break; + case 3: + if (check_edge_flags) { + /* Only emit vertices 0 and 1 if this is the first triangle of the + * polygon. Otherwise they are redundant. + */ + brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + get_element_ud(c->reg.R0, 2), + brw_imm_ud(BRW_GS_EDGE_INDICATOR_0)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_IF(p, BRW_EXECUTE_1); + } + brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START); + brw_ff_gs_emit_vue(c, c->reg.vertex[0], false); + brw_ff_gs_offset_header_dw2(c, -URB_WRITE_PRIM_START); + brw_ff_gs_emit_vue(c, c->reg.vertex[1], false); + if (check_edge_flags) { + brw_ENDIF(p); + /* Only emit vertex 2 in PRIM_END mode if this is the last triangle + * of the polygon. Otherwise leave the primitive incomplete because + * there are more polygon vertices coming. 
+ */ + brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + get_element_ud(c->reg.R0, 2), + brw_imm_ud(BRW_GS_EDGE_INDICATOR_1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL); + } + brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_END); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_ff_gs_emit_vue(c, c->reg.vertex[2], true); + break; + } +} + +const unsigned * +brw_compile_ff_gs_prog(struct brw_compiler *compiler, + void *mem_ctx, + const struct brw_ff_gs_prog_key *key, + struct brw_ff_gs_prog_data *prog_data, + struct intel_vue_map *vue_map, + unsigned *final_assembly_size) +{ + struct brw_ff_gs_compile c; + const GLuint *program; + + memset(&c, 0, sizeof(c)); + + c.key = *key; + c.vue_map = *vue_map; + c.nr_regs = (c.vue_map.num_slots + 1)/2; + c.prog_data = prog_data; + + mem_ctx = ralloc_context(NULL); + + /* Begin the compilation: + */ + brw_init_codegen(&compiler->isa, &c.func, mem_ctx); + + c.func.single_program_flow = 1; + + /* For some reason the thread is spawned with only 4 channels + * unmasked. + */ + brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE); + + if (compiler->devinfo->ver >= 6) { + unsigned num_verts; + bool check_edge_flag; + /* On Sandybridge, we use the GS for implementing transform feedback + * (called "Stream Out" in the PRM). 
+ */ + switch (key->primitive) { + case _3DPRIM_POINTLIST: + num_verts = 1; + check_edge_flag = false; + break; + case _3DPRIM_LINELIST: + case _3DPRIM_LINESTRIP: + case _3DPRIM_LINELOOP: + num_verts = 2; + check_edge_flag = false; + break; + case _3DPRIM_TRILIST: + case _3DPRIM_TRIFAN: + case _3DPRIM_TRISTRIP: + case _3DPRIM_RECTLIST: + num_verts = 3; + check_edge_flag = false; + break; + case _3DPRIM_QUADLIST: + case _3DPRIM_QUADSTRIP: + case _3DPRIM_POLYGON: + num_verts = 3; + check_edge_flag = true; + break; + default: + unreachable("Unexpected primitive type in Gen6 SOL program."); + } + gfx6_sol_program(&c, key, num_verts, check_edge_flag); + } else { + /* On Gen4-5, we use the GS to decompose certain types of primitives. + * Note that primitives which don't require a GS program have already + * been weeded out by now. + */ + switch (key->primitive) { + case _3DPRIM_QUADLIST: + brw_ff_gs_quads( &c, key ); + break; + case _3DPRIM_QUADSTRIP: + brw_ff_gs_quad_strip( &c, key ); + break; + case _3DPRIM_LINELOOP: + brw_ff_gs_lines( &c ); + break; + default: + return NULL; + } + } + + brw_compact_instructions(&c.func, 0, NULL); + + /* get the program + */ + program = brw_get_program(&c.func, final_assembly_size); + + if (INTEL_DEBUG(DEBUG_GS)) { + fprintf(stderr, "gs:\n"); + brw_disassemble_with_labels(&compiler->isa, c.func.store, + 0, *final_assembly_size, stderr); + fprintf(stderr, "\n"); + } + + return program; +} + diff --git a/src/intel/compiler/elk/brw_compile_sf.c b/src/intel/compiler/elk/brw_compile_sf.c new file mode 100644 index 00000000000..f9f23e3d2c9 --- /dev/null +++ b/src/intel/compiler/elk/brw_compile_sf.c @@ -0,0 +1,881 @@ +/* + * Copyright © 2006 - 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, 
publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_compiler.h" +#include "brw_disasm.h" +#include "brw_eu.h" +#include "brw_prim.h" + +#include "dev/intel_debug.h" + +struct brw_sf_compile { + struct brw_codegen func; + struct brw_sf_prog_key key; + struct brw_sf_prog_data prog_data; + + struct brw_reg pv; + struct brw_reg det; + struct brw_reg dx0; + struct brw_reg dx2; + struct brw_reg dy0; + struct brw_reg dy2; + + /* z and 1/w passed in separately: + */ + struct brw_reg z[3]; + struct brw_reg inv_w[3]; + + /* The vertices: + */ + struct brw_reg vert[3]; + + /* Temporaries, allocated after last vertex reg. + */ + struct brw_reg inv_det; + struct brw_reg a1_sub_a0; + struct brw_reg a2_sub_a0; + struct brw_reg tmp; + + struct brw_reg m1Cx; + struct brw_reg m2Cy; + struct brw_reg m3C0; + + GLuint nr_verts; + GLuint nr_attr_regs; + GLuint nr_setup_regs; + int urb_entry_read_offset; + + /** The last known value of the f0.0 flag register. */ + unsigned flag_value; + + struct intel_vue_map vue_map; +}; + +/** + * Determine the vue slot corresponding to the given half of the given register. 
+ */ +static inline int vert_reg_to_vue_slot(struct brw_sf_compile *c, GLuint reg, + int half) +{ + return (reg + c->urb_entry_read_offset) * 2 + half; +} + +/** + * Determine the varying corresponding to the given half of the given + * register. half=0 means the first half of a register, half=1 means the + * second half. + */ +static inline int vert_reg_to_varying(struct brw_sf_compile *c, GLuint reg, + int half) +{ + int vue_slot = vert_reg_to_vue_slot(c, reg, half); + return c->vue_map.slot_to_varying[vue_slot]; +} + +/** + * Determine the register corresponding to the given vue slot + */ +static struct brw_reg get_vue_slot(struct brw_sf_compile *c, + struct brw_reg vert, + int vue_slot) +{ + GLuint off = vue_slot / 2 - c->urb_entry_read_offset; + GLuint sub = vue_slot % 2; + + return brw_vec4_grf(vert.nr + off, sub * 4); +} + +/** + * Determine the register corresponding to the given varying. + */ +static struct brw_reg get_varying(struct brw_sf_compile *c, + struct brw_reg vert, + GLuint varying) +{ + int vue_slot = c->vue_map.varying_to_slot[varying]; + assert (vue_slot >= c->urb_entry_read_offset); + return get_vue_slot(c, vert, vue_slot); +} + +static bool +have_attr(struct brw_sf_compile *c, GLuint attr) +{ + return (c->key.attrs & BITFIELD64_BIT(attr)) ? 1 : 0; +} + +/*********************************************************************** + * Twoside lighting + */ +static void copy_bfc( struct brw_sf_compile *c, + struct brw_reg vert ) +{ + struct brw_codegen *p = &c->func; + GLuint i; + + for (i = 0; i < 2; i++) { + if (have_attr(c, VARYING_SLOT_COL0+i) && + have_attr(c, VARYING_SLOT_BFC0+i)) + brw_MOV(p, + get_varying(c, vert, VARYING_SLOT_COL0+i), + get_varying(c, vert, VARYING_SLOT_BFC0+i)); + } +} + + +static void do_twoside_color( struct brw_sf_compile *c ) +{ + struct brw_codegen *p = &c->func; + GLuint backface_conditional = c->key.frontface_ccw ? 
BRW_CONDITIONAL_G : BRW_CONDITIONAL_L; + + /* Already done in clip program: + */ + if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS) + return; + + /* If the vertex shader provides backface color, do the selection. The VS + * promises to set up the front color if the backface color is provided, but + * it may contain junk if never written to. + */ + if (!(have_attr(c, VARYING_SLOT_COL0) && have_attr(c, VARYING_SLOT_BFC0)) && + !(have_attr(c, VARYING_SLOT_COL1) && have_attr(c, VARYING_SLOT_BFC1))) + return; + + /* Need to use BRW_EXECUTE_4 and also do an 4-wide compare in order + * to get all channels active inside the IF. In the clipping code + * we run with NoMask, so it's not an option and we can use + * BRW_EXECUTE_1 for all comparisons. + */ + brw_CMP(p, vec4(brw_null_reg()), backface_conditional, c->det, brw_imm_f(0)); + brw_IF(p, BRW_EXECUTE_4); + { + switch (c->nr_verts) { + case 3: copy_bfc(c, c->vert[2]); FALLTHROUGH; + case 2: copy_bfc(c, c->vert[1]); FALLTHROUGH; + case 1: copy_bfc(c, c->vert[0]); + } + } + brw_ENDIF(p); +} + + + +/*********************************************************************** + * Flat shading + */ + +static void copy_flatshaded_attributes(struct brw_sf_compile *c, + struct brw_reg dst, + struct brw_reg src) +{ + struct brw_codegen *p = &c->func; + int i; + + for (i = 0; i < c->vue_map.num_slots; i++) { + if (c->key.interp_mode[i] == INTERP_MODE_FLAT) { + brw_MOV(p, + get_vue_slot(c, dst, i), + get_vue_slot(c, src, i)); + } + } +} + +static int count_flatshaded_attributes(struct brw_sf_compile *c) +{ + int i; + int count = 0; + + for (i = 0; i < c->vue_map.num_slots; i++) + if (c->key.interp_mode[i] == INTERP_MODE_FLAT) + count++; + + return count; +} + + + +/* Need to use a computed jump to copy flatshaded attributes as the + * vertices are ordered according to y-coordinate before reaching this + * point, so the PV could be anywhere. 
+ */ +static void do_flatshade_triangle( struct brw_sf_compile *c ) +{ + struct brw_codegen *p = &c->func; + GLuint nr; + GLuint jmpi = 1; + + /* Already done in clip program: + */ + if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS) + return; + + if (p->devinfo->ver == 5) + jmpi = 2; + + nr = count_flatshaded_attributes(c); + + brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr*2+1))); + brw_JMPI(p, c->pv, BRW_PREDICATE_NONE); + + copy_flatshaded_attributes(c, c->vert[1], c->vert[0]); + copy_flatshaded_attributes(c, c->vert[2], c->vert[0]); + brw_JMPI(p, brw_imm_d(jmpi*(nr*4+1)), BRW_PREDICATE_NONE); + + copy_flatshaded_attributes(c, c->vert[0], c->vert[1]); + copy_flatshaded_attributes(c, c->vert[2], c->vert[1]); + brw_JMPI(p, brw_imm_d(jmpi*nr*2), BRW_PREDICATE_NONE); + + copy_flatshaded_attributes(c, c->vert[0], c->vert[2]); + copy_flatshaded_attributes(c, c->vert[1], c->vert[2]); +} + + +static void do_flatshade_line( struct brw_sf_compile *c ) +{ + struct brw_codegen *p = &c->func; + GLuint nr; + GLuint jmpi = 1; + + /* Already done in clip program: + */ + if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS) + return; + + if (p->devinfo->ver == 5) + jmpi = 2; + + nr = count_flatshaded_attributes(c); + + brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr+1))); + brw_JMPI(p, c->pv, BRW_PREDICATE_NONE); + copy_flatshaded_attributes(c, c->vert[1], c->vert[0]); + + brw_JMPI(p, brw_imm_ud(jmpi*nr), BRW_PREDICATE_NONE); + copy_flatshaded_attributes(c, c->vert[0], c->vert[1]); +} + + +/*********************************************************************** + * Triangle setup. 
+ */ + + +static void alloc_regs( struct brw_sf_compile *c ) +{ + GLuint reg, i; + + /* Values computed by fixed function unit: + */ + c->pv = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_D); + c->det = brw_vec1_grf(1, 2); + c->dx0 = brw_vec1_grf(1, 3); + c->dx2 = brw_vec1_grf(1, 4); + c->dy0 = brw_vec1_grf(1, 5); + c->dy2 = brw_vec1_grf(1, 6); + + /* z and 1/w passed in separately: + */ + c->z[0] = brw_vec1_grf(2, 0); + c->inv_w[0] = brw_vec1_grf(2, 1); + c->z[1] = brw_vec1_grf(2, 2); + c->inv_w[1] = brw_vec1_grf(2, 3); + c->z[2] = brw_vec1_grf(2, 4); + c->inv_w[2] = brw_vec1_grf(2, 5); + + /* The vertices: + */ + reg = 3; + for (i = 0; i < c->nr_verts; i++) { + c->vert[i] = brw_vec8_grf(reg, 0); + reg += c->nr_attr_regs; + } + + /* Temporaries, allocated after last vertex reg. + */ + c->inv_det = brw_vec1_grf(reg, 0); reg++; + c->a1_sub_a0 = brw_vec8_grf(reg, 0); reg++; + c->a2_sub_a0 = brw_vec8_grf(reg, 0); reg++; + c->tmp = brw_vec8_grf(reg, 0); reg++; + + /* Note grf allocation: + */ + c->prog_data.total_grf = reg; + + + /* Outputs of this program - interpolation coefficients for + * rasterization: + */ + c->m1Cx = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 1, 0); + c->m2Cy = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 2, 0); + c->m3C0 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 3, 0); +} + + +static void copy_z_inv_w( struct brw_sf_compile *c ) +{ + struct brw_codegen *p = &c->func; + GLuint i; + + /* Copy both scalars with a single MOV: + */ + for (i = 0; i < c->nr_verts; i++) + brw_MOV(p, vec2(suboffset(c->vert[i], 2)), vec2(c->z[i])); +} + + +static void invert_det( struct brw_sf_compile *c) +{ + /* Looks like we invert all 8 elements just to get 1/det in + * position 2 !?! 
+ */ + gfx4_math(&c->func, + c->inv_det, + BRW_MATH_FUNCTION_INV, + 0, + c->det, + BRW_MATH_PRECISION_FULL); + +} + + +static bool +calculate_masks(struct brw_sf_compile *c, + GLuint reg, + GLushort *pc, + GLushort *pc_persp, + GLushort *pc_linear) +{ + bool is_last_attr = (reg == c->nr_setup_regs - 1); + enum glsl_interp_mode interp; + + *pc_persp = 0; + *pc_linear = 0; + *pc = 0xf; + + interp = c->key.interp_mode[vert_reg_to_vue_slot(c, reg, 0)]; + if (interp == INTERP_MODE_SMOOTH) { + *pc_linear = 0xf; + *pc_persp = 0xf; + } else if (interp == INTERP_MODE_NOPERSPECTIVE) + *pc_linear = 0xf; + + /* Maybe only process one attribute on the final round: + */ + if (vert_reg_to_varying(c, reg, 1) != BRW_VARYING_SLOT_COUNT) { + *pc |= 0xf0; + + interp = c->key.interp_mode[vert_reg_to_vue_slot(c, reg, 1)]; + if (interp == INTERP_MODE_SMOOTH) { + *pc_linear |= 0xf0; + *pc_persp |= 0xf0; + } else if (interp == INTERP_MODE_NOPERSPECTIVE) + *pc_linear |= 0xf0; + } + + return is_last_attr; +} + +/* Calculates the predicate control for which channels of a reg + * (containing 2 attrs) to do point sprite coordinate replacement on. 
+ */ +static uint16_t +calculate_point_sprite_mask(struct brw_sf_compile *c, GLuint reg) +{ + int varying1, varying2; + uint16_t pc = 0; + + varying1 = vert_reg_to_varying(c, reg, 0); + if (varying1 >= VARYING_SLOT_TEX0 && varying1 <= VARYING_SLOT_TEX7) { + if (c->key.point_sprite_coord_replace & (1 << (varying1 - VARYING_SLOT_TEX0))) + pc |= 0x0f; + } + if (varying1 == BRW_VARYING_SLOT_PNTC) + pc |= 0x0f; + + varying2 = vert_reg_to_varying(c, reg, 1); + if (varying2 >= VARYING_SLOT_TEX0 && varying2 <= VARYING_SLOT_TEX7) { + if (c->key.point_sprite_coord_replace & (1 << (varying2 - + VARYING_SLOT_TEX0))) + pc |= 0xf0; + } + if (varying2 == BRW_VARYING_SLOT_PNTC) + pc |= 0xf0; + + return pc; +} + +static void +set_predicate_control_flag_value(struct brw_codegen *p, + struct brw_sf_compile *c, + unsigned value) +{ + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + + if (value != 0xff) { + if (value != c->flag_value) { + brw_MOV(p, brw_flag_reg(0, 0), brw_imm_uw(value)); + c->flag_value = value; + } + + brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL); + } +} + +static void brw_emit_tri_setup(struct brw_sf_compile *c, bool allocate) +{ + struct brw_codegen *p = &c->func; + GLuint i; + + c->flag_value = 0xff; + c->nr_verts = 3; + + if (allocate) + alloc_regs(c); + + invert_det(c); + copy_z_inv_w(c); + + if (c->key.do_twoside_color) + do_twoside_color(c); + + if (c->key.contains_flat_varying) + do_flatshade_triangle(c); + + + for (i = 0; i < c->nr_setup_regs; i++) + { + /* Pair of incoming attributes: + */ + struct brw_reg a0 = offset(c->vert[0], i); + struct brw_reg a1 = offset(c->vert[1], i); + struct brw_reg a2 = offset(c->vert[2], i); + GLushort pc, pc_persp, pc_linear; + bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + + if (pc_persp) + { + set_predicate_control_flag_value(p, c, pc_persp); + brw_MUL(p, a0, a0, c->inv_w[0]); + brw_MUL(p, a1, a1, c->inv_w[1]); + brw_MUL(p, a2, a2, c->inv_w[2]); + } + + + /* Calculate coefficients 
for interpolated values: + */ + if (pc_linear) + { + set_predicate_control_flag_value(p, c, pc_linear); + + brw_ADD(p, c->a1_sub_a0, a1, negate(a0)); + brw_ADD(p, c->a2_sub_a0, a2, negate(a0)); + + /* calculate dA/dx + */ + brw_MUL(p, brw_null_reg(), c->a1_sub_a0, c->dy2); + brw_MAC(p, c->tmp, c->a2_sub_a0, negate(c->dy0)); + brw_MUL(p, c->m1Cx, c->tmp, c->inv_det); + + /* calculate dA/dy + */ + brw_MUL(p, brw_null_reg(), c->a2_sub_a0, c->dx0); + brw_MAC(p, c->tmp, c->a1_sub_a0, negate(c->dx2)); + brw_MUL(p, c->m2Cy, c->tmp, c->inv_det); + } + + { + set_predicate_control_flag_value(p, c, pc); + /* start point for interpolation + */ + brw_MOV(p, c->m3C0, a0); + + /* Copy m0..m3 to URB. m0 is implicitly copied from r0 in + * the send instruction: + */ + brw_urb_WRITE(p, + brw_null_reg(), + 0, + brw_vec8_grf(0, 0), /* r0, will be copied to m0 */ + last ? BRW_URB_WRITE_EOT_COMPLETE + : BRW_URB_WRITE_NO_FLAGS, + 4, /* msg len */ + 0, /* response len */ + i*4, /* offset */ + BRW_URB_SWIZZLE_TRANSPOSE); /* XXX: Swizzle control "SF to windower" */ + } + } + + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); +} + + + +static void brw_emit_line_setup(struct brw_sf_compile *c, bool allocate) +{ + struct brw_codegen *p = &c->func; + GLuint i; + + c->flag_value = 0xff; + c->nr_verts = 2; + + if (allocate) + alloc_regs(c); + + invert_det(c); + copy_z_inv_w(c); + + if (c->key.contains_flat_varying) + do_flatshade_line(c); + + for (i = 0; i < c->nr_setup_regs; i++) + { + /* Pair of incoming attributes: + */ + struct brw_reg a0 = offset(c->vert[0], i); + struct brw_reg a1 = offset(c->vert[1], i); + GLushort pc, pc_persp, pc_linear; + bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + + if (pc_persp) + { + set_predicate_control_flag_value(p, c, pc_persp); + brw_MUL(p, a0, a0, c->inv_w[0]); + brw_MUL(p, a1, a1, c->inv_w[1]); + } + + /* Calculate coefficients for position, color: + */ + if (pc_linear) { + set_predicate_control_flag_value(p, c, pc_linear); + + 
brw_ADD(p, c->a1_sub_a0, a1, negate(a0)); + + brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0); + brw_MUL(p, c->m1Cx, c->tmp, c->inv_det); + + brw_MUL(p, c->tmp, c->a1_sub_a0, c->dy0); + brw_MUL(p, c->m2Cy, c->tmp, c->inv_det); + } + + { + set_predicate_control_flag_value(p, c, pc); + + /* start point for interpolation + */ + brw_MOV(p, c->m3C0, a0); + + /* Copy m0..m3 to URB. + */ + brw_urb_WRITE(p, + brw_null_reg(), + 0, + brw_vec8_grf(0, 0), + last ? BRW_URB_WRITE_EOT_COMPLETE + : BRW_URB_WRITE_NO_FLAGS, + 4, /* msg len */ + 0, /* response len */ + i*4, /* urb destination offset */ + BRW_URB_SWIZZLE_TRANSPOSE); + } + } + + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); +} + +static void brw_emit_point_sprite_setup(struct brw_sf_compile *c, bool allocate) +{ + struct brw_codegen *p = &c->func; + GLuint i; + + c->flag_value = 0xff; + c->nr_verts = 1; + + if (allocate) + alloc_regs(c); + + copy_z_inv_w(c); + for (i = 0; i < c->nr_setup_regs; i++) + { + struct brw_reg a0 = offset(c->vert[0], i); + GLushort pc, pc_persp, pc_linear, pc_coord_replace; + bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + + pc_coord_replace = calculate_point_sprite_mask(c, i); + pc_persp &= ~pc_coord_replace; + + if (pc_persp) { + set_predicate_control_flag_value(p, c, pc_persp); + brw_MUL(p, a0, a0, c->inv_w[0]); + } + + /* Point sprite coordinate replacement: A texcoord with this + * enabled gets replaced with the value (x, y, 0, 1) where x and + * y vary from 0 to 1 across the horizontal and vertical of the + * point. 
+ */ + if (pc_coord_replace) { + set_predicate_control_flag_value(p, c, pc_coord_replace); + /* Calculate 1.0/PointWidth */ + gfx4_math(&c->func, + c->tmp, + BRW_MATH_FUNCTION_INV, + 0, + c->dx0, + BRW_MATH_PRECISION_FULL); + + brw_set_default_access_mode(p, BRW_ALIGN_16); + + /* dA/dx, dA/dy */ + brw_MOV(p, c->m1Cx, brw_imm_f(0.0)); + brw_MOV(p, c->m2Cy, brw_imm_f(0.0)); + brw_MOV(p, brw_writemask(c->m1Cx, WRITEMASK_X), c->tmp); + if (c->key.sprite_origin_lower_left) { + brw_MOV(p, brw_writemask(c->m2Cy, WRITEMASK_Y), negate(c->tmp)); + } else { + brw_MOV(p, brw_writemask(c->m2Cy, WRITEMASK_Y), c->tmp); + } + + /* attribute constant offset */ + brw_MOV(p, c->m3C0, brw_imm_f(0.0)); + if (c->key.sprite_origin_lower_left) { + brw_MOV(p, brw_writemask(c->m3C0, WRITEMASK_YW), brw_imm_f(1.0)); + } else { + brw_MOV(p, brw_writemask(c->m3C0, WRITEMASK_W), brw_imm_f(1.0)); + } + + brw_set_default_access_mode(p, BRW_ALIGN_1); + } + + if (pc & ~pc_coord_replace) { + set_predicate_control_flag_value(p, c, pc & ~pc_coord_replace); + brw_MOV(p, c->m1Cx, brw_imm_ud(0)); + brw_MOV(p, c->m2Cy, brw_imm_ud(0)); + brw_MOV(p, c->m3C0, a0); /* constant value */ + } + + + set_predicate_control_flag_value(p, c, pc); + /* Copy m0..m3 to URB. */ + brw_urb_WRITE(p, + brw_null_reg(), + 0, + brw_vec8_grf(0, 0), + last ? BRW_URB_WRITE_EOT_COMPLETE + : BRW_URB_WRITE_NO_FLAGS, + 4, /* msg len */ + 0, /* response len */ + i*4, /* urb destination offset */ + BRW_URB_SWIZZLE_TRANSPOSE); + } + + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); +} + +/* Points setup - several simplifications as all attributes are + * constant across the face of the point (point sprites excluded!) 
+ */ +static void brw_emit_point_setup(struct brw_sf_compile *c, bool allocate) +{ + struct brw_codegen *p = &c->func; + GLuint i; + + c->flag_value = 0xff; + c->nr_verts = 1; + + if (allocate) + alloc_regs(c); + + copy_z_inv_w(c); + + brw_MOV(p, c->m1Cx, brw_imm_ud(0)); /* zero - move out of loop */ + brw_MOV(p, c->m2Cy, brw_imm_ud(0)); /* zero - move out of loop */ + + for (i = 0; i < c->nr_setup_regs; i++) + { + struct brw_reg a0 = offset(c->vert[0], i); + GLushort pc, pc_persp, pc_linear; + bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + + if (pc_persp) + { + /* This seems odd as the values are all constant, but the + * fragment shader will be expecting it: + */ + set_predicate_control_flag_value(p, c, pc_persp); + brw_MUL(p, a0, a0, c->inv_w[0]); + } + + + /* The delta values are always zero, just send the starting + * coordinate. Again, this is to fit in with the interpolation + * code in the fragment shader. + */ + { + set_predicate_control_flag_value(p, c, pc); + + brw_MOV(p, c->m3C0, a0); /* constant value */ + + /* Copy m0..m3 to URB. + */ + brw_urb_WRITE(p, + brw_null_reg(), + 0, + brw_vec8_grf(0, 0), + last ? 
BRW_URB_WRITE_EOT_COMPLETE
+                          : BRW_URB_WRITE_NO_FLAGS,
+                      4,  /* msg len */
+                      0,  /* response len */
+                      i*4,  /* urb destination offset */
+                      BRW_URB_SWIZZLE_TRANSPOSE);
+      }
+   }
+
+   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+}
+
+static void brw_emit_anyprim_setup( struct brw_sf_compile *c )
+{
+   struct brw_codegen *p = &c->func;
+   struct brw_reg payload_prim = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0);
+   struct brw_reg payload_attr = get_element_ud(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), 0);
+   struct brw_reg primmask;
+   int jmp;
+   struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
+
+   c->nr_verts = 3;
+   alloc_regs(c);
+
+   primmask = retype(get_element(c->tmp, 0), BRW_REGISTER_TYPE_UD);
+
+   brw_MOV(p, primmask, brw_imm_ud(1));
+   brw_SHL(p, primmask, primmask, payload_prim);
+
+   brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_TRILIST) |
+                                               (1<<_3DPRIM_TRISTRIP) |
+                                               (1<<_3DPRIM_TRIFAN) |
+                                               (1<<_3DPRIM_TRISTRIP_REVERSE) |
+                                               (1<<_3DPRIM_POLYGON) |
+                                               (1<<_3DPRIM_RECTLIST) |
+                                               (1<<_3DPRIM_TRIFAN_NOSTIPPLE)));
+   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
+   jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
+   brw_emit_tri_setup(c, false);
+   brw_land_fwd_jump(p, jmp);
+
+   brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_LINELIST) |
+                                               (1<<_3DPRIM_LINESTRIP) |
+                                               (1<<_3DPRIM_LINELOOP) |
+                                               (1<<_3DPRIM_LINESTRIP_CONT) |
+                                               (1<<_3DPRIM_LINESTRIP_BF) |
+                                               (1<<_3DPRIM_LINESTRIP_CONT_BF)));
+   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
+   jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
+   brw_emit_line_setup(c, false);
+   brw_land_fwd_jump(p, jmp);
+
+   brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
+   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
+   jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
+   brw_emit_point_sprite_setup(c, false);
+   brw_land_fwd_jump(p, jmp);
+
+   brw_emit_point_setup( c, false );
+}
+
+const 
unsigned * +brw_compile_sf(const struct brw_compiler *compiler, + void *mem_ctx, + const struct brw_sf_prog_key *key, + struct brw_sf_prog_data *prog_data, + struct intel_vue_map *vue_map, + unsigned *final_assembly_size) +{ + struct brw_sf_compile c; + memset(&c, 0, sizeof(c)); + + /* Begin the compilation: + */ + brw_init_codegen(&compiler->isa, &c.func, mem_ctx); + + c.key = *key; + c.vue_map = *vue_map; + if (c.key.do_point_coord) { + /* + * gl_PointCoord is a FS instead of VS builtin variable, thus it's + * not included in c.vue_map generated in VS stage. Here we add + * it manually to let SF shader generate the needed interpolation + * coefficient for FS shader. + */ + c.vue_map.varying_to_slot[BRW_VARYING_SLOT_PNTC] = c.vue_map.num_slots; + c.vue_map.slot_to_varying[c.vue_map.num_slots++] = BRW_VARYING_SLOT_PNTC; + } + c.urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET; + c.nr_attr_regs = (c.vue_map.num_slots + 1)/2 - c.urb_entry_read_offset; + c.nr_setup_regs = c.nr_attr_regs; + + c.prog_data.urb_read_length = c.nr_attr_regs; + c.prog_data.urb_entry_size = c.nr_setup_regs * 2; + + /* Which primitive? Or all three? + */ + switch (key->primitive) { + case BRW_SF_PRIM_TRIANGLES: + c.nr_verts = 3; + brw_emit_tri_setup( &c, true ); + break; + case BRW_SF_PRIM_LINES: + c.nr_verts = 2; + brw_emit_line_setup( &c, true ); + break; + case BRW_SF_PRIM_POINTS: + c.nr_verts = 1; + if (key->do_point_sprite) + brw_emit_point_sprite_setup( &c, true ); + else + brw_emit_point_setup( &c, true ); + break; + case BRW_SF_PRIM_UNFILLED_TRIS: + c.nr_verts = 3; + brw_emit_anyprim_setup( &c ); + break; + default: + unreachable("not reached"); + } + + /* FINISHME: SF programs use calculated jumps (i.e., JMPI with a register + * source). Compacting would be difficult. 
+ */ + /* brw_compact_instructions(&c.func, 0, 0, NULL); */ + + *prog_data = c.prog_data; + + const unsigned *program = brw_get_program(&c.func, final_assembly_size); + + if (INTEL_DEBUG(DEBUG_SF)) { + fprintf(stderr, "sf:\n"); + brw_disassemble_with_labels(&compiler->isa, + program, 0, *final_assembly_size, stderr); + fprintf(stderr, "\n"); + } + + return program; +} diff --git a/src/intel/compiler/elk/brw_compiler.c b/src/intel/compiler/elk/brw_compiler.c new file mode 100644 index 00000000000..c267a05a0a5 --- /dev/null +++ b/src/intel/compiler/elk/brw_compiler.c @@ -0,0 +1,370 @@ +/* + * Copyright © 2015-2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#include "brw_compiler.h"
+#include "brw_shader.h"
+#include "brw_eu.h"
+#include "brw_nir.h"
+#include "dev/intel_debug.h"
+#include "compiler/nir/nir.h"
+#include "util/u_debug.h"
+
+/* NIR compiler options shared by the scalar and vec4 backends. */
+#define COMMON_OPTIONS                                                        \
+   .has_uclz = true,                                                          \
+   .lower_fdiv = true,                                                        \
+   .lower_scmp = true,                                                        \
+   .lower_flrp16 = true,                                                      \
+   .lower_fmod = true,                                                        \
+   .lower_ufind_msb = true,                                                   \
+   .lower_uadd_carry = true,                                                  \
+   .lower_usub_borrow = true,                                                 \
+   .lower_flrp64 = true,                                                      \
+   .lower_fisnormal = true,                                                   \
+   .lower_isign = true,                                                       \
+   .lower_ldexp = true,                                                       \
+   .lower_bitfield_extract = true,                                            \
+   .lower_bitfield_insert = true,                                             \
+   .lower_device_index_to_zero = true,                                        \
+   .vectorize_io = true,                                                      \
+   .vectorize_tess_levels = true,                                             \
+   .use_interpolated_input_intrinsics = true,                                 \
+   .lower_insert_byte = true,                                                 \
+   .lower_insert_word = true,                                                 \
+   .vertex_id_zero_based = true,                                              \
+   .lower_base_vertex = true,                                                 \
+   .support_16bit_alu = true,                                                 \
+   .lower_uniforms_to_ubo = true
+
+/* Additional NIR options used only by the scalar (FS) backend. */
+#define COMMON_SCALAR_OPTIONS                                                 \
+   .lower_to_scalar = true,                                                   \
+   .lower_pack_half_2x16 = true,                                              \
+   .lower_pack_snorm_2x16 = true,                                             \
+   .lower_pack_snorm_4x8 = true,                                              \
+   .lower_pack_unorm_2x16 = true,                                             \
+   .lower_pack_unorm_4x8 = true,                                              \
+   .lower_unpack_half_2x16 = true,                                            \
+   .lower_unpack_snorm_2x16 = true,                                           \
+   .lower_unpack_snorm_4x8 = true,                                            \
+   .lower_unpack_unorm_2x16 = true,                                           \
+   .lower_unpack_unorm_4x8 = true,                                            \
+   .lower_hadd64 = true,                                                      \
+   .avoid_ternary_with_two_constants = true,                                  \
+   .has_pack_32_4x8 = true,                                                   \
+   .max_unroll_iterations = 32,                                               \
+   .force_indirect_unrolling = nir_var_function_temp,                         \
+   .divergence_analysis_options =                                             \
+      (nir_divergence_single_patch_per_tcs_subgroup |                         \
+       nir_divergence_single_patch_per_tes_subgroup |                         \
+       nir_divergence_shader_record_ptr_uniform)
+
+const struct nir_shader_compiler_options brw_scalar_nir_options = {
+   COMMON_OPTIONS,
+   COMMON_SCALAR_OPTIONS,
+};
+
+const struct nir_shader_compiler_options brw_vector_nir_options = {
+   COMMON_OPTIONS,
+
+   /* In the vec4 backend, our dpN instruction replicates its result to all the
+    * components of a vec4.  We would like NIR to give us replicated fdot
+    * instructions because it can optimize better for us.
+    */
+   .fdot_replicates = true,
+
+   .lower_usub_sat = true,
+   .lower_pack_snorm_2x16 = true,
+   .lower_pack_unorm_2x16 = true,
+   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_unorm_2x16 = true,
+   .lower_extract_byte = true,
+   .lower_extract_word = true,
+   .intel_vec4 = true,
+   .max_unroll_iterations = 32,
+};
+
+/* Allocate and initialize a brw_compiler for the given device: register
+ * allocation sets, per-stage scalar/vec4 selection, and per-stage NIR
+ * compiler options (ralloc'ed as children of the compiler).
+ */
+struct brw_compiler *
+brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
+{
+   struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);
+
+   compiler->devinfo = devinfo;
+
+   brw_init_isa_info(&compiler->isa, devinfo);
+
+   brw_fs_alloc_reg_sets(compiler);
+   if (devinfo->ver < 8)
+      brw_vec4_alloc_reg_set(compiler);
+
+   compiler->precise_trig = debug_get_bool_option("INTEL_PRECISE_TRIG", false);
+
+   compiler->use_tcs_multi_patch = devinfo->ver >= 12;
+
+   /* Default to the sampler since that's what we've done since forever */
+   compiler->indirect_ubos_use_sampler = true;
+
+   compiler->lower_dpas = devinfo->verx10 < 125 ||
+      intel_device_info_is_mtl(devinfo) ||
+      (intel_device_info_is_arl(devinfo) &&
+       devinfo->platform != INTEL_PLATFORM_ARL_H) ||
+      debug_get_bool_option("INTEL_LOWER_DPAS", false);
+
+   /* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. */
+   for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) {
+      compiler->scalar_stage[i] = devinfo->ver >= 8 ||
+         i == MESA_SHADER_FRAGMENT || i == MESA_SHADER_COMPUTE;
+   }
+
+   /* Task/mesh and ray-tracing stages are always scalar. */
+   for (int i = MESA_SHADER_TASK; i < MESA_VULKAN_SHADER_STAGES; i++)
+      compiler->scalar_stage[i] = true;
+
+   nir_lower_int64_options int64_options =
+      nir_lower_imul64 |
+      nir_lower_isign64 |
+      nir_lower_divmod64 |
+      nir_lower_imul_high64 |
+      nir_lower_find_lsb64 |
+      nir_lower_ufind_msb64 |
+      nir_lower_bit_count64;
+   nir_lower_doubles_options fp64_options =
+      nir_lower_drcp |
+      nir_lower_dsqrt |
+      nir_lower_drsq |
+      nir_lower_dtrunc |
+      nir_lower_dfloor |
+      nir_lower_dceil |
+      nir_lower_dfract |
+      nir_lower_dround_even |
+      nir_lower_dmod |
+      nir_lower_dsub |
+      nir_lower_ddiv;
+
+   if (!devinfo->has_64bit_float || INTEL_DEBUG(DEBUG_SOFT64))
+      fp64_options |= nir_lower_fp64_full_software;
+   if (!devinfo->has_64bit_int)
+      int64_options |= (nir_lower_int64_options)~0;
+
+   /* The Bspec's section titled "Instruction_multiply[DevBDW+]" claims that
+    * destination type can be Quadword and source type Doubleword for Gfx8 and
+    * Gfx9. So, lower 64 bit multiply instruction on rest of the platforms.
+    */
+   if (devinfo->ver < 8 || devinfo->ver > 9)
+      int64_options |= nir_lower_imul_2x32_64;
+
+   /* We want the GLSL compiler to emit code that uses condition codes */
+   for (int i = 0; i < MESA_ALL_SHADER_STAGES; i++) {
+      struct nir_shader_compiler_options *nir_options =
+         rzalloc(compiler, struct nir_shader_compiler_options);
+      bool is_scalar = compiler->scalar_stage[i];
+      if (is_scalar) {
+         *nir_options = brw_scalar_nir_options;
+         int64_options |= nir_lower_usub_sat64;
+      } else {
+         *nir_options = brw_vector_nir_options;
+      }
+
+      /* Prior to Gfx6, there are no three source operations, and Gfx11 loses
+       * LRP.
+       */
+      nir_options->lower_ffma16 = devinfo->ver < 6;
+      nir_options->lower_ffma32 = devinfo->ver < 6;
+      nir_options->lower_ffma64 = devinfo->ver < 6;
+      nir_options->lower_flrp32 = devinfo->ver < 6 || devinfo->ver >= 11;
+      nir_options->lower_fpow = devinfo->ver >= 12;
+
+      nir_options->has_bfe = devinfo->ver >= 7;
+      nir_options->has_bfm = devinfo->ver >= 7;
+      nir_options->has_bfi = devinfo->ver >= 7;
+
+      nir_options->has_rotate16 = devinfo->ver >= 11;
+      nir_options->has_rotate32 = devinfo->ver >= 11;
+      nir_options->lower_bitfield_reverse = devinfo->ver < 7;
+      nir_options->lower_find_lsb = devinfo->ver < 7;
+      nir_options->lower_ifind_msb = devinfo->ver < 7;
+      nir_options->has_iadd3 = devinfo->verx10 >= 125;
+
+      nir_options->has_sdot_4x8 = devinfo->ver >= 12;
+      nir_options->has_udot_4x8 = devinfo->ver >= 12;
+      nir_options->has_sudot_4x8 = devinfo->ver >= 12;
+      nir_options->has_sdot_4x8_sat = devinfo->ver >= 12;
+      nir_options->has_udot_4x8_sat = devinfo->ver >= 12;
+      nir_options->has_sudot_4x8_sat = devinfo->ver >= 12;
+
+      nir_options->lower_int64_options = int64_options;
+      nir_options->lower_doubles_options = fp64_options;
+
+      nir_options->unify_interfaces = i < MESA_SHADER_FRAGMENT;
+
+      nir_options->force_indirect_unrolling |=
+         brw_nir_no_indirect_mask(compiler, i);
+      nir_options->force_indirect_unrolling_sampler = devinfo->ver < 7;
+
+      if (compiler->use_tcs_multi_patch) {
+         /* TCS MULTI_PATCH mode has multiple patches per subgroup */
+         nir_options->divergence_analysis_options &=
+            ~nir_divergence_single_patch_per_tcs_subgroup;
+      }
+
+      if (devinfo->ver < 12)
+         nir_options->divergence_analysis_options |=
+            nir_divergence_single_prim_per_subgroup;
+
+      compiler->nir_options[i] = nir_options;
+   }
+
+   compiler->mesh.mue_header_packing =
+      (unsigned)debug_get_num_option("INTEL_MESH_HEADER_PACKING", 3);
+   compiler->mesh.mue_compaction =
+      debug_get_bool_option("INTEL_MESH_COMPACTION", true);
+
+   return compiler;
+}
+
+/* Shift a single bit into the low end of *val. */
+static void
+insert_u64_bit(uint64_t *val, bool add)
+{
+   *val = (*val << 1) | !!add;
+}
+
+/* Pack the compiler options that affect generated code into a single
+ * uint64_t (presumably used to key the on-disk shader cache, given the
+ * DEBUG_DISK_CACHE_MASK / SIMD_DISK_CACHE_MASK inputs — confirm at callers).
+ */
+uint64_t
+brw_get_compiler_config_value(const struct brw_compiler *compiler)
+{
+   uint64_t config = 0;
+   unsigned bits = 0;
+
+   insert_u64_bit(&config, compiler->precise_trig);
+   bits++;
+   insert_u64_bit(&config, compiler->lower_dpas);
+   bits++;
+   insert_u64_bit(&config, compiler->mesh.mue_compaction);
+   bits++;
+
+   uint64_t mask = DEBUG_DISK_CACHE_MASK;
+   bits += util_bitcount64(mask);
+
+   u_foreach_bit64(bit, mask)
+      insert_u64_bit(&config, INTEL_DEBUG(1ULL << bit));
+
+   mask = SIMD_DISK_CACHE_MASK;
+   bits += util_bitcount64(mask);
+
+   u_foreach_bit64(bit, mask)
+      insert_u64_bit(&config, (intel_simd & (1ULL << bit)) != 0);
+
+   /* Two bits for mue_header_packing (values 0-3). */
+   mask = 3;
+   bits += util_bitcount64(mask);
+
+   u_foreach_bit64(bit, mask)
+      insert_u64_bit(&config, (compiler->mesh.mue_header_packing & (1ULL << bit)) != 0);
+
+   /* All inserted bits must fit in the 64-bit result. */
+   assert(bits <= util_bitcount64(UINT64_MAX));
+
+   return config;
+}
+
+/* Write the device's SHA-1 (as accumulated by brw_device_sha1_update) into
+ * hex as a formatted hex string.
+ */
+void
+brw_device_sha1(char *hex,
+                const struct intel_device_info *devinfo) {
+   struct mesa_sha1 ctx;
+   _mesa_sha1_init(&ctx);
+   brw_device_sha1_update(&ctx, devinfo);
+   unsigned char result[20];
+   _mesa_sha1_final(&ctx, result);
+   _mesa_sha1_format(hex, result);
+}
+
+/* Size in bytes of the stage-specific prog_data structure for a stage. */
+unsigned
+brw_prog_data_size(gl_shader_stage stage)
+{
+   static const size_t stage_sizes[] = {
+      [MESA_SHADER_VERTEX]       = sizeof(struct brw_vs_prog_data),
+      [MESA_SHADER_TESS_CTRL]    = sizeof(struct brw_tcs_prog_data),
+      [MESA_SHADER_TESS_EVAL]    = sizeof(struct brw_tes_prog_data),
+      [MESA_SHADER_GEOMETRY]     = sizeof(struct brw_gs_prog_data),
+      [MESA_SHADER_FRAGMENT]     = sizeof(struct brw_wm_prog_data),
+      [MESA_SHADER_COMPUTE]      = sizeof(struct brw_cs_prog_data),
+      [MESA_SHADER_TASK]         = sizeof(struct brw_task_prog_data),
+      [MESA_SHADER_MESH]         = sizeof(struct brw_mesh_prog_data),
+      [MESA_SHADER_RAYGEN]       = sizeof(struct brw_bs_prog_data),
+      [MESA_SHADER_ANY_HIT]      = sizeof(struct brw_bs_prog_data),
+      [MESA_SHADER_CLOSEST_HIT]  = sizeof(struct brw_bs_prog_data),
+      [MESA_SHADER_MISS]         = sizeof(struct brw_bs_prog_data),
+      [MESA_SHADER_INTERSECTION] = sizeof(struct brw_bs_prog_data),
+      [MESA_SHADER_CALLABLE]     = sizeof(struct brw_bs_prog_data),
+      [MESA_SHADER_KERNEL]       = sizeof(struct brw_cs_prog_data),
+   };
+   assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_sizes));
+   return stage_sizes[stage];
+}
+
+/* Size in bytes of the stage-specific prog_key structure for a stage. */
+unsigned
+brw_prog_key_size(gl_shader_stage stage)
+{
+   static const size_t stage_sizes[] = {
+      [MESA_SHADER_VERTEX]       = sizeof(struct brw_vs_prog_key),
+      [MESA_SHADER_TESS_CTRL]    = sizeof(struct brw_tcs_prog_key),
+      [MESA_SHADER_TESS_EVAL]    = sizeof(struct brw_tes_prog_key),
+      [MESA_SHADER_GEOMETRY]     = sizeof(struct brw_gs_prog_key),
+      [MESA_SHADER_FRAGMENT]     = sizeof(struct brw_wm_prog_key),
+      [MESA_SHADER_COMPUTE]      = sizeof(struct brw_cs_prog_key),
+      [MESA_SHADER_TASK]         = sizeof(struct brw_task_prog_key),
+      [MESA_SHADER_MESH]         = sizeof(struct brw_mesh_prog_key),
+      [MESA_SHADER_RAYGEN]       = sizeof(struct brw_bs_prog_key),
+      [MESA_SHADER_ANY_HIT]      = sizeof(struct brw_bs_prog_key),
+      [MESA_SHADER_CLOSEST_HIT]  = sizeof(struct brw_bs_prog_key),
+      [MESA_SHADER_MISS]         = sizeof(struct brw_bs_prog_key),
+      [MESA_SHADER_INTERSECTION] = sizeof(struct brw_bs_prog_key),
+      [MESA_SHADER_CALLABLE]     = sizeof(struct brw_bs_prog_key),
+      [MESA_SHADER_KERNEL]       = sizeof(struct brw_cs_prog_key),
+   };
+   assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_sizes));
+   return stage_sizes[stage];
+}
+
+/* Patch relocation sites recorded in prog_data into the assembled program,
+ * matching each site's id against the caller-supplied values.
+ */
+void
+brw_write_shader_relocs(const struct brw_isa_info *isa,
+                        void *program,
+                        const struct brw_stage_prog_data *prog_data,
+                        struct brw_shader_reloc_value *values,
+                        unsigned num_values)
+{
+   for (unsigned i = 0; i < prog_data->num_relocs; i++) {
+      assert(prog_data->relocs[i].offset % 8 == 0);
+      void *dst = program + prog_data->relocs[i].offset;
+      for (unsigned j = 0; j < num_values; j++) {
+         if (prog_data->relocs[i].id == values[j].id) {
+            uint32_t value = values[j].value + prog_data->relocs[i].delta;
+            switch (prog_data->relocs[i].type) {
+            case BRW_SHADER_RELOC_TYPE_U32:
+               *(uint32_t *)dst = value;
+               break;
+            case BRW_SHADER_RELOC_TYPE_MOV_IMM:
+               brw_update_reloc_imm(isa, dst, value);
+               break;
+            default:
+               unreachable("Invalid relocation type");
+            }
+            break;
+         }
+      }
+   }
+}
diff --git a/src/intel/compiler/elk/brw_compiler.h b/src/intel/compiler/elk/brw_compiler.h
new file mode 100644
index 00000000000..30a05d8e287
--- /dev/null
+++ b/src/intel/compiler/elk/brw_compiler.h
@@ -0,0 +1,2131 @@
+/*
+ * Copyright © 2010 - 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */ + +#ifndef BRW_COMPILER_H +#define BRW_COMPILER_H + +#include +#include "c11/threads.h" +#include "dev/intel_device_info.h" +#include "isl/isl.h" +#include "util/macros.h" +#include "util/mesa-sha1.h" +#include "util/enum_operators.h" +#include "util/ralloc.h" +#include "util/u_math.h" +#include "brw_isa_info.h" +#include "intel_shader_enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ra_regs; +struct nir_shader; +struct shader_info; + +struct nir_shader_compiler_options; +typedef struct nir_shader nir_shader; + +struct brw_compiler { + const struct intel_device_info *devinfo; + + /* This lock must be taken if the compiler is to be modified in any way, + * including adding something to the ralloc child list. + */ + mtx_t mutex; + + struct brw_isa_info isa; + + struct { + struct ra_regs *regs; + + /** + * Array of the ra classes for the unaligned contiguous register + * block sizes used. + */ + struct ra_class **classes; + } vec4_reg_set; + + struct { + struct ra_regs *regs; + + /** + * Array of the ra classes for the unaligned contiguous register + * block sizes used, indexed by register size. + */ + struct ra_class *classes[16]; + + /** + * ra class for the aligned barycentrics we use for PLN, which doesn't + * appear in *classes. + */ + struct ra_class *aligned_bary_class; + } fs_reg_sets[3]; + + void (*shader_debug_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4); + void (*shader_perf_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4); + + bool scalar_stage[MESA_ALL_SHADER_STAGES]; + bool use_tcs_multi_patch; + struct nir_shader_compiler_options *nir_options[MESA_ALL_SHADER_STAGES]; + + /** + * Apply workarounds for SIN and COS output range problems. + * This can negatively impact performance. + */ + bool precise_trig; + + /** + * Is 3DSTATE_CONSTANT_*'s Constant Buffer 0 relative to Dynamic State + * Base Address? (If not, it's a normal GPU address.) 
+ */ + bool constant_buffer_0_is_relative; + + /** + * Whether or not the driver supports NIR shader constants. This controls + * whether nir_opt_large_constants will be run. + */ + bool supports_shader_constants; + + /** + * Whether indirect UBO loads should use the sampler or go through the + * data/constant cache. For the sampler, UBO surface states have to be set + * up with VK_FORMAT_R32G32B32A32_FLOAT whereas if it's going through the + * constant or data cache, UBOs must use VK_FORMAT_RAW. + */ + bool indirect_ubos_use_sampler; + + /** + * Gfx12.5+ has a bit in the SEND instruction extending the bindless + * surface offset range from 20 to 26 bits, effectively giving us 4Gb of + * bindless surface descriptors instead of 64Mb previously. + */ + bool extended_bindless_surface_offset; + + /** + * Gfx11+ has a bit in the dword 3 of the sampler message header that + * indicates whether the sampler handle is relative to the dynamic state + * base address (0) or the bindless sampler base address (1). The driver + * can select this. + */ + bool use_bindless_sampler_offset; + + /** + * Should DPAS instructions be lowered? + * + * This will be set for all platforms before Gfx12.5. It may also be set + * platforms that support DPAS for testing purposes. + */ + bool lower_dpas; + + /** + * Calling the ra_allocate function after each register spill can take + * several minutes. This option speeds up shader compilation by spilling + * more registers after the ra_allocate failure. Required for + * Cyberpunk 2077, which uses a watchdog thread to terminate the process + * in case the render thread hasn't responded within 2 minutes. + */ + int spilling_rate; + + struct nir_shader *clc_shader; + + struct { + unsigned mue_header_packing; + bool mue_compaction; + } mesh; +}; + +#define brw_shader_debug_log(compiler, data, fmt, ... 
) do { \ + static unsigned id = 0; \ + compiler->shader_debug_log(data, &id, fmt, ##__VA_ARGS__); \ +} while (0) + +#define brw_shader_perf_log(compiler, data, fmt, ... ) do { \ + static unsigned id = 0; \ + compiler->shader_perf_log(data, &id, fmt, ##__VA_ARGS__); \ +} while (0) + +/** + * We use a constant subgroup size of 32. It really only needs to be a + * maximum and, since we do SIMD32 for compute shaders in some cases, it + * needs to be at least 32. SIMD8 and SIMD16 shaders will still claim a + * subgroup size of 32 but will act as if 16 or 24 of those channels are + * disabled. + */ +#define BRW_SUBGROUP_SIZE 32 + +static inline bool +brw_shader_stage_is_bindless(gl_shader_stage stage) +{ + return stage >= MESA_SHADER_RAYGEN && + stage <= MESA_SHADER_CALLABLE; +} + +static inline bool +brw_shader_stage_requires_bindless_resources(gl_shader_stage stage) +{ + return brw_shader_stage_is_bindless(stage) || gl_shader_stage_is_mesh(stage); +} + +/** + * Program key structures. + * + * When drawing, we look for the currently bound shaders in the program + * cache. This is essentially a hash table lookup, and these are the keys. + * + * Sometimes OpenGL features specified as state need to be simulated via + * shader code, due to a mismatch between the API and the hardware. This + * is often referred to as "non-orthagonal state" or "NOS". We store NOS + * in the program key so it's considered when searching for a program. If + * we haven't seen a particular combination before, we have to recompile a + * new specialized version. + * + * Shader compilation should not look up state in gl_context directly, but + * instead use the copy in the program key. This guarantees recompiles will + * happen correctly. 
+ * + * @{ + */ + +enum PACKED gfx6_gather_sampler_wa { + WA_SIGN = 1, /* whether we need to sign extend */ + WA_8BIT = 2, /* if we have an 8bit format needing wa */ + WA_16BIT = 4, /* if we have a 16bit format needing wa */ +}; + +#define BRW_MAX_SAMPLERS 32 + +/* Provide explicit padding for each member, to ensure that the compiler + * initializes every bit in the shader cache keys. The keys will be compared + * with memcmp. + */ +PRAGMA_DIAGNOSTIC_PUSH +PRAGMA_DIAGNOSTIC_ERROR(-Wpadded) + +/** + * Sampler information needed by VS, WM, and GS program cache keys. + */ +struct brw_sampler_prog_key_data { + /** + * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles. + * + * This field is not consumed by the back-end compiler and is only relevant + * for the crocus OpenGL driver for Broadwell and earlier hardware. + */ + uint16_t swizzles[BRW_MAX_SAMPLERS]; + + uint32_t gl_clamp_mask[3]; + + /** + * For RG32F, gather4's channel select is broken. + */ + uint32_t gather_channel_quirk_mask; + + /** + * For Sandybridge, which shader w/a we need for gather quirks. + */ + enum gfx6_gather_sampler_wa gfx6_gather_wa[BRW_MAX_SAMPLERS]; +}; + +enum brw_robustness_flags { + BRW_ROBUSTNESS_UBO = BITFIELD_BIT(0), + BRW_ROBUSTNESS_SSBO = BITFIELD_BIT(1), +}; + +struct brw_base_prog_key { + unsigned program_string_id; + + enum brw_robustness_flags robust_flags:2; + + unsigned padding:22; + + /** + * Apply workarounds for SIN and COS input range problems. + * This limits input range for SIN and COS to [-2p : 2p] to + * avoid precision issues. + */ + bool limit_trig_input_range; + + struct brw_sampler_prog_key_data tex; +}; + +/** + * The VF can't natively handle certain types of attributes, such as GL_FIXED + * or most 10_10_10_2 types. These flags enable various VS workarounds to + * "fix" attributes at the beginning of shaders. 
+ */ +#define BRW_ATTRIB_WA_COMPONENT_MASK 7 /* mask for GL_FIXED scale channel count */ +#define BRW_ATTRIB_WA_NORMALIZE 8 /* normalize in shader */ +#define BRW_ATTRIB_WA_BGRA 16 /* swap r/b channels in shader */ +#define BRW_ATTRIB_WA_SIGN 32 /* interpret as signed in shader */ +#define BRW_ATTRIB_WA_SCALE 64 /* interpret as scaled in shader */ + +/** + * OpenGL attribute slots fall in [0, VERT_ATTRIB_MAX - 1] with the range + * [VERT_ATTRIB_GENERIC0, VERT_ATTRIB_MAX - 1] reserved for up to 16 user + * input vertex attributes. In Vulkan, we expose up to 28 user vertex input + * attributes that are mapped to slots also starting at VERT_ATTRIB_GENERIC0. + */ +#define MAX_GL_VERT_ATTRIB VERT_ATTRIB_MAX +#define MAX_VK_VERT_ATTRIB (VERT_ATTRIB_GENERIC0 + 28) + +/** + * Max number of binding table entries used for stream output. + * + * From the OpenGL 3.0 spec, table 6.44 (Transform Feedback State), the + * minimum value of MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS is 64. + * + * On Gfx6, the size of transform feedback data is limited not by the number + * of components but by the number of binding table entries we set aside. We + * use one binding table entry for a float, one entry for a vector, and one + * entry per matrix column. Since the only way we can communicate our + * transform feedback capabilities to the client is via + * MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS, we need to plan for the + * worst case, in which all the varyings are floats, so we use up one binding + * table entry per component. Therefore we need to set aside at least 64 + * binding table entries for use by transform feedback. + * + * Note: since we don't currently pack varyings, it is currently impossible + * for the client to actually use up all of these binding table entries--if + * all of their varyings were floats, they would run out of varying slots and + * fail to link. 
But that's a bug, so it seems prudent to go ahead and + * allocate the number of binding table entries we will need once the bug is + * fixed. + */ +#define BRW_MAX_SOL_BINDINGS 64 + +/** The program key for Vertex Shaders. */ +struct brw_vs_prog_key { + struct brw_base_prog_key base; + + /** + * Per-attribute workaround flags + * + * For each attribute, a combination of BRW_ATTRIB_WA_*. + * + * For OpenGL, where we expose a maximum of 16 user input attributes + * we only need up to VERT_ATTRIB_MAX slots, however, in Vulkan + * slots preceding VERT_ATTRIB_GENERIC0 are unused and we can + * expose up to 28 user input vertex attributes that are mapped to slots + * starting at VERT_ATTRIB_GENERIC0, so this array needs to be large + * enough to hold this many slots. + */ + uint8_t gl_attrib_wa_flags[MAX2(MAX_GL_VERT_ATTRIB, MAX_VK_VERT_ATTRIB)]; + + /** + * For pre-Gfx6 hardware, a bitfield indicating which texture coordinates + * are going to be replaced with point coordinates (as a consequence of a + * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)). Because + * our SF thread requires exact matching between VS outputs and FS inputs, + * these texture coordinates will need to be unconditionally included in + * the VUE, even if they aren't written by the vertex shader. + */ + uint8_t point_coord_replace; + unsigned clamp_pointsize:1; + + bool copy_edgeflag:1; + + bool clamp_vertex_color:1; + + /** + * How many user clipping planes are being uploaded to the vertex shader as + * push constants. + * + * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to + * clip distances. + */ + unsigned nr_userclip_plane_consts:4; + + uint32_t padding: 25; +}; + +/** The program key for Tessellation Control Shaders. */ +struct brw_tcs_prog_key +{ + struct brw_base_prog_key base; + + /** A bitfield of per-vertex outputs written. 
*/ + uint64_t outputs_written; + + enum tess_primitive_mode _tes_primitive_mode; + + /** Number of input vertices, 0 means dynamic */ + unsigned input_vertices; + + /** A bitfield of per-patch outputs written. */ + uint32_t patch_outputs_written; + + bool quads_workaround; + uint32_t padding:24; +}; + +#define BRW_MAX_TCS_INPUT_VERTICES (32) + +static inline uint32_t +brw_tcs_prog_key_input_vertices(const struct brw_tcs_prog_key *key) +{ + return key->input_vertices != 0 ? + key->input_vertices : BRW_MAX_TCS_INPUT_VERTICES; +} + +/** The program key for Tessellation Evaluation Shaders. */ +struct brw_tes_prog_key +{ + struct brw_base_prog_key base; + + /** A bitfield of per-vertex inputs read. */ + uint64_t inputs_read; + + /** A bitfield of per-patch inputs read. */ + uint32_t patch_inputs_read; + + /** + * How many user clipping planes are being uploaded to the tessellation + * evaluation shader as push constants. + * + * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to + * clip distances. + */ + unsigned nr_userclip_plane_consts:4; + unsigned clamp_pointsize:1; + uint32_t padding:27; +}; + +/** The program key for Geometry Shaders. */ +struct brw_gs_prog_key +{ + struct brw_base_prog_key base; + + /** + * How many user clipping planes are being uploaded to the geometry shader + * as push constants. + * + * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to + * clip distances. 
+ */ + unsigned nr_userclip_plane_consts:4; + unsigned clamp_pointsize:1; + unsigned padding:27; +}; + +struct brw_task_prog_key +{ + struct brw_base_prog_key base; +}; + +struct brw_mesh_prog_key +{ + struct brw_base_prog_key base; + + bool compact_mue:1; + unsigned padding:31; +}; + +enum brw_sf_primitive { + BRW_SF_PRIM_POINTS = 0, + BRW_SF_PRIM_LINES = 1, + BRW_SF_PRIM_TRIANGLES = 2, + BRW_SF_PRIM_UNFILLED_TRIS = 3, +}; + +struct brw_sf_prog_key { + uint64_t attrs; + bool contains_flat_varying; + unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */ + uint8_t point_sprite_coord_replace; + enum brw_sf_primitive primitive:2; + bool do_twoside_color:1; + bool frontface_ccw:1; + bool do_point_sprite:1; + bool do_point_coord:1; + bool sprite_origin_lower_left:1; + bool userclip_active:1; + unsigned padding: 32; +}; + +enum brw_clip_mode { + BRW_CLIP_MODE_NORMAL = 0, + BRW_CLIP_MODE_CLIP_ALL = 1, + BRW_CLIP_MODE_CLIP_NON_REJECTED = 2, + BRW_CLIP_MODE_REJECT_ALL = 3, + BRW_CLIP_MODE_ACCEPT_ALL = 4, + BRW_CLIP_MODE_KERNEL_CLIP = 5, +}; + +enum brw_clip_fill_mode { + BRW_CLIP_FILL_MODE_LINE = 0, + BRW_CLIP_FILL_MODE_POINT = 1, + BRW_CLIP_FILL_MODE_FILL = 2, + BRW_CLIP_FILL_MODE_CULL = 3, +}; + +/* Note that if unfilled primitives are being emitted, we have to fix + * up polygon offset and flatshading at this point: + */ +struct brw_clip_prog_key { + uint64_t attrs; + float offset_factor; + float offset_units; + float offset_clamp; + bool contains_flat_varying; + bool contains_noperspective_varying; + unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */ + unsigned primitive:4; + unsigned nr_userclip:4; + bool pv_first:1; + bool do_unfilled:1; + enum brw_clip_fill_mode fill_cw:2; /* includes cull information */ + enum brw_clip_fill_mode fill_ccw:2; /* includes cull information */ + bool offset_cw:1; + bool offset_ccw:1; + bool copy_bfc_cw:1; + bool copy_bfc_ccw:1; + enum brw_clip_mode clip_mode:3; + uint64_t padding:51; +}; + +/* A big lookup table is used to 
figure out which and how many + * additional regs will inserted before the main payload in the WM + * program execution. These mainly relate to depth and stencil + * processing and the early-depth-test optimization. + */ +enum brw_wm_iz_bits { + BRW_WM_IZ_PS_KILL_ALPHATEST_BIT = 0x1, + BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT = 0x2, + BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT = 0x4, + BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT = 0x8, + BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT = 0x10, + BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT = 0x20, + BRW_WM_IZ_BIT_MAX = 0x40 +}; + +enum brw_sometimes { + BRW_NEVER = 0, + BRW_SOMETIMES, + BRW_ALWAYS +}; + +static inline enum brw_sometimes +brw_sometimes_invert(enum brw_sometimes x) +{ + return (enum brw_sometimes)((int)BRW_ALWAYS - (int)x); +} + +/** The program key for Fragment/Pixel Shaders. */ +struct brw_wm_prog_key { + struct brw_base_prog_key base; + + uint64_t input_slots_valid; + float alpha_test_ref; + uint8_t color_outputs_valid; + + /* Some collection of BRW_WM_IZ_* */ + uint8_t iz_lookup; + bool stats_wm:1; + bool flat_shade:1; + unsigned nr_color_regions:5; + bool emit_alpha_test:1; + enum compare_func alpha_test_func:3; /* < For Gfx4/5 MRT alpha test */ + bool alpha_test_replicate_alpha:1; + enum brw_sometimes alpha_to_coverage:2; + bool clamp_fragment_color:1; + + bool force_dual_color_blend:1; + + /** Whether or inputs are interpolated at sample rate by default + * + * This corresponds to the sample shading API bit in Vulkan or OpenGL which + * controls how inputs with no interpolation qualifier are interpolated. + * This is distinct from the way that using gl_SampleID or similar requires + * us to run per-sample. Even when running per-sample due to gl_SampleID, + * we may still interpolate unqualified inputs at the pixel center. 
+ */ + enum brw_sometimes persample_interp:2; + + /* Whether or not we are running on a multisampled framebuffer */ + enum brw_sometimes multisample_fbo:2; + + enum brw_sometimes line_aa:2; + + /* Whether the preceding shader stage is mesh */ + enum brw_sometimes mesh_input:2; + + bool coherent_fb_fetch:1; + bool ignore_sample_mask_out:1; + bool coarse_pixel:1; + + uint64_t padding:53; +}; + +struct brw_cs_prog_key { + struct brw_base_prog_key base; +}; + +struct brw_bs_prog_key { + struct brw_base_prog_key base; + + /* Represents enum brw_rt_ray_flags values given at pipeline creation + * to be combined with ray_flags handed to the traceRayEXT() calls by the + * shader. + */ + uint32_t pipeline_ray_flags; +}; + +struct brw_ff_gs_prog_key { + uint64_t attrs; + + /** + * Map from the index of a transform feedback binding table entry to the + * gl_varying_slot that should be streamed out through that binding table + * entry. + */ + unsigned char transform_feedback_bindings[BRW_MAX_SOL_BINDINGS]; + + /** + * Map from the index of a transform feedback binding table entry to the + * swizzles that should be used when streaming out data through that + * binding table entry. + */ + unsigned char transform_feedback_swizzles[BRW_MAX_SOL_BINDINGS]; + + /** + * Hardware primitive type being drawn, e.g. _3DPRIM_TRILIST. + */ + unsigned primitive:8; + + unsigned pv_first:1; + unsigned need_gs_prog:1; + + /** + * Number of varyings that are output to transform feedback. 
+ */ + unsigned num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */ + uint64_t padding:47; +}; + +/* brw_any_prog_key is any of the keys that map to an API stage */ +union brw_any_prog_key { + struct brw_base_prog_key base; + struct brw_vs_prog_key vs; + struct brw_tcs_prog_key tcs; + struct brw_tes_prog_key tes; + struct brw_gs_prog_key gs; + struct brw_wm_prog_key wm; + struct brw_cs_prog_key cs; + struct brw_bs_prog_key bs; + struct brw_task_prog_key task; + struct brw_mesh_prog_key mesh; +}; + +PRAGMA_DIAGNOSTIC_POP + +/** Max number of render targets in a shader */ +#define BRW_MAX_DRAW_BUFFERS 8 + +/** + * Binding table index for the first gfx6 SOL binding. + */ +#define BRW_GFX6_SOL_BINDING_START 0 + +struct brw_ubo_range +{ + uint16_t block; + + /* In units of 32-byte registers */ + uint8_t start; + uint8_t length; +}; + +/* We reserve the first 2^16 values for builtins */ +#define BRW_PARAM_IS_BUILTIN(param) (((param) & 0xffff0000) == 0) + +enum brw_param_builtin { + BRW_PARAM_BUILTIN_ZERO, + + BRW_PARAM_BUILTIN_CLIP_PLANE_0_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_0_Y, + BRW_PARAM_BUILTIN_CLIP_PLANE_0_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_0_W, + BRW_PARAM_BUILTIN_CLIP_PLANE_1_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_1_Y, + BRW_PARAM_BUILTIN_CLIP_PLANE_1_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_1_W, + BRW_PARAM_BUILTIN_CLIP_PLANE_2_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_2_Y, + BRW_PARAM_BUILTIN_CLIP_PLANE_2_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_2_W, + BRW_PARAM_BUILTIN_CLIP_PLANE_3_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_3_Y, + BRW_PARAM_BUILTIN_CLIP_PLANE_3_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_3_W, + BRW_PARAM_BUILTIN_CLIP_PLANE_4_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_4_Y, + BRW_PARAM_BUILTIN_CLIP_PLANE_4_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_4_W, + BRW_PARAM_BUILTIN_CLIP_PLANE_5_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_5_Y, + BRW_PARAM_BUILTIN_CLIP_PLANE_5_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_5_W, + BRW_PARAM_BUILTIN_CLIP_PLANE_6_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_6_Y, + 
BRW_PARAM_BUILTIN_CLIP_PLANE_6_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_6_W, + BRW_PARAM_BUILTIN_CLIP_PLANE_7_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_7_Y, + BRW_PARAM_BUILTIN_CLIP_PLANE_7_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_7_W, + + BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X, + BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y, + BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Z, + BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W, + BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X, + BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y, + + BRW_PARAM_BUILTIN_PATCH_VERTICES_IN, + + BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X, + BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y, + BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z, + BRW_PARAM_BUILTIN_SUBGROUP_ID, + BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X, + BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Y, + BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z, + BRW_PARAM_BUILTIN_WORK_DIM, +}; + +#define BRW_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \ + (BRW_PARAM_BUILTIN_CLIP_PLANE_0_X + ((idx) << 2) + (comp)) + +#define BRW_PARAM_BUILTIN_IS_CLIP_PLANE(param) \ + ((param) >= BRW_PARAM_BUILTIN_CLIP_PLANE_0_X && \ + (param) <= BRW_PARAM_BUILTIN_CLIP_PLANE_7_W) + +#define BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(param) \ + (((param) - BRW_PARAM_BUILTIN_CLIP_PLANE_0_X) >> 2) + +#define BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(param) \ + (((param) - BRW_PARAM_BUILTIN_CLIP_PLANE_0_X) & 0x3) + +enum brw_shader_reloc_id { + BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW, + BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH, + BRW_SHADER_RELOC_SHADER_START_OFFSET, + BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW, + BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH, + BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH, +}; + +enum brw_shader_reloc_type { + /** An arbitrary 32-bit value */ + BRW_SHADER_RELOC_TYPE_U32, + /** A MOV instruction with an immediate source */ + BRW_SHADER_RELOC_TYPE_MOV_IMM, +}; + +/** Represents a code relocation + * + * Relocatable constants are immediates in the code which we want to be able + * to replace post-compile with the actual value. 
+ */ +struct brw_shader_reloc { + /** The 32-bit ID of the relocatable constant */ + uint32_t id; + + /** Type of this relocation */ + enum brw_shader_reloc_type type; + + /** The offset in the shader to the relocated value + * + * For MOV_IMM relocs, this is an offset to the MOV instruction. This + * allows us to do some sanity checking while we update the value. + */ + uint32_t offset; + + /** Value to be added to the relocated value before it is written */ + uint32_t delta; +}; + +/** A value to write to a relocation */ +struct brw_shader_reloc_value { + /** The 32-bit ID of the relocatable constant */ + uint32_t id; + + /** The value with which to replace the relocated immediate */ + uint32_t value; +}; + +struct brw_stage_prog_data { + struct brw_ubo_range ubo_ranges[4]; + + unsigned nr_params; /**< number of float params/constants */ + + gl_shader_stage stage; + + /* zero_push_reg is a bitfield which indicates what push registers (if any) + * should be zeroed by SW at the start of the shader. The corresponding + * push_reg_mask_param specifies the param index (in 32-bit units) where + * the actual runtime 64-bit mask will be pushed. The shader will zero + * push reg i if + * + * reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i) + * + * If this field is set, brw_compiler::compact_params must be false. + */ + uint64_t zero_push_reg; + unsigned push_reg_mask_param; + + unsigned curb_read_length; + unsigned total_scratch; + unsigned total_shared; + + unsigned program_size; + + unsigned const_data_size; + unsigned const_data_offset; + + unsigned num_relocs; + const struct brw_shader_reloc *relocs; + + /** Does this program pull from any UBO or other constant buffers? */ + bool has_ubo_pull; + + /** How many ray queries objects in this shader. */ + unsigned ray_queries; + + /** + * Register where the thread expects to find input data from the URB + * (typically uniforms, followed by vertex or fragment attributes). 
+ */ + unsigned dispatch_grf_start_reg; + + bool use_alt_mode; /**< Use ALT floating point mode? Otherwise, IEEE. */ + + /* 32-bit identifiers for all push/pull parameters. These can be anything + * the driver wishes them to be; the core of the back-end compiler simply + * re-arranges them. The one restriction is that the bottom 2^16 values + * are reserved for builtins defined in the brw_param_builtin enum defined + * above. + */ + uint32_t *param; + + /* Whether shader uses atomic operations. */ + bool uses_atomic_load_store; +}; + +static inline uint32_t * +brw_stage_prog_data_add_params(struct brw_stage_prog_data *prog_data, + unsigned nr_new_params) +{ + unsigned old_nr_params = prog_data->nr_params; + prog_data->nr_params += nr_new_params; + prog_data->param = reralloc(ralloc_parent(prog_data->param), + prog_data->param, uint32_t, + prog_data->nr_params); + return prog_data->param + old_nr_params; +} + +enum brw_barycentric_mode { + BRW_BARYCENTRIC_PERSPECTIVE_PIXEL = 0, + BRW_BARYCENTRIC_PERSPECTIVE_CENTROID = 1, + BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE = 2, + BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL = 3, + BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4, + BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE = 5, + BRW_BARYCENTRIC_MODE_COUNT = 6 +}; +#define BRW_BARYCENTRIC_PERSPECTIVE_BITS \ + ((1 << BRW_BARYCENTRIC_PERSPECTIVE_PIXEL) | \ + (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID) | \ + (1 << BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE)) +#define BRW_BARYCENTRIC_NONPERSPECTIVE_BITS \ + ((1 << BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \ + (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \ + (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE)) + +enum brw_pixel_shader_computed_depth_mode { + BRW_PSCDEPTH_OFF = 0, /* PS does not compute depth */ + BRW_PSCDEPTH_ON = 1, /* PS computes depth; no guarantee about value */ + BRW_PSCDEPTH_ON_GE = 2, /* PS guarantees output depth >= source depth */ + BRW_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */ +}; + +/* Data about a particular 
attempt to compile a program. Note that + * there can be many of these, each in a different GL state + * corresponding to a different brw_wm_prog_key struct, with different + * compiled programs. + */ +struct brw_wm_prog_data { + struct brw_stage_prog_data base; + + unsigned num_per_primitive_inputs; + unsigned num_varying_inputs; + + uint8_t reg_blocks_8; + uint8_t reg_blocks_16; + uint8_t reg_blocks_32; + + uint8_t dispatch_grf_start_reg_16; + uint8_t dispatch_grf_start_reg_32; + uint32_t prog_offset_16; + uint32_t prog_offset_32; + + struct { + /** @{ + * surface indices the WM-specific surfaces + */ + uint32_t render_target_read_start; + /** @} */ + } binding_table; + + uint8_t color_outputs_written; + uint8_t computed_depth_mode; + + /** + * Number of polygons handled in parallel by the multi-polygon PS + * kernel. + */ + uint8_t max_polygons; + + /** + * Dispatch width of the multi-polygon PS kernel, or 0 if no + * multi-polygon kernel was built. + */ + uint8_t dispatch_multi; + + bool computed_stencil; + bool early_fragment_tests; + bool post_depth_coverage; + bool inner_coverage; + bool dispatch_8; + bool dispatch_16; + bool dispatch_32; + bool dual_src_blend; + bool uses_pos_offset; + bool uses_omask; + bool uses_kill; + bool uses_src_depth; + bool uses_src_w; + bool uses_depth_w_coefficients; + bool uses_sample_mask; + bool uses_vmask; + bool has_render_target_reads; + bool has_side_effects; + bool pulls_bary; + + bool contains_flat_varying; + bool contains_noperspective_varying; + + /** True if the shader wants sample shading + * + * This corresponds to whether or not a gl_SampleId, gl_SamplePosition, or + * a sample-qualified input are used in the shader. It is independent of + * GL_MIN_SAMPLE_SHADING_VALUE in GL or minSampleShading in Vulkan. + */ + bool sample_shading; + + /** Should this shader be dispatched per-sample */ + enum brw_sometimes persample_dispatch; + + /** + * Shader is ran at the coarse pixel shading dispatch rate (3DSTATE_CPS). 
+ */ + enum brw_sometimes coarse_pixel_dispatch; + + /** + * Shader writes the SampleMask and this is AND-ed with the API's + * SampleMask to generate a new coverage mask. + */ + enum brw_sometimes alpha_to_coverage; + + unsigned msaa_flags_param; + + /** + * Mask of which interpolation modes are required by the fragment shader. + * Those interpolations are delivered as part of the thread payload. Used + * in hardware setup on gfx6+. + */ + uint32_t barycentric_interp_modes; + + /** + * Whether nonperspective interpolation modes are used by the + * barycentric_interp_modes or fragment shader through interpolator messages. + */ + bool uses_nonperspective_interp_modes; + + /** + * Mask of which FS inputs are marked flat by the shader source. This is + * needed for setting up 3DSTATE_SF/SBE. + */ + uint32_t flat_inputs; + + /** + * The FS inputs + */ + uint64_t inputs; + + /* Mapping of VUE slots to interpolation modes. + * Used by the Gfx4-5 clip/sf/wm stages. + */ + unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */ + + /** + * Map from gl_varying_slot to the position within the FS setup data + * payload where the varying's attribute vertex deltas should be delivered. + * For varying slots that are not used by the FS, the value is -1. + */ + int urb_setup[VARYING_SLOT_MAX]; + int urb_setup_channel[VARYING_SLOT_MAX]; + + /** + * Cache structure into the urb_setup array above that contains the + * attribute numbers of active varyings out of urb_setup. + * The actual count is stored in urb_setup_attribs_count. + */ + uint8_t urb_setup_attribs[VARYING_SLOT_MAX]; + uint8_t urb_setup_attribs_count; +}; + +#ifdef GFX_VERx10 + +#if GFX_VERx10 >= 200 + +/** Returns the SIMD width corresponding to a given KSP index + * + * The "Variable Pixel Dispatch" table in the PRM (which can be found, for + * example in Vol. 7 of the SKL PRM) has a mapping from dispatch widths to + * kernel start pointer (KSP) indices that is based on what dispatch widths + * are enabled. 
This function provides, effectively, the reverse mapping. + * + * If the given KSP is enabled, a SIMD width of 8, 16, or 32 is + * returned. Note that for a multipolygon dispatch kernel 8 is always + * returned, since multipolygon kernels use the "_8" fields from + * brw_wm_prog_data regardless of their SIMD width. If the KSP is + * invalid, 0 is returned. + */ +static inline unsigned +brw_fs_simd_width_for_ksp(unsigned ksp_idx, bool enabled, unsigned width_sel) +{ + assert(ksp_idx < 2); + return !enabled ? 0 : + width_sel ? 32 : + 16; +} + +#define brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx) \ + (ksp_idx == 0 && (wm_state).Kernel0MaximumPolysperThread ? 8 : \ + ksp_idx == 0 ? brw_fs_simd_width_for_ksp(ksp_idx, (wm_state).Kernel0Enable, \ + (wm_state).Kernel0SIMDWidth): \ + brw_fs_simd_width_for_ksp(ksp_idx, (wm_state).Kernel1Enable, \ + (wm_state).Kernel1SIMDWidth)) + +#else + +/** Returns the SIMD width corresponding to a given KSP index + * + * The "Variable Pixel Dispatch" table in the PRM (which can be found, for + * example in Vol. 7 of the SKL PRM) has a mapping from dispatch widths to + * kernel start pointer (KSP) indices that is based on what dispatch widths + * are enabled. This function provides, effectively, the reverse mapping. + * + * If the given KSP is valid with respect to the SIMD8/16/32 enables, a SIMD + * width of 8, 16, or 32 is returned. If the KSP is invalid, 0 is returned. + */ +static inline unsigned +brw_fs_simd_width_for_ksp(unsigned ksp_idx, bool simd8_enabled, + bool simd16_enabled, bool simd32_enabled) +{ + /* This function strictly ignores contiguous dispatch */ + switch (ksp_idx) { + case 0: + return simd8_enabled ? 8 : + (simd16_enabled && !simd32_enabled) ? 16 : + (simd32_enabled && !simd16_enabled) ? 32 : 0; + case 1: + return (simd32_enabled && (simd16_enabled || simd8_enabled)) ? 32 : 0; + case 2: + return (simd16_enabled && (simd32_enabled || simd8_enabled)) ? 
16 : 0; + default: + unreachable("Invalid KSP index"); + } +} + +#define brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx) \ + brw_fs_simd_width_for_ksp((ksp_idx), (wm_state)._8PixelDispatchEnable, \ + (wm_state)._16PixelDispatchEnable, \ + (wm_state)._32PixelDispatchEnable) + +#endif + +#endif + +#define brw_wm_state_has_ksp(wm_state, ksp_idx) \ + (brw_wm_state_simd_width_for_ksp((wm_state), (ksp_idx)) != 0) + +static inline uint32_t +_brw_wm_prog_data_prog_offset(const struct brw_wm_prog_data *prog_data, + unsigned simd_width) +{ + switch (simd_width) { + case 8: return 0; + case 16: return prog_data->prog_offset_16; + case 32: return prog_data->prog_offset_32; + default: return 0; + } +} + +#define brw_wm_prog_data_prog_offset(prog_data, wm_state, ksp_idx) \ + _brw_wm_prog_data_prog_offset(prog_data, \ + brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx)) + +static inline uint8_t +_brw_wm_prog_data_dispatch_grf_start_reg(const struct brw_wm_prog_data *prog_data, + unsigned simd_width) +{ + switch (simd_width) { + case 8: return prog_data->base.dispatch_grf_start_reg; + case 16: return prog_data->dispatch_grf_start_reg_16; + case 32: return prog_data->dispatch_grf_start_reg_32; + default: return 0; + } +} + +#define brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm_state, ksp_idx) \ + _brw_wm_prog_data_dispatch_grf_start_reg(prog_data, \ + brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx)) + +static inline uint8_t +_brw_wm_prog_data_reg_blocks(const struct brw_wm_prog_data *prog_data, + unsigned simd_width) +{ + switch (simd_width) { + case 8: return prog_data->reg_blocks_8; + case 16: return prog_data->reg_blocks_16; + case 32: return prog_data->reg_blocks_32; + default: return 0; + } +} + +#define brw_wm_prog_data_reg_blocks(prog_data, wm_state, ksp_idx) \ + _brw_wm_prog_data_reg_blocks(prog_data, \ + brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx)) + +static inline bool +brw_wm_prog_data_is_persample(const struct brw_wm_prog_data *prog_data, + enum 
intel_msaa_flags pushed_msaa_flags) +{ + if (pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC) { + if (!(pushed_msaa_flags & INTEL_MSAA_FLAG_MULTISAMPLE_FBO)) + return false; + + if (prog_data->sample_shading) + assert(pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH); + + if (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH) + assert(prog_data->persample_dispatch != BRW_NEVER); + else + assert(prog_data->persample_dispatch != BRW_ALWAYS); + + return (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH) != 0; + } + + assert(prog_data->persample_dispatch == BRW_ALWAYS || + prog_data->persample_dispatch == BRW_NEVER); + + return prog_data->persample_dispatch; +} + +static inline uint32_t +wm_prog_data_barycentric_modes(const struct brw_wm_prog_data *prog_data, + enum intel_msaa_flags pushed_msaa_flags) +{ + uint32_t modes = prog_data->barycentric_interp_modes; + + /* In the non dynamic case, we can just return the computed modes from + * compilation time. + */ + if (!(pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC)) + return modes; + + if (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_INTERP) { + assert(prog_data->persample_dispatch == BRW_ALWAYS || + (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH)); + + /* Making dynamic per-sample interpolation work is a bit tricky. The + * hardware will hang if SAMPLE is requested but per-sample dispatch is + * not enabled. This means we can't preemptively add SAMPLE to the + * barycentrics bitfield. Instead, we have to add it late and only + * on-demand. Annoyingly, changing the number of barycentrics requested + * changes the whole PS shader payload so we very much don't want to do + * that. Instead, if the dynamic per-sample interpolation flag is set, + * we check to see if SAMPLE was requested and, if not, replace the + * highest barycentric bit in the [non]perspective grouping (CENTROID, + * if it exists, else PIXEL) with SAMPLE. 
The shader will stomp all the + * barycentrics in the shader with SAMPLE so it really doesn't matter + * which one we replace. The important thing is that we keep the number + * of barycentrics in each [non]perspective grouping the same. + */ + if ((modes & BRW_BARYCENTRIC_PERSPECTIVE_BITS) && + !(modes & BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE))) { + int sample_mode = + util_last_bit(modes & BRW_BARYCENTRIC_PERSPECTIVE_BITS) - 1; + assert(modes & BITFIELD_BIT(sample_mode)); + + modes &= ~BITFIELD_BIT(sample_mode); + modes |= BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE); + } + + if ((modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) && + !(modes & BITFIELD_BIT(BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))) { + int sample_mode = + util_last_bit(modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1; + assert(modes & BITFIELD_BIT(sample_mode)); + + modes &= ~BITFIELD_BIT(sample_mode); + modes |= BITFIELD_BIT(BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE); + } + } else { + /* If we're not using per-sample interpolation, we need to disable the + * per-sample bits. + * + * SKL PRMs, Volume 2a: Command Reference: Instructions, + * 3DSTATE_WM:Barycentric Interpolation Mode: + + * "MSDISPMODE_PERSAMPLE is required in order to select Perspective + * Sample or Non-perspective Sample barycentric coordinates." 
+ */ + modes &= ~(BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE) | + BITFIELD_BIT(BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE)); + } + + return modes; +} + +static inline bool +brw_wm_prog_data_is_coarse(const struct brw_wm_prog_data *prog_data, + enum intel_msaa_flags pushed_msaa_flags) +{ + if (pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC) { + if (pushed_msaa_flags & INTEL_MSAA_FLAG_COARSE_RT_WRITES) + assert(prog_data->coarse_pixel_dispatch != BRW_NEVER); + else + assert(prog_data->coarse_pixel_dispatch != BRW_ALWAYS); + + return pushed_msaa_flags & INTEL_MSAA_FLAG_COARSE_RT_WRITES; + } + + assert(prog_data->coarse_pixel_dispatch == BRW_ALWAYS || + prog_data->coarse_pixel_dispatch == BRW_NEVER); + + return prog_data->coarse_pixel_dispatch; +} + +struct brw_push_const_block { + unsigned dwords; /* Dword count, not reg aligned */ + unsigned regs; + unsigned size; /* Bytes, register aligned */ +}; + +struct brw_cs_prog_data { + struct brw_stage_prog_data base; + + unsigned local_size[3]; + + /* Program offsets for the 8/16/32 SIMD variants. Multiple variants are + * kept when using variable group size, and the right one can only be + * decided at dispatch time. + */ + unsigned prog_offset[3]; + + /* Bitmask indicating which program offsets are valid. */ + unsigned prog_mask; + + /* Bitmask indicating which programs have spilled. 
*/ + unsigned prog_spilled; + + bool uses_barrier; + bool uses_num_work_groups; + bool uses_inline_data; + bool uses_btd_stack_ids; + bool uses_systolic; + uint8_t generate_local_id; + enum intel_compute_walk_order walk_order; + + struct { + struct brw_push_const_block cross_thread; + struct brw_push_const_block per_thread; + } push; + + struct { + /** @{ + * surface indices the CS-specific surfaces + */ + uint32_t work_groups_start; + /** @} */ + } binding_table; +}; + +static inline uint32_t +brw_cs_prog_data_prog_offset(const struct brw_cs_prog_data *prog_data, + unsigned dispatch_width) +{ + assert(dispatch_width == 8 || + dispatch_width == 16 || + dispatch_width == 32); + const unsigned index = dispatch_width / 16; + assert(prog_data->prog_mask & (1 << index)); + return prog_data->prog_offset[index]; +} + +struct brw_bs_prog_data { + struct brw_stage_prog_data base; + + /** SIMD size of the root shader */ + uint8_t simd_size; + + /** Maximum stack size of all shaders */ + uint32_t max_stack_size; + + /** Offset into the shader where the resume SBT is located */ + uint32_t resume_sbt_offset; + + /** Number of resume shaders */ + uint32_t num_resume_shaders; +}; + +struct brw_ff_gs_prog_data { + unsigned urb_read_length; + unsigned total_grf; + + /** + * Gfx6 transform feedback: Amount by which the streaming vertex buffer + * indices should be incremented each time the GS is invoked. + */ + unsigned svbi_postincrement_value; +}; + +/** + * Enum representing the i965-specific vertex results that don't correspond + * exactly to any element of gl_varying_slot. The values of this enum are + * assigned such that they don't conflict with gl_varying_slot. + */ +typedef enum +{ + BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX, + BRW_VARYING_SLOT_PAD, + /** + * Technically this is not a varying but just a placeholder that + * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord + * builtin variable to be compiled correctly. 
see compile_sf_prog() for + * more info. + */ + BRW_VARYING_SLOT_PNTC, + BRW_VARYING_SLOT_COUNT +} brw_varying_slot; + +/** + * We always program SF to start reading at an offset of 1 (2 varying slots) + * from the start of the vertex URB entry. This causes it to skip: + * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gfx4-5 + * - VARYING_SLOT_PSIZ and VARYING_SLOT_POS on gfx6+ + */ +#define BRW_SF_URB_ENTRY_READ_OFFSET 1 + +/** + * Bitmask indicating which fragment shader inputs represent varyings (and + * hence have to be delivered to the fragment shader by the SF/SBE stage). + */ +#define BRW_FS_VARYING_INPUT_MASK \ + (BITFIELD64_RANGE(0, VARYING_SLOT_MAX) & \ + ~VARYING_BIT_POS & ~VARYING_BIT_FACE) + +void brw_print_vue_map(FILE *fp, const struct intel_vue_map *vue_map, + gl_shader_stage stage); + +/** + * Convert a VUE slot number into a byte offset within the VUE. + */ +static inline unsigned brw_vue_slot_to_offset(unsigned slot) +{ + return 16*slot; +} + +/** + * Convert a vertex output (brw_varying_slot) into a byte offset within the + * VUE. + */ +static inline unsigned +brw_varying_to_offset(const struct intel_vue_map *vue_map, unsigned varying) +{ + return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]); +} + +void brw_compute_vue_map(const struct intel_device_info *devinfo, + struct intel_vue_map *vue_map, + uint64_t slots_valid, + bool separate_shader, + uint32_t pos_slots); + +void brw_compute_tess_vue_map(struct intel_vue_map *const vue_map, + uint64_t slots_valid, + uint32_t is_patch); + +/* brw_interpolation_map.c */ +void brw_setup_vue_interpolation(const struct intel_vue_map *vue_map, + struct nir_shader *nir, + struct brw_wm_prog_data *prog_data); + +struct brw_vue_prog_data { + struct brw_stage_prog_data base; + struct intel_vue_map vue_map; + + /** Should the hardware deliver input VUE handles for URB pull loads? 
*/ + bool include_vue_handles; + + unsigned urb_read_length; + unsigned total_grf; + + uint32_t clip_distance_mask; + uint32_t cull_distance_mask; + + /* Used for calculating urb partitions. In the VS, this is the size of the + * URB entry used for both input and output to the thread. In the GS, this + * is the size of the URB entry used for output. + */ + unsigned urb_entry_size; + + enum intel_shader_dispatch_mode dispatch_mode; +}; + +struct brw_vs_prog_data { + struct brw_vue_prog_data base; + + uint64_t inputs_read; + uint64_t double_inputs_read; + + unsigned nr_attribute_slots; + + bool uses_vertexid; + bool uses_instanceid; + bool uses_is_indexed_draw; + bool uses_firstvertex; + bool uses_baseinstance; + bool uses_drawid; +}; + +struct brw_tcs_prog_data +{ + struct brw_vue_prog_data base; + + /** Should the non-SINGLE_PATCH payload provide primitive ID? */ + bool include_primitive_id; + + /** Number vertices in output patch */ + int instances; + + /** Track patch count threshold */ + int patch_count_threshold; +}; + + +struct brw_tes_prog_data +{ + struct brw_vue_prog_data base; + + enum intel_tess_partitioning partitioning; + enum intel_tess_output_topology output_topology; + enum intel_tess_domain domain; + bool include_primitive_id; +}; + +struct brw_gs_prog_data +{ + struct brw_vue_prog_data base; + + unsigned vertices_in; + + /** + * Size of an output vertex, measured in HWORDS (32 bytes). + */ + unsigned output_vertex_size_hwords; + + unsigned output_topology; + + /** + * Size of the control data (cut bits or StreamID bits), in hwords (32 + * bytes). 0 if there is no control data. + */ + unsigned control_data_header_size_hwords; + + /** + * Format of the control data (either GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID + * if the control data is StreamID bits, or + * GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits). + * Ignored if control_data_header_size is 0. 
+ */ + unsigned control_data_format; + + bool include_primitive_id; + + /** + * The number of vertices emitted, if constant - otherwise -1. + */ + int static_vertex_count; + + int invocations; + + /** + * Gfx6: Provoking vertex convention for odd-numbered triangles + * in tristrips. + */ + unsigned pv_first:1; + + /** + * Gfx6: Number of varyings that are output to transform feedback. + */ + unsigned num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */ + + /** + * Gfx6: Map from the index of a transform feedback binding table entry to the + * gl_varying_slot that should be streamed out through that binding table + * entry. + */ + unsigned char transform_feedback_bindings[64 /* BRW_MAX_SOL_BINDINGS */]; + + /** + * Gfx6: Map from the index of a transform feedback binding table entry to the + * swizzles that should be used when streaming out data through that + * binding table entry. + */ + unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */]; +}; + +struct brw_sf_prog_data { + uint32_t urb_read_length; + uint32_t total_grf; + + /* Each vertex may have up to 12 attributes, 4 components each, + * except WPOS which requires only 2. (11*4 + 2) == 44 ==> 11 + * rows. + * + * Actually we use 4 for each, so call it 12 rows. + */ + unsigned urb_entry_size; +}; + +struct brw_clip_prog_data { + uint32_t curb_read_length; /* user planes? 
*/ + uint32_t clip_mode; + uint32_t urb_read_length; + uint32_t total_grf; +}; + +struct brw_tue_map { + uint32_t size_dw; + + uint32_t per_task_data_start_dw; +}; + +struct brw_mue_map { + int32_t start_dw[VARYING_SLOT_MAX]; + uint32_t len_dw[VARYING_SLOT_MAX]; + uint32_t per_primitive_indices_dw; + + uint32_t size_dw; + + uint32_t max_primitives; + uint32_t per_primitive_start_dw; + uint32_t per_primitive_header_size_dw; + uint32_t per_primitive_data_size_dw; + uint32_t per_primitive_pitch_dw; + bool user_data_in_primitive_header; + + uint32_t max_vertices; + uint32_t per_vertex_start_dw; + uint32_t per_vertex_header_size_dw; + uint32_t per_vertex_data_size_dw; + uint32_t per_vertex_pitch_dw; + bool user_data_in_vertex_header; +}; + +struct brw_task_prog_data { + struct brw_cs_prog_data base; + struct brw_tue_map map; + bool uses_drawid; +}; + +enum brw_mesh_index_format { + BRW_INDEX_FORMAT_U32, + BRW_INDEX_FORMAT_U888X, +}; + +struct brw_mesh_prog_data { + struct brw_cs_prog_data base; + struct brw_mue_map map; + + uint32_t clip_distance_mask; + uint32_t cull_distance_mask; + uint16_t primitive_type; + + enum brw_mesh_index_format index_format; + + bool uses_drawid; +}; + +/* brw_any_prog_data is prog_data for any stage that maps to an API stage */ +union brw_any_prog_data { + struct brw_stage_prog_data base; + struct brw_vue_prog_data vue; + struct brw_vs_prog_data vs; + struct brw_tcs_prog_data tcs; + struct brw_tes_prog_data tes; + struct brw_gs_prog_data gs; + struct brw_wm_prog_data wm; + struct brw_cs_prog_data cs; + struct brw_bs_prog_data bs; + struct brw_task_prog_data task; + struct brw_mesh_prog_data mesh; +}; + +#define DEFINE_PROG_DATA_DOWNCAST(STAGE, CHECK) \ +static inline struct brw_##STAGE##_prog_data * \ +brw_##STAGE##_prog_data(struct brw_stage_prog_data *prog_data) \ +{ \ + if (prog_data) \ + assert(CHECK); \ + return (struct brw_##STAGE##_prog_data *) prog_data; \ +} \ +static inline const struct brw_##STAGE##_prog_data * \ 
+brw_##STAGE##_prog_data_const(const struct brw_stage_prog_data *prog_data) \ +{ \ + if (prog_data) \ + assert(CHECK); \ + return (const struct brw_##STAGE##_prog_data *) prog_data; \ +} + +DEFINE_PROG_DATA_DOWNCAST(vs, prog_data->stage == MESA_SHADER_VERTEX) +DEFINE_PROG_DATA_DOWNCAST(tcs, prog_data->stage == MESA_SHADER_TESS_CTRL) +DEFINE_PROG_DATA_DOWNCAST(tes, prog_data->stage == MESA_SHADER_TESS_EVAL) +DEFINE_PROG_DATA_DOWNCAST(gs, prog_data->stage == MESA_SHADER_GEOMETRY) +DEFINE_PROG_DATA_DOWNCAST(wm, prog_data->stage == MESA_SHADER_FRAGMENT) +DEFINE_PROG_DATA_DOWNCAST(cs, gl_shader_stage_uses_workgroup(prog_data->stage)) +DEFINE_PROG_DATA_DOWNCAST(bs, brw_shader_stage_is_bindless(prog_data->stage)) + +DEFINE_PROG_DATA_DOWNCAST(vue, prog_data->stage == MESA_SHADER_VERTEX || + prog_data->stage == MESA_SHADER_TESS_CTRL || + prog_data->stage == MESA_SHADER_TESS_EVAL || + prog_data->stage == MESA_SHADER_GEOMETRY) + +DEFINE_PROG_DATA_DOWNCAST(task, prog_data->stage == MESA_SHADER_TASK) +DEFINE_PROG_DATA_DOWNCAST(mesh, prog_data->stage == MESA_SHADER_MESH) + +/* These are not really brw_stage_prog_data. */ +DEFINE_PROG_DATA_DOWNCAST(ff_gs, true) +DEFINE_PROG_DATA_DOWNCAST(clip, true) +DEFINE_PROG_DATA_DOWNCAST(sf, true) +#undef DEFINE_PROG_DATA_DOWNCAST + +struct brw_compile_stats { + uint32_t dispatch_width; /**< 0 for vec4 */ + uint32_t max_polygons; + uint32_t max_dispatch_width; + uint32_t instructions; + uint32_t sends; + uint32_t loops; + uint32_t cycles; + uint32_t spills; + uint32_t fills; + uint32_t max_live_registers; +}; + +/** @} */ + +struct brw_compiler * +brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo); + +/** + * Returns a compiler configuration for use with disk shader cache + * + * This value only needs to change for settings that can cause different + * program generation between two runs on the same hardware. 
+ * + * For example, it doesn't need to be different for gen 8 and gen 9 hardware, + * but it does need to be different if INTEL_DEBUG=nocompact is or isn't used. + */ +uint64_t +brw_get_compiler_config_value(const struct brw_compiler *compiler); + +/* Provides a string sha1 hash of all device information fields that could + * affect shader compilation. + */ +void +brw_device_sha1(char *hex, const struct intel_device_info *devinfo); + +/* For callers computing their own UUID or hash. Hashes all device + * information fields that could affect shader compilation into the provided + * sha1_ctx. + */ +void +brw_device_sha1_update(struct mesa_sha1 *sha1_ctx, + const struct intel_device_info *devinfo); + +unsigned +brw_prog_data_size(gl_shader_stage stage); + +unsigned +brw_prog_key_size(gl_shader_stage stage); + +struct brw_compile_params { + void *mem_ctx; + + nir_shader *nir; + + struct brw_compile_stats *stats; + + void *log_data; + + char *error_str; + + uint64_t debug_flag; + + uint32_t source_hash; +}; + +/** + * Parameters for compiling a vertex shader. + * + * Some of these will be modified during the shader compilation. + */ +struct brw_compile_vs_params { + struct brw_compile_params base; + + const struct brw_vs_prog_key *key; + struct brw_vs_prog_data *prog_data; + + bool edgeflag_is_last; /* true for gallium */ +}; + +/** + * Compile a vertex shader. + * + * Returns the final assembly and updates the parameters structure. + */ +const unsigned * +brw_compile_vs(const struct brw_compiler *compiler, + struct brw_compile_vs_params *params); + +/** + * Parameters for compiling a tessellation control shader. + * + * Some of these will be modified during the shader compilation. + */ +struct brw_compile_tcs_params { + struct brw_compile_params base; + + const struct brw_tcs_prog_key *key; + struct brw_tcs_prog_data *prog_data; +}; + +/** + * Compile a tessellation control shader. + * + * Returns the final assembly and updates the parameters structure. 
+ */ +const unsigned * +brw_compile_tcs(const struct brw_compiler *compiler, + struct brw_compile_tcs_params *params); + +/** + * Parameters for compiling a tessellation evaluation shader. + * + * Some of these will be modified during the shader compilation. + */ +struct brw_compile_tes_params { + struct brw_compile_params base; + + const struct brw_tes_prog_key *key; + struct brw_tes_prog_data *prog_data; + const struct intel_vue_map *input_vue_map; +}; + +/** + * Compile a tessellation evaluation shader. + * + * Returns the final assembly and updates the parameters structure. + */ +const unsigned * +brw_compile_tes(const struct brw_compiler *compiler, + struct brw_compile_tes_params *params); + +/** + * Parameters for compiling a geometry shader. + * + * Some of these will be modified during the shader compilation. + */ +struct brw_compile_gs_params { + struct brw_compile_params base; + + const struct brw_gs_prog_key *key; + struct brw_gs_prog_data *prog_data; +}; + +/** + * Compile a geometry shader. + * + * Returns the final assembly and updates the parameters structure. + */ +const unsigned * +brw_compile_gs(const struct brw_compiler *compiler, + struct brw_compile_gs_params *params); + +/** + * Compile a strips and fans shader. + * + * This is a fixed-function shader determined entirely by the shader key and + * a VUE map. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_sf(const struct brw_compiler *compiler, + void *mem_ctx, + const struct brw_sf_prog_key *key, + struct brw_sf_prog_data *prog_data, + struct intel_vue_map *vue_map, + unsigned *final_assembly_size); + +/** + * Compile a clipper shader. + * + * This is a fixed-function shader determined entirely by the shader key and + * a VUE map. + * + * Returns the final assembly and the program's size. 
+ */ +const unsigned * +brw_compile_clip(const struct brw_compiler *compiler, + void *mem_ctx, + const struct brw_clip_prog_key *key, + struct brw_clip_prog_data *prog_data, + struct intel_vue_map *vue_map, + unsigned *final_assembly_size); + +struct brw_compile_task_params { + struct brw_compile_params base; + + const struct brw_task_prog_key *key; + struct brw_task_prog_data *prog_data; +}; + +const unsigned * +brw_compile_task(const struct brw_compiler *compiler, + struct brw_compile_task_params *params); + +struct brw_compile_mesh_params { + struct brw_compile_params base; + + const struct brw_mesh_prog_key *key; + struct brw_mesh_prog_data *prog_data; + const struct brw_tue_map *tue_map; +}; + +const unsigned * +brw_compile_mesh(const struct brw_compiler *compiler, + struct brw_compile_mesh_params *params); + +/** + * Parameters for compiling a fragment shader. + * + * Some of these will be modified during the shader compilation. + */ +struct brw_compile_fs_params { + struct brw_compile_params base; + + const struct brw_wm_prog_key *key; + struct brw_wm_prog_data *prog_data; + + const struct intel_vue_map *vue_map; + const struct brw_mue_map *mue_map; + + bool allow_spilling; + bool use_rep_send; + uint8_t max_polygons; +}; + +/** + * Compile a fragment shader. + * + * Returns the final assembly and updates the parameters structure. + */ +const unsigned * +brw_compile_fs(const struct brw_compiler *compiler, + struct brw_compile_fs_params *params); + +/** + * Parameters for compiling a compute shader. + * + * Some of these will be modified during the shader compilation. + */ +struct brw_compile_cs_params { + struct brw_compile_params base; + + const struct brw_cs_prog_key *key; + struct brw_cs_prog_data *prog_data; +}; + +/** + * Compile a compute shader. + * + * Returns the final assembly and updates the parameters structure. 
+ */ +const unsigned * +brw_compile_cs(const struct brw_compiler *compiler, + struct brw_compile_cs_params *params); + +/** + * Parameters for compiling a Bindless shader. + * + * Some of these will be modified during the shader compilation. + */ +struct brw_compile_bs_params { + struct brw_compile_params base; + + const struct brw_bs_prog_key *key; + struct brw_bs_prog_data *prog_data; + + unsigned num_resume_shaders; + struct nir_shader **resume_shaders; +}; + +/** + * Compile a Bindless shader. + * + * Returns the final assembly and updates the parameters structure. + */ +const unsigned * +brw_compile_bs(const struct brw_compiler *compiler, + struct brw_compile_bs_params *params); + +/** + * Compile a fixed function geometry shader. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_ff_gs_prog(struct brw_compiler *compiler, + void *mem_ctx, + const struct brw_ff_gs_prog_key *key, + struct brw_ff_gs_prog_data *prog_data, + struct intel_vue_map *vue_map, + unsigned *final_assembly_size); + +void brw_debug_key_recompile(const struct brw_compiler *c, void *log, + gl_shader_stage stage, + const struct brw_base_prog_key *old_key, + const struct brw_base_prog_key *key); + +/* Shared Local Memory Size is specified as powers of two, + * and also have a Gen-dependent minimum value if not zero. + */ +static inline uint32_t +intel_calculate_slm_size(unsigned gen, uint32_t bytes) +{ + assert(bytes <= 64 * 1024); + if (bytes > 0) + return MAX2(util_next_power_of_two(bytes), gen >= 9 ? 
1024 : 4096); + else + return 0; +} + +static inline uint32_t +encode_slm_size(unsigned gen, uint32_t bytes) +{ + uint32_t slm_size = 0; + + /* Shared Local Memory is specified as powers of two, and encoded in + * INTERFACE_DESCRIPTOR_DATA with the following representations: + * + * Size | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB | + * ------------------------------------------------------------------- + * Gfx7-8 | 0 | none | none | 1 | 2 | 4 | 8 | 16 | + * ------------------------------------------------------------------- + * Gfx9+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + */ + + if (bytes > 0) { + slm_size = intel_calculate_slm_size(gen, bytes); + assert(util_is_power_of_two_nonzero(slm_size)); + + if (gen >= 9) { + /* Turn an exponent of 10 (1024 kB) into 1. */ + assert(slm_size >= 1024); + slm_size = ffs(slm_size) - 10; + } else { + assert(slm_size >= 4096); + /* Convert to the pre-Gfx9 representation. */ + slm_size = slm_size / 4096; + } + } + + return slm_size; +} + +unsigned +brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data, + unsigned threads); + +void +brw_write_shader_relocs(const struct brw_isa_info *isa, + void *program, + const struct brw_stage_prog_data *prog_data, + struct brw_shader_reloc_value *values, + unsigned num_values); + +/** + * Get the dispatch information for a shader to be used with GPGPU_WALKER and + * similar instructions. + * + * If override_local_size is not NULL, it must to point to a 3-element that + * will override the value from prog_data->local_size. This is used by + * ARB_compute_variable_group_size, where the size is set only at dispatch + * time (so prog_data is outdated). 
+ */ +struct intel_cs_dispatch_info +brw_cs_get_dispatch_info(const struct intel_device_info *devinfo, + const struct brw_cs_prog_data *prog_data, + const unsigned *override_local_size); + +/** + * Return true if the given shader stage is dispatched contiguously by the + * relevant fixed function starting from channel 0 of the SIMD thread, which + * implies that the dispatch mask of a thread can be assumed to have the form + * '2^n - 1' for some n. + */ +static inline bool +brw_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo, + gl_shader_stage stage, unsigned max_polygons, + const struct brw_stage_prog_data *prog_data) +{ + /* The code below makes assumptions about the hardware's thread dispatch + * behavior that could be proven wrong in future generations -- Make sure + * to do a full test run with brw_fs_test_dispatch_packing() hooked up to + * the NIR front-end before changing this assertion. + */ + assert(devinfo->ver <= 12); + + switch (stage) { + case MESA_SHADER_FRAGMENT: { + /* The PSD discards subspans coming in with no lit samples, which in the + * per-pixel shading case implies that each subspan will either be fully + * lit (due to the VMask being used to allow derivative computations), + * or not dispatched at all. In per-sample dispatch mode individual + * samples from the same subspan have a fixed relative location within + * the SIMD thread, so dispatch of unlit samples cannot be avoided in + * general and we should return false. 
+ */ + const struct brw_wm_prog_data *wm_prog_data = + (const struct brw_wm_prog_data *)prog_data; + return devinfo->verx10 < 125 && + !wm_prog_data->persample_dispatch && + wm_prog_data->uses_vmask && + max_polygons < 2; + } + case MESA_SHADER_COMPUTE: + /* Compute shaders will be spawned with either a fully enabled dispatch + * mask or with whatever bottom/right execution mask was given to the + * GPGPU walker command to be used along the workgroup edges -- In both + * cases the dispatch mask is required to be tightly packed for our + * invocation index calculations to work. + */ + return true; + default: + /* Most remaining fixed functions are limited to use a packed dispatch + * mask due to the hardware representation of the dispatch mask as a + * single counter representing the number of enabled channels. + */ + return true; + } +} + +/** + * Computes the first varying slot in the URB produced by the previous stage + * that is used in the next stage. We do this by testing the varying slots in + * the previous stage's vue map against the inputs read in the next stage. + * + * Note that: + * + * - Each URB offset contains two varying slots and we can only skip a + * full offset if both slots are unused, so the value we return here is always + * rounded down to the closest multiple of two. + * + * - gl_Layer and gl_ViewportIndex don't have their own varying slots, they are + * part of the vue header, so if these are read we can't skip anything. 
+ */ +static inline int +brw_compute_first_urb_slot_required(uint64_t inputs_read, + const struct intel_vue_map *prev_stage_vue_map) +{ + if ((inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PRIMITIVE_SHADING_RATE)) == 0) { + for (int i = 0; i < prev_stage_vue_map->num_slots; i++) { + int varying = prev_stage_vue_map->slot_to_varying[i]; + if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0) + return ROUND_DOWN_TO(i, 2); + } + } + + return 0; +} + +/* From InlineData in 3DSTATE_TASK_SHADER_DATA and 3DSTATE_MESH_SHADER_DATA. */ +#define BRW_TASK_MESH_INLINE_DATA_SIZE_DW 8 + +/* InlineData[0-1] is used for Vulkan descriptor. */ +#define BRW_TASK_MESH_PUSH_CONSTANTS_START_DW 2 + +#define BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW \ + (BRW_TASK_MESH_INLINE_DATA_SIZE_DW - BRW_TASK_MESH_PUSH_CONSTANTS_START_DW) + +/** + * This enum is used as the base indice of the nir_load_topology_id_intel + * intrinsic. This is used to return different values based on some aspect of + * the topology of the device. + */ +enum brw_topology_id +{ + /* A value based of the DSS identifier the shader is currently running on. + * Be mindful that the DSS ID can be higher than the total number of DSS on + * the device. This is because of the fusing that can occur on different + * parts. + */ + BRW_TOPOLOGY_ID_DSS, + + /* A value composed of EU ID, thread ID & SIMD lane ID. 
*/ + BRW_TOPOLOGY_ID_EU_THREAD_SIMD, +}; + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* BRW_COMPILER_H */ diff --git a/src/intel/compiler/elk/brw_dead_control_flow.cpp b/src/intel/compiler/elk/brw_dead_control_flow.cpp new file mode 100644 index 00000000000..0d9253bab18 --- /dev/null +++ b/src/intel/compiler/elk/brw_dead_control_flow.cpp @@ -0,0 +1,121 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_dead_control_flow.cpp + * + * This file implements the dead control flow elimination optimization pass. 
+ */ + +#include "brw_shader.h" +#include "brw_cfg.h" + +using namespace brw; + +/* Look for and eliminate dead control flow: + * + * - if/endif + * - else in else/endif + * - then in if/else/endif + */ +bool +dead_control_flow_eliminate(backend_shader *s) +{ + bool progress = false; + + foreach_block_safe (block, s->cfg) { + bblock_t *prev_block = block->prev(); + + if (!prev_block) + continue; + + backend_instruction *const inst = block->start(); + backend_instruction *const prev_inst = prev_block->end(); + + /* ENDIF instructions, by definition, can only be found at the start of + * basic blocks. + */ + if (inst->opcode == BRW_OPCODE_ENDIF && + prev_inst->opcode == BRW_OPCODE_ELSE) { + bblock_t *const else_block = prev_block; + backend_instruction *const else_inst = prev_inst; + + else_inst->remove(else_block); + progress = true; + } else if (inst->opcode == BRW_OPCODE_ENDIF && + prev_inst->opcode == BRW_OPCODE_IF) { + bblock_t *const endif_block = block; + bblock_t *const if_block = prev_block; + backend_instruction *const endif_inst = inst; + backend_instruction *const if_inst = prev_inst; + + bblock_t *earlier_block = NULL, *later_block = NULL; + + if (if_block->start_ip == if_block->end_ip) { + earlier_block = if_block->prev(); + } else { + earlier_block = if_block; + } + if_inst->remove(if_block); + + if (endif_block->start_ip == endif_block->end_ip) { + later_block = endif_block->next(); + } else { + later_block = endif_block; + } + endif_inst->remove(endif_block); + + assert((earlier_block == NULL) == (later_block == NULL)); + if (earlier_block && earlier_block->can_combine_with(later_block)) { + earlier_block->combine_with(later_block); + + /* If ENDIF was in its own block, then we've now deleted it and + * merged the two surrounding blocks, the latter of which the + * __next block pointer was pointing to. 
+ */ + if (endif_block != later_block) { + __next = earlier_block->next(); + } + } + + progress = true; + } else if (inst->opcode == BRW_OPCODE_ELSE && + prev_inst->opcode == BRW_OPCODE_IF) { + bblock_t *const else_block = block; + backend_instruction *const if_inst = prev_inst; + backend_instruction *const else_inst = inst; + + /* Since the else-branch is becoming the new then-branch, the + * condition has to be inverted. + */ + if_inst->predicate_inverse = !if_inst->predicate_inverse; + else_inst->remove(else_block); + + progress = true; + } + } + + if (progress) + s->invalidate_analysis(DEPENDENCY_BLOCKS | DEPENDENCY_INSTRUCTIONS); + + return progress; +} diff --git a/src/intel/compiler/elk/brw_dead_control_flow.h b/src/intel/compiler/elk/brw_dead_control_flow.h new file mode 100644 index 00000000000..9732c2b9f3f --- /dev/null +++ b/src/intel/compiler/elk/brw_dead_control_flow.h @@ -0,0 +1,31 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_DEAD_CONTROL_FLOW_H +#define BRW_DEAD_CONTROL_FLOW_H + +#include "brw_shader.h" + +bool dead_control_flow_eliminate(backend_shader *s); + +#endif /* BRW_DEAD_CONTROL_FLOW_H */ diff --git a/src/intel/compiler/elk/brw_debug_recompile.c b/src/intel/compiler/elk/brw_debug_recompile.c new file mode 100644 index 00000000000..6e055e09f7c --- /dev/null +++ b/src/intel/compiler/elk/brw_debug_recompile.c @@ -0,0 +1,238 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +/** + * @file brw_debug_recompiles.c + */ + +#include + +#include "brw_compiler.h" + +static bool +key_debug(const struct brw_compiler *c, void *log, + const char *name, int a, int b) +{ + if (a != b) { + brw_shader_perf_log(c, log, " %s %d->%d\n", name, a, b); + return true; + } + return false; +} + +static bool +key_debug_float(const struct brw_compiler *c, void *log, + const char *name, float a, float b) +{ + if (a != b) { + brw_shader_perf_log(c, log, " %s %f->%f\n", name, a, b); + return true; + } + return false; +} + +#define check(name, field) \ + key_debug(c, log, name, old_key->field, key->field) +#define check_float(name, field) \ + key_debug_float(c, log, name, old_key->field, key->field) + +static bool +debug_sampler_recompile(const struct brw_compiler *c, void *log, + const struct brw_sampler_prog_key_data *old_key, + const struct brw_sampler_prog_key_data *key) +{ + bool found = false; + + found |= check("gather channel quirk", gather_channel_quirk_mask); + + for (unsigned i = 0; i < BRW_MAX_SAMPLERS; i++) { + found |= check("EXT_texture_swizzle or DEPTH_TEXTURE_MODE", swizzles[i]); + found |= check("textureGather workarounds", gfx6_gather_wa[i]); + } + + for (unsigned i = 0; i < 3; i++) { + found |= check("GL_CLAMP enabled on any texture unit", gl_clamp_mask[i]); + } + + return found; +} + +static bool +debug_base_recompile(const struct brw_compiler *c, void *log, + const struct brw_base_prog_key *old_key, + const struct brw_base_prog_key *key) +{ + return debug_sampler_recompile(c, log, &old_key->tex, &key->tex); +} + +static void +debug_vs_recompile(const struct brw_compiler *c, void *log, + const struct brw_vs_prog_key *old_key, + const struct brw_vs_prog_key *key) +{ + bool found = debug_base_recompile(c, log, &old_key->base, &key->base); + + for (unsigned i = 0; i < VERT_ATTRIB_MAX; i++) { + found |= check("vertex attrib w/a flags", gl_attrib_wa_flags[i]); + } + + found |= check("legacy user clipping", nr_userclip_plane_consts); + found 
|= check("copy edgeflag", copy_edgeflag); + found |= check("pointcoord replace", point_coord_replace); + found |= check("vertex color clamping", clamp_vertex_color); + + if (!found) { + brw_shader_perf_log(c, log, " something else\n"); + } +} + +static void +debug_tcs_recompile(const struct brw_compiler *c, void *log, + const struct brw_tcs_prog_key *old_key, + const struct brw_tcs_prog_key *key) +{ + bool found = debug_base_recompile(c, log, &old_key->base, &key->base); + + found |= check("input vertices", input_vertices); + found |= check("outputs written", outputs_written); + found |= check("patch outputs written", patch_outputs_written); + found |= check("tes primitive mode", _tes_primitive_mode); + found |= check("quads and equal_spacing workaround", quads_workaround); + + if (!found) { + brw_shader_perf_log(c, log, " something else\n"); + } +} + +static void +debug_tes_recompile(const struct brw_compiler *c, void *log, + const struct brw_tes_prog_key *old_key, + const struct brw_tes_prog_key *key) +{ + bool found = debug_base_recompile(c, log, &old_key->base, &key->base); + + found |= check("inputs read", inputs_read); + found |= check("patch inputs read", patch_inputs_read); + + if (!found) { + brw_shader_perf_log(c, log, " something else\n"); + } +} + +static void +debug_gs_recompile(const struct brw_compiler *c, void *log, + const struct brw_gs_prog_key *old_key, + const struct brw_gs_prog_key *key) +{ + bool found = debug_base_recompile(c, log, &old_key->base, &key->base); + + if (!found) { + brw_shader_perf_log(c, log, " something else\n"); + } +} + +static void +debug_fs_recompile(const struct brw_compiler *c, void *log, + const struct brw_wm_prog_key *old_key, + const struct brw_wm_prog_key *key) +{ + bool found = false; + + found |= check("alphatest, computed depth, depth test, or depth write", + iz_lookup); + found |= check("depth statistics", stats_wm); + found |= check("flat shading", flat_shade); + found |= check("number of color buffers", 
nr_color_regions); + found |= check("MRT alpha test", alpha_test_replicate_alpha); + found |= check("alpha to coverage", alpha_to_coverage); + found |= check("fragment color clamping", clamp_fragment_color); + found |= check("per-sample interpolation", persample_interp); + found |= check("multisampled FBO", multisample_fbo); + found |= check("line smoothing", line_aa); + found |= check("force dual color blending", force_dual_color_blend); + found |= check("coherent fb fetch", coherent_fb_fetch); + found |= check("ignore sample mask out", ignore_sample_mask_out); + found |= check("coarse pixel", coarse_pixel); + + found |= check("input slots valid", input_slots_valid); + found |= check("mrt alpha test function", alpha_test_func); + found |= check("mrt alpha test reference value", alpha_test_ref); + + found |= debug_base_recompile(c, log, &old_key->base, &key->base); + + if (!found) { + brw_shader_perf_log(c, log, " something else\n"); + } +} + +static void +debug_cs_recompile(const struct brw_compiler *c, void *log, + const struct brw_cs_prog_key *old_key, + const struct brw_cs_prog_key *key) +{ + bool found = debug_base_recompile(c, log, &old_key->base, &key->base); + + if (!found) { + brw_shader_perf_log(c, log, " something else\n"); + } +} + +void +brw_debug_key_recompile(const struct brw_compiler *c, void *log, + gl_shader_stage stage, + const struct brw_base_prog_key *old_key, + const struct brw_base_prog_key *key) +{ + if (!old_key) { + brw_shader_perf_log(c, log, " No previous compile found...\n"); + return; + } + + switch (stage) { + case MESA_SHADER_VERTEX: + debug_vs_recompile(c, log, (const struct brw_vs_prog_key *)old_key, + (const struct brw_vs_prog_key *)key); + break; + case MESA_SHADER_TESS_CTRL: + debug_tcs_recompile(c, log, (const struct brw_tcs_prog_key *)old_key, + (const struct brw_tcs_prog_key *)key); + break; + case MESA_SHADER_TESS_EVAL: + debug_tes_recompile(c, log, (const struct brw_tes_prog_key *)old_key, + (const struct brw_tes_prog_key 
*)key); + break; + case MESA_SHADER_GEOMETRY: + debug_gs_recompile(c, log, (const struct brw_gs_prog_key *)old_key, + (const struct brw_gs_prog_key *)key); + break; + case MESA_SHADER_FRAGMENT: + debug_fs_recompile(c, log, (const struct brw_wm_prog_key *)old_key, + (const struct brw_wm_prog_key *)key); + break; + case MESA_SHADER_COMPUTE: + debug_cs_recompile(c, log, (const struct brw_cs_prog_key *)old_key, + (const struct brw_cs_prog_key *)key); + break; + default: + break; + } +} diff --git a/src/intel/compiler/elk/brw_device_sha1_gen_c.py b/src/intel/compiler/elk/brw_device_sha1_gen_c.py new file mode 100755 index 00000000000..06aaa3b5478 --- /dev/null +++ b/src/intel/compiler/elk/brw_device_sha1_gen_c.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +COPYRIGHT = """\ +/* + * Copyright 2024 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +""" + +import argparse +import os +import sys + +from mako.template import Template +from mako import exceptions + +sys.path.append(f"{os.path.dirname(sys.argv[0])}/../dev") +import intel_device_info + +template = COPYRIGHT + """ + +/* DO NOT EDIT - This file generated automatically by intel_device_serialize_c.py script */ + +#include "dev/intel_device_info.h" +#include "brw_compiler.h" +#define SHA_UPDATE_FIELD(field) _mesa_sha1_update(ctx, &devinfo->field, sizeof(devinfo->field)) + +void +brw_device_sha1_update(struct mesa_sha1 *ctx, + const struct intel_device_info *devinfo) { +% for member in compiler_fields: + SHA_UPDATE_FIELD(${member.name}); +% endfor +} + +#undef SHA_UPDATE_FIELD + +""" + +def main(): + """print intel_device_serialize.c at the specified path""" + parser = argparse.ArgumentParser() + parser.add_argument('--outdir', required=True, + help='Directory to put the generated files in') + args = parser.parse_args() + path = os.path.join(args.outdir, 'brw_device_sha1_gen.c') + device_members = intel_device_info.TYPES_BY_NAME["intel_device_info"].members + compiler_fields = [field for field in device_members if field.compiler_field] + with open(path, 'w', encoding='utf-8') as f: + try: + f.write(Template(template).render(compiler_fields=compiler_fields)) + except: + print(exceptions.text_error_template().render(compiler_fields=compiler_fields)) + +if __name__ == "__main__": + main() diff --git a/src/intel/compiler/elk/brw_disasm.c b/src/intel/compiler/elk/brw_disasm.c new file mode 100644 index 00000000000..b70ee663a9a --- /dev/null +++ b/src/intel/compiler/elk/brw_disasm.c @@ -0,0 +1,2887 @@ +/* + * Copyright © 2008 Keith Packard + * Copyright © 2014 Intel Corporation + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that copyright + * notice and this permission notice 
appear in supporting documentation, and + * that the name of the copyright holders not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. The copyright holders make no representations + * about the suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THIS SOFTWARE. + */ + +#include +#include +#include +#include + +#include "brw_disasm.h" +#include "brw_disasm_info.h" +#include "brw_eu_defines.h" +#include "brw_eu.h" +#include "brw_inst.h" +#include "brw_isa_info.h" +#include "brw_reg.h" +#include "brw_shader.h" +#include "util/half_float.h" + +bool +brw_has_jip(const struct intel_device_info *devinfo, enum opcode opcode) +{ + if (devinfo->ver < 6) + return false; + + return opcode == BRW_OPCODE_IF || + opcode == BRW_OPCODE_ELSE || + opcode == BRW_OPCODE_ENDIF || + opcode == BRW_OPCODE_WHILE || + opcode == BRW_OPCODE_BREAK || + opcode == BRW_OPCODE_CONTINUE || + opcode == BRW_OPCODE_HALT; +} + +bool +brw_has_uip(const struct intel_device_info *devinfo, enum opcode opcode) +{ + if (devinfo->ver < 6) + return false; + + return (devinfo->ver >= 7 && opcode == BRW_OPCODE_IF) || + (devinfo->ver >= 8 && opcode == BRW_OPCODE_ELSE) || + opcode == BRW_OPCODE_BREAK || + opcode == BRW_OPCODE_CONTINUE || + opcode == BRW_OPCODE_HALT; +} + +static bool +has_branch_ctrl(const struct intel_device_info *devinfo, enum opcode opcode) +{ + if (devinfo->ver < 8) + return false; + + 
return opcode == BRW_OPCODE_IF ||
+          opcode == BRW_OPCODE_ELSE;
+          /* opcode == BRW_OPCODE_GOTO; */
+}
+
+static bool
+is_logic_instruction(unsigned opcode)
+{
+   return opcode == BRW_OPCODE_AND ||
+          opcode == BRW_OPCODE_NOT ||
+          opcode == BRW_OPCODE_OR ||
+          opcode == BRW_OPCODE_XOR;
+}
+
+static bool
+is_send(unsigned opcode)
+{
+   return opcode == BRW_OPCODE_SEND ||
+          opcode == BRW_OPCODE_SENDC ||
+          opcode == BRW_OPCODE_SENDS ||
+          opcode == BRW_OPCODE_SENDSC;
+}
+
+static bool
+is_split_send(const struct intel_device_info *devinfo, unsigned opcode) /* devinfo is read below; stale UNUSED marker dropped */
+{
+   if (devinfo->ver >= 12)
+      return is_send(opcode); /* Gfx12+: all sends use the split-send encoding */
+   else
+      return opcode == BRW_OPCODE_SENDS ||
+             opcode == BRW_OPCODE_SENDSC;
+}
+
+const char *const conditional_modifier[16] = {
+   [BRW_CONDITIONAL_NONE] = "",
+   [BRW_CONDITIONAL_Z]    = ".z",
+   [BRW_CONDITIONAL_NZ]   = ".nz",
+   [BRW_CONDITIONAL_G]    = ".g",
+   [BRW_CONDITIONAL_GE]   = ".ge",
+   [BRW_CONDITIONAL_L]    = ".l",
+   [BRW_CONDITIONAL_LE]   = ".le",
+   [BRW_CONDITIONAL_R]    = ".r",
+   [BRW_CONDITIONAL_O]    = ".o",
+   [BRW_CONDITIONAL_U]    = ".u",
+};
+
+static const char *const m_negate[2] = {
+   [0] = "",
+   [1] = "-",
+};
+
+static const char *const _abs[2] = {
+   [0] = "",
+   [1] = "(abs)",
+};
+
+static const char *const m_bitnot[2] = { "", "~" };
+
+static const char *const vert_stride[16] = {
+   [0] = "0",
+   [1] = "1",
+   [2] = "2",
+   [3] = "4",
+   [4] = "8",
+   [5] = "16",
+   [6] = "32",
+   [15] = "VxH",
+};
+
+static const char *const width[8] = {
+   [0] = "1",
+   [1] = "2",
+   [2] = "4",
+   [3] = "8",
+   [4] = "16",
+};
+
+static const char *const horiz_stride[4] = {
+   [0] = "0",
+   [1] = "1",
+   [2] = "2",
+   [3] = "4"
+};
+
+static const char *const chan_sel[4] = {
+   [0] = "x",
+   [1] = "y",
+   [2] = "z",
+   [3] = "w",
+};
+
+static const char *const debug_ctrl[2] = {
+   [0] = "",
+   [1] = ".breakpoint"
+};
+
+static const char *const saturate[2] = {
+   [0] = "",
+   [1] = ".sat"
+};
+
+static const char *const cmpt_ctrl[2] = {
+   [0] = "",
+   [1] = "compacted"
+};
+
+static const char *const
accwr[2] = { + [0] = "", + [1] = "AccWrEnable" +}; + +static const char *const branch_ctrl[2] = { + [0] = "", + [1] = "BranchCtrl" +}; + +static const char *const wectrl[2] = { + [0] = "", + [1] = "WE_all" +}; + +static const char *const exec_size[8] = { + [0] = "1", + [1] = "2", + [2] = "4", + [3] = "8", + [4] = "16", + [5] = "32" +}; + +static const char *const pred_inv[2] = { + [0] = "+", + [1] = "-" +}; + +const char *const pred_ctrl_align16[16] = { + [1] = "", + [2] = ".x", + [3] = ".y", + [4] = ".z", + [5] = ".w", + [6] = ".any4h", + [7] = ".all4h", +}; + +static const char *const pred_ctrl_align1[16] = { + [BRW_PREDICATE_NORMAL] = "", + [BRW_PREDICATE_ALIGN1_ANYV] = ".anyv", + [BRW_PREDICATE_ALIGN1_ALLV] = ".allv", + [BRW_PREDICATE_ALIGN1_ANY2H] = ".any2h", + [BRW_PREDICATE_ALIGN1_ALL2H] = ".all2h", + [BRW_PREDICATE_ALIGN1_ANY4H] = ".any4h", + [BRW_PREDICATE_ALIGN1_ALL4H] = ".all4h", + [BRW_PREDICATE_ALIGN1_ANY8H] = ".any8h", + [BRW_PREDICATE_ALIGN1_ALL8H] = ".all8h", + [BRW_PREDICATE_ALIGN1_ANY16H] = ".any16h", + [BRW_PREDICATE_ALIGN1_ALL16H] = ".all16h", + [BRW_PREDICATE_ALIGN1_ANY32H] = ".any32h", + [BRW_PREDICATE_ALIGN1_ALL32H] = ".all32h", +}; + +static const char *const xe2_pred_ctrl[4] = { + [BRW_PREDICATE_NORMAL] = "", + [XE2_PREDICATE_ANY] = ".any", + [XE2_PREDICATE_ALL] = ".all", +}; + +static const char *const thread_ctrl[4] = { + [BRW_THREAD_NORMAL] = "", + [BRW_THREAD_ATOMIC] = "atomic", + [BRW_THREAD_SWITCH] = "switch", +}; + +static const char *const compr_ctrl[4] = { + [0] = "", + [1] = "sechalf", + [2] = "compr", + [3] = "compr4", +}; + +static const char *const dep_ctrl[4] = { + [0] = "", + [1] = "NoDDClr", + [2] = "NoDDChk", + [3] = "NoDDClr,NoDDChk", +}; + +static const char *const mask_ctrl[4] = { + [0] = "", + [1] = "nomask", +}; + +static const char *const access_mode[2] = { + [0] = "align1", + [1] = "align16", +}; + +static const char *const reg_file[4] = { + [0] = "A", + [1] = "g", + [2] = "m", + [3] = "imm", +}; + +static const char 
*const writemask[16] = { + [0x0] = ".", + [0x1] = ".x", + [0x2] = ".y", + [0x3] = ".xy", + [0x4] = ".z", + [0x5] = ".xz", + [0x6] = ".yz", + [0x7] = ".xyz", + [0x8] = ".w", + [0x9] = ".xw", + [0xa] = ".yw", + [0xb] = ".xyw", + [0xc] = ".zw", + [0xd] = ".xzw", + [0xe] = ".yzw", + [0xf] = "", +}; + +static const char *const end_of_thread[2] = { + [0] = "", + [1] = "EOT" +}; + +/* SFIDs on Gfx4-5 */ +static const char *const gfx4_sfid[16] = { + [BRW_SFID_NULL] = "null", + [BRW_SFID_MATH] = "math", + [BRW_SFID_SAMPLER] = "sampler", + [BRW_SFID_MESSAGE_GATEWAY] = "gateway", + [BRW_SFID_DATAPORT_READ] = "read", + [BRW_SFID_DATAPORT_WRITE] = "write", + [BRW_SFID_URB] = "urb", + [BRW_SFID_THREAD_SPAWNER] = "thread_spawner", + [BRW_SFID_VME] = "vme", +}; + +static const char *const gfx6_sfid[16] = { + [BRW_SFID_NULL] = "null", + [BRW_SFID_MATH] = "math", + [BRW_SFID_SAMPLER] = "sampler", + [BRW_SFID_MESSAGE_GATEWAY] = "gateway", + [BRW_SFID_URB] = "urb", + [BRW_SFID_THREAD_SPAWNER] = "thread_spawner", + [GFX6_SFID_DATAPORT_SAMPLER_CACHE] = "dp_sampler", + [GFX6_SFID_DATAPORT_RENDER_CACHE] = "render", + [GFX6_SFID_DATAPORT_CONSTANT_CACHE] = "const", + [GFX7_SFID_DATAPORT_DATA_CACHE] = "data", + [GFX7_SFID_PIXEL_INTERPOLATOR] = "pixel interp", + [HSW_SFID_DATAPORT_DATA_CACHE_1] = "dp data 1", + [HSW_SFID_CRE] = "cre", + [GEN_RT_SFID_RAY_TRACE_ACCELERATOR] = "rt accel", + [GFX12_SFID_SLM] = "slm", + [GFX12_SFID_TGM] = "tgm", + [GFX12_SFID_UGM] = "ugm", +}; + +static const char *const gfx7_gateway_subfuncid[8] = { + [BRW_MESSAGE_GATEWAY_SFID_OPEN_GATEWAY] = "open", + [BRW_MESSAGE_GATEWAY_SFID_CLOSE_GATEWAY] = "close", + [BRW_MESSAGE_GATEWAY_SFID_FORWARD_MSG] = "forward msg", + [BRW_MESSAGE_GATEWAY_SFID_GET_TIMESTAMP] = "get timestamp", + [BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG] = "barrier msg", + [BRW_MESSAGE_GATEWAY_SFID_UPDATE_GATEWAY_STATE] = "update state", + [BRW_MESSAGE_GATEWAY_SFID_MMIO_READ_WRITE] = "mmio read/write", +}; + +static const char *const 
gfx4_dp_read_port_msg_type[4] = { + [0b00] = "OWord Block Read", + [0b01] = "OWord Dual Block Read", + [0b10] = "Media Block Read", + [0b11] = "DWord Scattered Read", +}; + +static const char *const g45_dp_read_port_msg_type[8] = { + [0b000] = "OWord Block Read", + [0b010] = "OWord Dual Block Read", + [0b100] = "Media Block Read", + [0b110] = "DWord Scattered Read", + [0b001] = "Render Target UNORM Read", + [0b011] = "AVC Loop Filter Read", +}; + +static const char *const dp_write_port_msg_type[8] = { + [0b000] = "OWord block write", + [0b001] = "OWord dual block write", + [0b010] = "media block write", + [0b011] = "DWord scattered write", + [0b100] = "RT write", + [0b101] = "streamed VB write", + [0b110] = "RT UNORM write", /* G45+ */ + [0b111] = "flush render cache", +}; + +static const char *const dp_rc_msg_type_gfx6[16] = { + [BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ] = "OWORD block read", + [GFX6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ] = "RT UNORM read", + [GFX6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ] = "OWORD dual block read", + [GFX6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ] = "media block read", + [GFX6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ] = + "OWORD unaligned block read", + [GFX6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ] = "DWORD scattered read", + [GFX6_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE] = "DWORD atomic write", + [GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE] = "OWORD block write", + [GFX6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE] = + "OWORD dual block write", + [GFX6_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE] = "media block write", + [GFX6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE] = + "DWORD scattered write", + [GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE] = "RT write", + [GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE] = "streamed VB write", + [GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE] = "RT UNORM write", +}; + +static const char *const dp_rc_msg_type_gfx7[16] = { + [GFX7_DATAPORT_RC_MEDIA_BLOCK_READ] 
= "media block read", + [GFX7_DATAPORT_RC_TYPED_SURFACE_READ] = "typed surface read", + [GFX7_DATAPORT_RC_TYPED_ATOMIC_OP] = "typed atomic op", + [GFX7_DATAPORT_RC_MEMORY_FENCE] = "memory fence", + [GFX7_DATAPORT_RC_MEDIA_BLOCK_WRITE] = "media block write", + [GFX7_DATAPORT_RC_RENDER_TARGET_WRITE] = "RT write", + [GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE] = "typed surface write" +}; + +static const char *const dp_rc_msg_type_gfx9[16] = { + [GFX9_DATAPORT_RC_RENDER_TARGET_WRITE] = "RT write", + [GFX9_DATAPORT_RC_RENDER_TARGET_READ] = "RT read" +}; + +static const char *const * +dp_rc_msg_type(const struct intel_device_info *devinfo) +{ + return (devinfo->ver >= 9 ? dp_rc_msg_type_gfx9 : + devinfo->ver >= 7 ? dp_rc_msg_type_gfx7 : + devinfo->ver >= 6 ? dp_rc_msg_type_gfx6 : + dp_write_port_msg_type); +} + +static const char *const m_rt_write_subtype[] = { + [0b000] = "SIMD16", + [0b001] = "SIMD16/RepData", + [0b010] = "SIMD8/DualSrcLow", + [0b011] = "SIMD8/DualSrcHigh", + [0b100] = "SIMD8", + [0b101] = "SIMD8/ImageWrite", /* Gfx6+ */ + [0b111] = "SIMD16/RepData-111", /* no idea how this is different than 1 */ +}; + +static const char *const dp_dc0_msg_type_gfx7[16] = { + [GFX7_DATAPORT_DC_OWORD_BLOCK_READ] = "DC OWORD block read", + [GFX7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ] = + "DC unaligned OWORD block read", + [GFX7_DATAPORT_DC_OWORD_DUAL_BLOCK_READ] = "DC OWORD dual block read", + [GFX7_DATAPORT_DC_DWORD_SCATTERED_READ] = "DC DWORD scattered read", + [GFX7_DATAPORT_DC_BYTE_SCATTERED_READ] = "DC byte scattered read", + [GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ] = "DC untyped surface read", + [GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP] = "DC untyped atomic", + [GFX7_DATAPORT_DC_MEMORY_FENCE] = "DC mfence", + [GFX7_DATAPORT_DC_OWORD_BLOCK_WRITE] = "DC OWORD block write", + [GFX7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE] = "DC OWORD dual block write", + [GFX7_DATAPORT_DC_DWORD_SCATTERED_WRITE] = "DC DWORD scatterd write", + [GFX7_DATAPORT_DC_BYTE_SCATTERED_WRITE] = "DC byte scattered 
write", + [GFX7_DATAPORT_DC_UNTYPED_SURFACE_WRITE] = "DC untyped surface write", +}; + +static const char *const dp_oword_block_rw[8] = { + [BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW] = "1-low", + [BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH] = "1-high", + [BRW_DATAPORT_OWORD_BLOCK_2_OWORDS] = "2", + [BRW_DATAPORT_OWORD_BLOCK_4_OWORDS] = "4", + [BRW_DATAPORT_OWORD_BLOCK_8_OWORDS] = "8", +}; + +static const char *const dp_dc1_msg_type_hsw[32] = { + [HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ] = "untyped surface read", + [HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP] = "DC untyped atomic op", + [HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2] = + "DC untyped 4x2 atomic op", + [HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_READ] = "DC media block read", + [HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ] = "DC typed surface read", + [HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP] = "DC typed atomic", + [HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2] = "DC typed 4x2 atomic op", + [HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE] = "DC untyped surface write", + [HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_WRITE] = "DC media block write", + [HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP] = "DC atomic counter op", + [HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2] = + "DC 4x2 atomic counter op", + [HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE] = "DC typed surface write", + [GFX9_DATAPORT_DC_PORT1_A64_SCATTERED_READ] = "DC A64 scattered read", + [GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ] = "DC A64 untyped surface read", + [GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP] = "DC A64 untyped atomic op", + [GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ] = "DC A64 oword block read", + [GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE] = "DC A64 oword block write", + [GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE] = "DC A64 untyped surface write", + [GFX8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE] = "DC A64 scattered write", + [GFX9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP] = + "DC untyped atomic float op", + 
[GFX9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP] = + "DC A64 untyped atomic float op", + [GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_INT_OP] = + "DC A64 untyped atomic half-integer op", + [GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_FLOAT_OP] = + "DC A64 untyped atomic half-float op", +}; + +static const char *const aop[16] = { + [BRW_AOP_AND] = "and", + [BRW_AOP_OR] = "or", + [BRW_AOP_XOR] = "xor", + [BRW_AOP_MOV] = "mov", + [BRW_AOP_INC] = "inc", + [BRW_AOP_DEC] = "dec", + [BRW_AOP_ADD] = "add", + [BRW_AOP_SUB] = "sub", + [BRW_AOP_REVSUB] = "revsub", + [BRW_AOP_IMAX] = "imax", + [BRW_AOP_IMIN] = "imin", + [BRW_AOP_UMAX] = "umax", + [BRW_AOP_UMIN] = "umin", + [BRW_AOP_CMPWR] = "cmpwr", + [BRW_AOP_PREDEC] = "predec", +}; + +static const char *const aop_float[5] = { + [BRW_AOP_FMAX] = "fmax", + [BRW_AOP_FMIN] = "fmin", + [BRW_AOP_FCMPWR] = "fcmpwr", + [BRW_AOP_FADD] = "fadd", +}; + +static const char * const pixel_interpolator_msg_types[4] = { + [GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET] = "per_message_offset", + [GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE] = "sample_position", + [GFX7_PIXEL_INTERPOLATOR_LOC_CENTROID] = "centroid", + [GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET] = "per_slot_offset", +}; + +static const char *const math_function[16] = { + [BRW_MATH_FUNCTION_INV] = "inv", + [BRW_MATH_FUNCTION_LOG] = "log", + [BRW_MATH_FUNCTION_EXP] = "exp", + [BRW_MATH_FUNCTION_SQRT] = "sqrt", + [BRW_MATH_FUNCTION_RSQ] = "rsq", + [BRW_MATH_FUNCTION_SIN] = "sin", + [BRW_MATH_FUNCTION_COS] = "cos", + [BRW_MATH_FUNCTION_SINCOS] = "sincos", + [BRW_MATH_FUNCTION_FDIV] = "fdiv", + [BRW_MATH_FUNCTION_POW] = "pow", + [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod", + [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv", + [BRW_MATH_FUNCTION_INT_DIV_REMAINDER] = "intmod", + [GFX8_MATH_FUNCTION_INVM] = "invm", + [GFX8_MATH_FUNCTION_RSQRTM] = "rsqrtm", +}; + +static const char *const sync_function[16] = { + [TGL_SYNC_NOP] = "nop", + [TGL_SYNC_ALLRD] = "allrd", 
+ [TGL_SYNC_ALLWR] = "allwr", + [TGL_SYNC_FENCE] = "fence", + [TGL_SYNC_BAR] = "bar", + [TGL_SYNC_HOST] = "host", +}; + +static const char *const math_saturate[2] = { + [0] = "", + [1] = "sat" +}; + +static const char *const math_signed[2] = { + [0] = "", + [1] = "signed" +}; + +static const char *const math_scalar[2] = { + [0] = "", + [1] = "scalar" +}; + +static const char *const math_precision[2] = { + [0] = "", + [1] = "partial_precision" +}; + +static const char *const gfx5_urb_opcode[] = { + [0] = "urb_write", + [1] = "ff_sync", +}; + +static const char *const gfx7_urb_opcode[] = { + [BRW_URB_OPCODE_WRITE_HWORD] = "write HWord", + [BRW_URB_OPCODE_WRITE_OWORD] = "write OWord", + [BRW_URB_OPCODE_READ_HWORD] = "read HWord", + [BRW_URB_OPCODE_READ_OWORD] = "read OWord", + [GFX7_URB_OPCODE_ATOMIC_MOV] = "atomic mov", /* Gfx7+ */ + [GFX7_URB_OPCODE_ATOMIC_INC] = "atomic inc", /* Gfx7+ */ + [GFX8_URB_OPCODE_ATOMIC_ADD] = "atomic add", /* Gfx8+ */ + [GFX8_URB_OPCODE_SIMD8_WRITE] = "SIMD8 write", /* Gfx8+ */ + [GFX8_URB_OPCODE_SIMD8_READ] = "SIMD8 read", /* Gfx8+ */ + [GFX125_URB_OPCODE_FENCE] = "fence", /* Gfx12.5+ */ + /* [10-15] - reserved */ +}; + +static const char *const urb_swizzle[4] = { + [BRW_URB_SWIZZLE_NONE] = "", + [BRW_URB_SWIZZLE_INTERLEAVE] = "interleave", + [BRW_URB_SWIZZLE_TRANSPOSE] = "transpose", +}; + +static const char *const urb_allocate[2] = { + [0] = "", + [1] = "allocate" +}; + +static const char *const urb_used[2] = { + [0] = "", + [1] = "used" +}; + +static const char *const urb_complete[2] = { + [0] = "", + [1] = "complete" +}; + +static const char *const gfx5_sampler_msg_type[] = { + [GFX5_SAMPLER_MESSAGE_SAMPLE] = "sample", + [GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS] = "sample_b", + [GFX5_SAMPLER_MESSAGE_SAMPLE_LOD] = "sample_l", + [GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE] = "sample_c", + [GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS] = "sample_d", + [GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE] = "sample_b_c", + [GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE] = 
"sample_l_c", + [GFX5_SAMPLER_MESSAGE_SAMPLE_LD] = "ld", + [GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4] = "gather4", + [GFX5_SAMPLER_MESSAGE_LOD] = "lod", + [GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO] = "resinfo", + [GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO] = "sampleinfo", + [GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C] = "gather4_c", + [GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO] = "gather4_po", + [GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C] = "gather4_po_c", + [HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE] = "sample_d_c", + [GFX9_SAMPLER_MESSAGE_SAMPLE_LZ] = "sample_lz", + [GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ] = "sample_c_lz", + [GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ] = "ld_lz", + [GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W] = "ld2dms_w", + [GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS] = "ld_mcs", + [GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS] = "ld2dms", + [GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS] = "ld2dss", +}; + +static const char *const xe2_sampler_msg_type[] = { + [GFX5_SAMPLER_MESSAGE_SAMPLE] = "sample", + [GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS] = "sample_b", + [GFX5_SAMPLER_MESSAGE_SAMPLE_LOD] = "sample_l", + [GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE] = "sample_c", + [GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS] = "sample_d", + [GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE] = "sample_b_c", + [GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE] = "sample_l_c", + [GFX5_SAMPLER_MESSAGE_SAMPLE_LD] = "ld", + [GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4] = "gather4", + [GFX5_SAMPLER_MESSAGE_LOD] = "lod", + [GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO] = "resinfo", + [GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO] = "sampleinfo", + [GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C] = "gather4_c", + [GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO] = "gather4_po", + [XE2_SAMPLER_MESSAGE_SAMPLE_MLOD] = "sample_mlod", + [XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD] = "sample_c_mlod", + [HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE] = "sample_d_c", + [GFX9_SAMPLER_MESSAGE_SAMPLE_LZ] = "sample_lz", + [GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ] = "sample_c_lz", + [GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ] = "ld_lz", + 
[GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W] = "ld2dms_w", + [GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS] = "ld_mcs", + [GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS] = "ld2dms", + [GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS] = "ld2dss", +}; + +static const char *const gfx5_sampler_simd_mode[7] = { + [BRW_SAMPLER_SIMD_MODE_SIMD4X2] = "SIMD4x2", + [BRW_SAMPLER_SIMD_MODE_SIMD8] = "SIMD8", + [BRW_SAMPLER_SIMD_MODE_SIMD16] = "SIMD16", + [BRW_SAMPLER_SIMD_MODE_SIMD32_64] = "SIMD32/64", + [GFX10_SAMPLER_SIMD_MODE_SIMD8H] = "SIMD8H", + [GFX10_SAMPLER_SIMD_MODE_SIMD16H] = "SIMD16H", +}; + +static const char *const xe2_sampler_simd_mode[7] = { + [XE2_SAMPLER_SIMD_MODE_SIMD16] = "SIMD16", + [XE2_SAMPLER_SIMD_MODE_SIMD32] = "SIMD32", + [XE2_SAMPLER_SIMD_MODE_SIMD16H] = "SIMD16H", + [XE2_SAMPLER_SIMD_MODE_SIMD32H] = "SIMD32H", +}; + +static const char *const sampler_target_format[4] = { + [0] = "F", + [2] = "UD", + [3] = "D" +}; + +static const char *const lsc_operation[] = { + [LSC_OP_LOAD] = "load", + [LSC_OP_LOAD_CMASK] = "load_cmask", + [LSC_OP_STORE] = "store", + [LSC_OP_STORE_CMASK] = "store_cmask", + [LSC_OP_FENCE] = "fence", + [LSC_OP_ATOMIC_INC] = "atomic_inc", + [LSC_OP_ATOMIC_DEC] = "atomic_dec", + [LSC_OP_ATOMIC_LOAD] = "atomic_load", + [LSC_OP_ATOMIC_STORE] = "atomic_store", + [LSC_OP_ATOMIC_ADD] = "atomic_add", + [LSC_OP_ATOMIC_SUB] = "atomic_sub", + [LSC_OP_ATOMIC_MIN] = "atomic_min", + [LSC_OP_ATOMIC_MAX] = "atomic_max", + [LSC_OP_ATOMIC_UMIN] = "atomic_umin", + [LSC_OP_ATOMIC_UMAX] = "atomic_umax", + [LSC_OP_ATOMIC_CMPXCHG] = "atomic_cmpxchg", + [LSC_OP_ATOMIC_FADD] = "atomic_fadd", + [LSC_OP_ATOMIC_FSUB] = "atomic_fsub", + [LSC_OP_ATOMIC_FMIN] = "atomic_fmin", + [LSC_OP_ATOMIC_FMAX] = "atomic_fmax", + [LSC_OP_ATOMIC_FCMPXCHG] = "atomic_fcmpxchg", + [LSC_OP_ATOMIC_AND] = "atomic_and", + [LSC_OP_ATOMIC_OR] = "atomic_or", + [LSC_OP_ATOMIC_XOR] = "atomic_xor", +}; + +static const char *const lsc_addr_surface_type[] = { + [LSC_ADDR_SURFTYPE_FLAT] = "flat", + [LSC_ADDR_SURFTYPE_BSS] = "bss", + 
[LSC_ADDR_SURFTYPE_SS] = "ss", + [LSC_ADDR_SURFTYPE_BTI] = "bti", +}; + +static const char* const lsc_fence_scope[] = { + [LSC_FENCE_THREADGROUP] = "threadgroup", + [LSC_FENCE_LOCAL] = "local", + [LSC_FENCE_TILE] = "tile", + [LSC_FENCE_GPU] = "gpu", + [LSC_FENCE_ALL_GPU] = "all_gpu", + [LSC_FENCE_SYSTEM_RELEASE] = "system_release", + [LSC_FENCE_SYSTEM_ACQUIRE] = "system_acquire", +}; + +static const char* const lsc_flush_type[] = { + [LSC_FLUSH_TYPE_NONE] = "none", + [LSC_FLUSH_TYPE_EVICT] = "evict", + [LSC_FLUSH_TYPE_INVALIDATE] = "invalidate", + [LSC_FLUSH_TYPE_DISCARD] = "discard", + [LSC_FLUSH_TYPE_CLEAN] = "clean", + [LSC_FLUSH_TYPE_L3ONLY] = "l3only", + [LSC_FLUSH_TYPE_NONE_6] = "none_6", +}; + +static const char* const lsc_addr_size[] = { + [LSC_ADDR_SIZE_A16] = "a16", + [LSC_ADDR_SIZE_A32] = "a32", + [LSC_ADDR_SIZE_A64] = "a64", +}; + +static const char* const lsc_backup_fence_routing[] = { + [LSC_NORMAL_ROUTING] = "normal_routing", + [LSC_ROUTE_TO_LSC] = "route_to_lsc", +}; + +static const char* const lsc_data_size[] = { + [LSC_DATA_SIZE_D8] = "d8", + [LSC_DATA_SIZE_D16] = "d16", + [LSC_DATA_SIZE_D32] = "d32", + [LSC_DATA_SIZE_D64] = "d64", + [LSC_DATA_SIZE_D8U32] = "d8u32", + [LSC_DATA_SIZE_D16U32] = "d16u32", + [LSC_DATA_SIZE_D16BF32] = "d16bf32", +}; + +static const char* const lsc_vect_size_str[] = { + [LSC_VECT_SIZE_V1] = "V1", + [LSC_VECT_SIZE_V2] = "V2", + [LSC_VECT_SIZE_V3] = "V3", + [LSC_VECT_SIZE_V4] = "V4", + [LSC_VECT_SIZE_V8] = "V8", + [LSC_VECT_SIZE_V16] = "V16", + [LSC_VECT_SIZE_V32] = "V32", + [LSC_VECT_SIZE_V64] = "V64", +}; + +static const char* const lsc_cmask_str[] = { + [LSC_CMASK_X] = "x", + [LSC_CMASK_Y] = "y", + [LSC_CMASK_XY] = "xy", + [LSC_CMASK_Z] = "z", + [LSC_CMASK_XZ] = "xz", + [LSC_CMASK_YZ] = "yz", + [LSC_CMASK_XYZ] = "xyz", + [LSC_CMASK_W] = "w", + [LSC_CMASK_XW] = "xw", + [LSC_CMASK_YW] = "yw", + [LSC_CMASK_XYW] = "xyw", + [LSC_CMASK_ZW] = "zw", + [LSC_CMASK_XZW] = "xzw", + [LSC_CMASK_YZW] = "yzw", + [LSC_CMASK_XYZW] = 
"xyzw", +}; + +static const char* const lsc_cache_load[] = { + [LSC_CACHE_LOAD_L1STATE_L3MOCS] = "L1STATE_L3MOCS", + [LSC_CACHE_LOAD_L1UC_L3UC] = "L1UC_L3UC", + [LSC_CACHE_LOAD_L1UC_L3C] = "L1UC_L3C", + [LSC_CACHE_LOAD_L1C_L3UC] = "L1C_L3UC", + [LSC_CACHE_LOAD_L1C_L3C] = "L1C_L3C", + [LSC_CACHE_LOAD_L1S_L3UC] = "L1S_L3UC", + [LSC_CACHE_LOAD_L1S_L3C] = "L1S_L3C", + [LSC_CACHE_LOAD_L1IAR_L3C] = "L1IAR_L3C", +}; + +static const char* const lsc_cache_store[] = { + [LSC_CACHE_STORE_L1STATE_L3MOCS] = "L1STATE_L3MOCS", + [LSC_CACHE_STORE_L1UC_L3UC] = "L1UC_L3UC", + [LSC_CACHE_STORE_L1UC_L3WB] = "L1UC_L3WB", + [LSC_CACHE_STORE_L1WT_L3UC] = "L1WT_L3UC", + [LSC_CACHE_STORE_L1WT_L3WB] = "L1WT_L3WB", + [LSC_CACHE_STORE_L1S_L3UC] = "L1S_L3UC", + [LSC_CACHE_STORE_L1S_L3WB] = "L1S_L3WB", + [LSC_CACHE_STORE_L1WB_L3WB] = "L1WB_L3WB", +}; + +static const char* const xe2_lsc_cache_load[] = { + [XE2_LSC_CACHE_LOAD_L1STATE_L3MOCS] = "L1STATE_L3MOCS", + [XE2_LSC_CACHE_LOAD_L1UC_L3UC] = "L1UC_L3UC", + [XE2_LSC_CACHE_LOAD_L1UC_L3C] = "L1UC_L3C", + [XE2_LSC_CACHE_LOAD_L1UC_L3CC] = "L1UC_L3CC", + [XE2_LSC_CACHE_LOAD_L1C_L3UC] = "L1C_L3UC", + [XE2_LSC_CACHE_LOAD_L1C_L3C] = "L1C_L3C", + [XE2_LSC_CACHE_LOAD_L1C_L3CC] = "L1C_L3CC", + [XE2_LSC_CACHE_LOAD_L1S_L3UC] = "L1S_L3UC", + [XE2_LSC_CACHE_LOAD_L1S_L3C] = "L1S_L3C", + [XE2_LSC_CACHE_LOAD_L1IAR_L3IAR] = "L1IAR_L3IAR", +}; + +static const char* const xe2_lsc_cache_store[] = { + [XE2_LSC_CACHE_STORE_L1STATE_L3MOCS] = "L1STATE_L3MOCS", + [XE2_LSC_CACHE_STORE_L1UC_L3UC] = "L1UC_L3UC", + [XE2_LSC_CACHE_STORE_L1UC_L3WB] = "L1UC_L3WB", + [XE2_LSC_CACHE_STORE_L1WT_L3UC] = "L1WT_L3UC", + [XE2_LSC_CACHE_STORE_L1WT_L3WB] = "L1WT_L3WB", + [XE2_LSC_CACHE_STORE_L1S_L3UC] = "L1S_L3UC", + [XE2_LSC_CACHE_STORE_L1S_L3WB] = "L1S_L3WB", + [XE2_LSC_CACHE_STORE_L1WB_L3WB] = "L1WB_L3WB", +}; + +static const char* const dpas_systolic_depth[4] = { + [0] = "16", + [1] = "2", + [2] = "4", + [3] = "8" +}; + +static int column; + +static int +string(FILE *file, const 
char *string) +{ + fputs(string, file); + column += strlen(string); + return 0; +} + +static int +format(FILE *f, const char *format, ...) PRINTFLIKE(2, 3); + +static int +format(FILE *f, const char *format, ...) +{ + char buf[1024]; + va_list args; + va_start(args, format); + + vsnprintf(buf, sizeof(buf) - 1, format, args); + va_end(args); + string(f, buf); + return 0; +} + +static int +newline(FILE *f) +{ + putc('\n', f); + column = 0; + return 0; +} + +static int +pad(FILE *f, int c) +{ + do + string(f, " "); + while (column < c); + return 0; +} + +static int +control(FILE *file, const char *name, const char *const ctrl[], + unsigned id, int *space) +{ + if (!ctrl[id]) { + fprintf(file, "*** invalid %s value %d ", name, id); + return 1; + } + if (ctrl[id][0]) { + if (space && *space) + string(file, " "); + string(file, ctrl[id]); + if (space) + *space = 1; + } + return 0; +} + +static int +print_opcode(FILE *file, const struct brw_isa_info *isa, + enum opcode id) +{ + const struct opcode_desc *desc = brw_opcode_desc(isa, id); + if (!desc) { + format(file, "*** invalid opcode value %d ", id); + return 1; + } + string(file, desc->name); + return 0; +} + +static int +reg(FILE *file, unsigned _reg_file, unsigned _reg_nr) +{ + int err = 0; + + /* Clear the Compr4 instruction compression bit. 
*/ + if (_reg_file == BRW_MESSAGE_REGISTER_FILE) + _reg_nr &= ~BRW_MRF_COMPR4; + + if (_reg_file == BRW_ARCHITECTURE_REGISTER_FILE) { + switch (_reg_nr & 0xf0) { + case BRW_ARF_NULL: + string(file, "null"); + break; + case BRW_ARF_ADDRESS: + format(file, "a%d", _reg_nr & 0x0f); + break; + case BRW_ARF_ACCUMULATOR: + format(file, "acc%d", _reg_nr & 0x0f); + break; + case BRW_ARF_FLAG: + format(file, "f%d", _reg_nr & 0x0f); + break; + case BRW_ARF_MASK: + format(file, "mask%d", _reg_nr & 0x0f); + break; + case BRW_ARF_MASK_STACK: + format(file, "ms%d", _reg_nr & 0x0f); + break; + case BRW_ARF_MASK_STACK_DEPTH: + format(file, "msd%d", _reg_nr & 0x0f); + break; + case BRW_ARF_STATE: + format(file, "sr%d", _reg_nr & 0x0f); + break; + case BRW_ARF_CONTROL: + format(file, "cr%d", _reg_nr & 0x0f); + break; + case BRW_ARF_NOTIFICATION_COUNT: + format(file, "n%d", _reg_nr & 0x0f); + break; + case BRW_ARF_IP: + string(file, "ip"); + return -1; + break; + case BRW_ARF_TDR: + format(file, "tdr0"); + return -1; + case BRW_ARF_TIMESTAMP: + format(file, "tm%d", _reg_nr & 0x0f); + break; + default: + format(file, "ARF%d", _reg_nr); + break; + } + } else { + err |= control(file, "src reg file", reg_file, _reg_file, NULL); + format(file, "%d", _reg_nr); + } + return err; +} + +static int +dest(FILE *file, const struct brw_isa_info *isa, const brw_inst *inst) +{ + const struct intel_device_info *devinfo = isa->devinfo; + enum brw_reg_type type = brw_inst_dst_type(devinfo, inst); + unsigned elem_size = brw_reg_type_to_size(type); + int err = 0; + + if (is_split_send(devinfo, brw_inst_opcode(isa, inst))) { + /* These are fixed for split sends */ + type = BRW_REGISTER_TYPE_UD; + elem_size = 4; + if (devinfo->ver >= 12) { + err |= reg(file, brw_inst_send_dst_reg_file(devinfo, inst), + brw_inst_dst_da_reg_nr(devinfo, inst)); + string(file, brw_reg_type_to_letters(type)); + } else if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + err |= reg(file, 
brw_inst_send_dst_reg_file(devinfo, inst), + brw_inst_dst_da_reg_nr(devinfo, inst)); + unsigned subreg_nr = brw_inst_dst_da16_subreg_nr(devinfo, inst); + if (subreg_nr) + format(file, ".%u", subreg_nr); + string(file, brw_reg_type_to_letters(type)); + } else { + string(file, "g[a0"); + if (brw_inst_dst_ia_subreg_nr(devinfo, inst)) + format(file, ".%"PRIu64, brw_inst_dst_ia_subreg_nr(devinfo, inst) / + elem_size); + if (brw_inst_send_dst_ia16_addr_imm(devinfo, inst)) + format(file, " %d", brw_inst_send_dst_ia16_addr_imm(devinfo, inst)); + string(file, "]<"); + string(file, brw_reg_type_to_letters(type)); + } + } else if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + err |= reg(file, brw_inst_dst_reg_file(devinfo, inst), + brw_inst_dst_da_reg_nr(devinfo, inst)); + if (err == -1) + return 0; + if (brw_inst_dst_da1_subreg_nr(devinfo, inst)) + format(file, ".%"PRIu64, brw_inst_dst_da1_subreg_nr(devinfo, inst) / + elem_size); + string(file, "<"); + err |= control(file, "horiz stride", horiz_stride, + brw_inst_dst_hstride(devinfo, inst), NULL); + string(file, ">"); + string(file, brw_reg_type_to_letters(type)); + } else { + string(file, "g[a0"); + if (brw_inst_dst_ia_subreg_nr(devinfo, inst)) + format(file, ".%"PRIu64, brw_inst_dst_ia_subreg_nr(devinfo, inst) / + elem_size); + if (brw_inst_dst_ia1_addr_imm(devinfo, inst)) + format(file, " %d", brw_inst_dst_ia1_addr_imm(devinfo, inst)); + string(file, "]<"); + err |= control(file, "horiz stride", horiz_stride, + brw_inst_dst_hstride(devinfo, inst), NULL); + string(file, ">"); + string(file, brw_reg_type_to_letters(type)); + } + } else { + if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + err |= reg(file, brw_inst_dst_reg_file(devinfo, inst), + brw_inst_dst_da_reg_nr(devinfo, inst)); + if (err == -1) + return 0; + if (brw_inst_dst_da16_subreg_nr(devinfo, inst)) + format(file, ".%u", 16 / elem_size); + string(file, "<1>"); 
+ err |= control(file, "writemask", writemask, + brw_inst_da16_writemask(devinfo, inst), NULL); + string(file, brw_reg_type_to_letters(type)); + } else { + err = 1; + string(file, "Indirect align16 address mode not supported"); + } + } + + return 0; +} + +static int +dest_3src(FILE *file, const struct intel_device_info *devinfo, + const brw_inst *inst) +{ + bool is_align1 = brw_inst_3src_access_mode(devinfo, inst) == BRW_ALIGN_1; + int err = 0; + uint32_t reg_file; + unsigned subreg_nr; + enum brw_reg_type type; + + if (devinfo->ver < 10 && is_align1) + return 0; + + if (devinfo->ver == 6 && brw_inst_3src_a16_dst_reg_file(devinfo, inst)) + reg_file = BRW_MESSAGE_REGISTER_FILE; + else if (devinfo->ver >= 12) + reg_file = brw_inst_3src_a1_dst_reg_file(devinfo, inst); + else if (is_align1 && brw_inst_3src_a1_dst_reg_file(devinfo, inst)) + reg_file = BRW_ARCHITECTURE_REGISTER_FILE; + else + reg_file = BRW_GENERAL_REGISTER_FILE; + + err |= reg(file, reg_file, brw_inst_3src_dst_reg_nr(devinfo, inst)); + if (err == -1) + return 0; + + if (is_align1) { + type = brw_inst_3src_a1_dst_type(devinfo, inst); + subreg_nr = brw_inst_3src_a1_dst_subreg_nr(devinfo, inst); + } else { + type = brw_inst_3src_a16_dst_type(devinfo, inst); + subreg_nr = brw_inst_3src_a16_dst_subreg_nr(devinfo, inst) * 4; + } + subreg_nr /= brw_reg_type_to_size(type); + + if (subreg_nr) + format(file, ".%u", subreg_nr); + string(file, "<1>"); + + if (!is_align1) { + err |= control(file, "writemask", writemask, + brw_inst_3src_a16_dst_writemask(devinfo, inst), NULL); + } + string(file, brw_reg_type_to_letters(type)); + + return 0; +} + +static int +dest_dpas_3src(FILE *file, const struct intel_device_info *devinfo, + const brw_inst *inst) +{ + uint32_t reg_file = brw_inst_dpas_3src_dst_reg_file(devinfo, inst); + + if (reg(file, reg_file, brw_inst_dpas_3src_dst_reg_nr(devinfo, inst)) == -1) + return 0; + + enum brw_reg_type type = brw_inst_dpas_3src_dst_type(devinfo, inst); + unsigned subreg_nr = 
   brw_inst_dpas_3src_dst_subreg_nr(devinfo, inst);

   if (subreg_nr)
      format(file, ".%u", subreg_nr);
   string(file, "<1>");

   string(file, brw_reg_type_to_letters(type));

   return 0;
}

/* Print an Align1 region description in "<vstride,width,hstride>" form. */
static int
src_align1_region(FILE *file,
                  unsigned _vert_stride, unsigned _width,
                  unsigned _horiz_stride)
{
   int err = 0;
   string(file, "<");
   err |= control(file, "vert stride", vert_stride, _vert_stride, NULL);
   string(file, ",");
   err |= control(file, "width", width, _width, NULL);
   string(file, ",");
   err |= control(file, "horiz_stride", horiz_stride, _horiz_stride, NULL);
   string(file, ">");
   return err;
}

/* Print a direct-addressed Align1 source operand: negate/abs modifiers
 * (negate prints as "~" for logic ops on Gfx8+), register, subregister in
 * element units, region, and type suffix.
 */
static int
src_da1(FILE *file,
        const struct intel_device_info *devinfo,
        unsigned opcode,
        enum brw_reg_type type, unsigned _reg_file,
        unsigned _vert_stride, unsigned _width, unsigned _horiz_stride,
        unsigned reg_num, unsigned sub_reg_num, unsigned __abs,
        unsigned _negate)
{
   int err = 0;

   /* On Gfx8+ the negate bit means bitwise-NOT for logic instructions. */
   if (devinfo->ver >= 8 && is_logic_instruction(opcode))
      err |= control(file, "bitnot", m_bitnot, _negate, NULL);
   else
      err |= control(file, "negate", m_negate, _negate, NULL);

   err |= control(file, "abs", _abs, __abs, NULL);

   err |= reg(file, _reg_file, reg_num);
   if (err == -1)
      return 0;
   if (sub_reg_num) {
      /* Subregister number is encoded in bytes; print in element units. */
      unsigned elem_size = brw_reg_type_to_size(type);
      format(file, ".%d", sub_reg_num / elem_size);   /* use formal style like spec */
   }
   src_align1_region(file, _vert_stride, _width, _horiz_stride);
   string(file, brw_reg_type_to_letters(type));
   return err;
}

/* Print a register-indirect (a0-relative) Align1 source operand. */
static int
src_ia1(FILE *file,
        const struct intel_device_info *devinfo,
        unsigned opcode,
        enum brw_reg_type type,
        int _addr_imm,
        unsigned _addr_subreg_nr,
        unsigned _negate,
        unsigned __abs,
        unsigned _horiz_stride, unsigned _width, unsigned _vert_stride)
{
   int err = 0;

   if (devinfo->ver >= 8 && is_logic_instruction(opcode))
      err |= control(file, "bitnot", m_bitnot, _negate, NULL);
   else
      err |= control(file, "negate", m_negate, _negate, NULL);

   err |= control(file, "abs", _abs, __abs, NULL);

   string(file, "g[a0");
   if (_addr_subreg_nr)
      format(file, ".%d", _addr_subreg_nr);
   if (_addr_imm)
      format(file, " %d", _addr_imm);
   string(file, "]");
   src_align1_region(file, _vert_stride, _width, _horiz_stride);
   string(file, brw_reg_type_to_letters(type));
   return err;
}

/* Print a 4-channel swizzle suffix: a single channel letter when all four
 * selects are identical, nothing for the identity .xyzw, otherwise all four.
 */
static int
src_swizzle(FILE *file, unsigned swiz)
{
   unsigned x = BRW_GET_SWZ(swiz, BRW_CHANNEL_X);
   unsigned y = BRW_GET_SWZ(swiz, BRW_CHANNEL_Y);
   unsigned z = BRW_GET_SWZ(swiz, BRW_CHANNEL_Z);
   unsigned w = BRW_GET_SWZ(swiz, BRW_CHANNEL_W);
   int err = 0;

   if (x == y && x == z && x == w) {
      /* Replicated swizzle: print one channel. */
      string(file, ".");
      err |= control(file, "channel select", chan_sel, x, NULL);
   } else if (swiz != BRW_SWIZZLE_XYZW) {
      string(file, ".");
      err |= control(file, "channel select", chan_sel, x, NULL);
      err |= control(file, "channel select", chan_sel, y, NULL);
      err |= control(file, "channel select", chan_sel, z, NULL);
      err |= control(file, "channel select", chan_sel, w, NULL);
   }
   return err;
}

/* Print a direct-addressed Align16 source operand (modifiers, register,
 * subregister, vertical stride, swizzle, type suffix).
 */
static int
src_da16(FILE *file,
         const struct intel_device_info *devinfo,
         unsigned opcode,
         enum brw_reg_type type,
         unsigned _reg_file,
         unsigned _vert_stride,
         unsigned _reg_nr,
         unsigned _subreg_nr,
         unsigned __abs,
         unsigned _negate,
         unsigned swz_x, unsigned swz_y, unsigned swz_z, unsigned swz_w)
{
   int err = 0;

   if (devinfo->ver >= 8 && is_logic_instruction(opcode))
      err |= control(file, "bitnot", m_bitnot, _negate, NULL);
   else
      err |= control(file, "negate", m_negate, _negate, NULL);

   err |= control(file, "abs", _abs, __abs, NULL);

   err |= reg(file, _reg_file, _reg_nr);
   if (err == -1)
      return 0;
   if (_subreg_nr) {
      unsigned elem_size = brw_reg_type_to_size(type);

      /* bit4 for subreg number byte addressing. Make this same meaning as
         in da1 case, so output looks consistent. */
      format(file, ".%d", 16 / elem_size);
   }
   string(file, "<");
   err |= control(file, "vert stride", vert_stride, _vert_stride, NULL);
   string(file, ">");
   err |= src_swizzle(file, BRW_SWIZZLE4(swz_x, swz_y, swz_z, swz_w));
   string(file, brw_reg_type_to_letters(type));
   return err;
}

/* Map the 3-src Align1 vertical-stride encoding to the common vstride
 * encoding.  Note the "2" encoding means vstride 1 on Gfx12+.
 */
static enum brw_vertical_stride
vstride_from_align1_3src_vstride(const struct intel_device_info *devinfo,
                                 enum gfx10_align1_3src_vertical_stride vstride)
{
   switch (vstride) {
   case BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0: return BRW_VERTICAL_STRIDE_0;
   case BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2:
      if (devinfo->ver >= 12)
         return BRW_VERTICAL_STRIDE_1;
      else
         return BRW_VERTICAL_STRIDE_2;
   case BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4: return BRW_VERTICAL_STRIDE_4;
   case BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8: return BRW_VERTICAL_STRIDE_8;
   default:
      unreachable("not reached");
   }
}

/* Map the 3-src Align1 horizontal-stride encoding to the common hstride
 * encoding.
 */
static enum brw_horizontal_stride
hstride_from_align1_3src_hstride(enum gfx10_align1_3src_src_horizontal_stride hstride)
{
   switch (hstride) {
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0: return BRW_HORIZONTAL_STRIDE_0;
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1: return BRW_HORIZONTAL_STRIDE_1;
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2: return BRW_HORIZONTAL_STRIDE_2;
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4: return BRW_HORIZONTAL_STRIDE_4;
   default:
      unreachable("not reached");
   }
}

/* Map a 3-src Align1 horizontal-stride encoding onto a vertical stride;
 * used for src2, which has no vstride field of its own (see FINISHME in
 * src2_3src below).
 */
static enum brw_vertical_stride
vstride_from_align1_3src_hstride(enum gfx10_align1_3src_src_horizontal_stride hstride)
{
   switch (hstride) {
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0: return BRW_VERTICAL_STRIDE_0;
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1: return BRW_VERTICAL_STRIDE_1;
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2: return BRW_VERTICAL_STRIDE_2;
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4: return BRW_VERTICAL_STRIDE_4;
   default:
      unreachable("not reached");
   }
}

/* From "GFX10 Regioning Rules for Align1 Ternary Operations" in the
 * "Register Region
 Restrictions" documentation
 */
static enum brw_width
implied_width(enum brw_vertical_stride _vert_stride,
              enum brw_horizontal_stride _horiz_stride)
{
   /* "1. Width is 1 when Vertical and Horizontal Strides are both zero." */
   if (_vert_stride == BRW_VERTICAL_STRIDE_0 &&
       _horiz_stride == BRW_HORIZONTAL_STRIDE_0) {
      return BRW_WIDTH_1;

   /* "2. Width is equal to vertical stride when Horizontal Stride is zero." */
   } else if (_horiz_stride == BRW_HORIZONTAL_STRIDE_0) {
      switch (_vert_stride) {
      case BRW_VERTICAL_STRIDE_1: return BRW_WIDTH_1;
      case BRW_VERTICAL_STRIDE_2: return BRW_WIDTH_2;
      case BRW_VERTICAL_STRIDE_4: return BRW_WIDTH_4;
      case BRW_VERTICAL_STRIDE_8: return BRW_WIDTH_8;
      case BRW_VERTICAL_STRIDE_0:
      default:
         unreachable("not reached");
      }

   } else {
      /* FINISHME: Implement these: */

      /* "3. Width is equal to Vertical Stride/Horizontal Stride when both
       *     Strides are non-zero.
       *
       *  4. Vertical Stride must not be zero if Horizontal Stride is non-zero.
       *     This implies Vertical Stride is always greater than Horizontal
       *     Stride."
       *
       * Given these statements and the knowledge that the stride and width
       * values are encoded in logarithmic form, we can perform the division
       * by just subtracting.
       */
      return _vert_stride - _horiz_stride;
   }
}

/* Print src0 of a 3-source instruction, handling both the Align1 (Gfx10+)
 * and Align16 encodings, including Align1 immediate sources.
 */
static int
src0_3src(FILE *file, const struct intel_device_info *devinfo,
          const brw_inst *inst)
{
   int err = 0;
   unsigned reg_nr, subreg_nr;
   enum brw_reg_file _file;
   enum brw_reg_type type;
   enum brw_vertical_stride _vert_stride;
   enum brw_width _width;
   enum brw_horizontal_stride _horiz_stride;
   bool is_scalar_region;
   bool is_align1 = brw_inst_3src_access_mode(devinfo, inst) == BRW_ALIGN_1;

   /* Align1 3-src encoding only exists on Gfx10+. */
   if (devinfo->ver < 10 && is_align1)
      return 0;

   if (is_align1) {
      if (devinfo->ver >= 12 && !brw_inst_3src_a1_src0_is_imm(devinfo, inst)) {
         _file = brw_inst_3src_a1_src0_reg_file(devinfo, inst);
      } else if (brw_inst_3src_a1_src0_reg_file(devinfo, inst) ==
                 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE) {
         _file = BRW_GENERAL_REGISTER_FILE;
      } else if (brw_inst_3src_a1_src0_type(devinfo, inst) ==
                 BRW_REGISTER_TYPE_NF) {
         _file = BRW_ARCHITECTURE_REGISTER_FILE;
      } else {
         /* Immediate source: print the 16-bit value and return. */
         _file = BRW_IMMEDIATE_VALUE;
         uint16_t imm_val = brw_inst_3src_a1_src0_imm(devinfo, inst);
         enum brw_reg_type type = brw_inst_3src_a1_src0_type(devinfo, inst);

         if (type == BRW_REGISTER_TYPE_W) {
            format(file, "%dW", imm_val);
         } else if (type == BRW_REGISTER_TYPE_UW) {
            format(file, "0x%04xUW", imm_val);
         } else if (type == BRW_REGISTER_TYPE_HF) {
            format(file, "0x%04xHF", imm_val);
         }
         return 0;
      }

      reg_nr = brw_inst_3src_src0_reg_nr(devinfo, inst);
      subreg_nr = brw_inst_3src_a1_src0_subreg_nr(devinfo, inst);
      type = brw_inst_3src_a1_src0_type(devinfo, inst);
      _vert_stride = vstride_from_align1_3src_vstride(
         devinfo, brw_inst_3src_a1_src0_vstride(devinfo, inst));
      _horiz_stride = hstride_from_align1_3src_hstride(
         brw_inst_3src_a1_src0_hstride(devinfo, inst));
      _width = implied_width(_vert_stride, _horiz_stride);
   } else {
      /* Align16: subreg field counts in dwords; region is fixed by rep_ctrl. */
      _file = BRW_GENERAL_REGISTER_FILE;
      reg_nr = brw_inst_3src_src0_reg_nr(devinfo, inst);
      subreg_nr = brw_inst_3src_a16_src0_subreg_nr(devinfo, inst) * 4;
      type = brw_inst_3src_a16_src_type(devinfo, inst);

      if (brw_inst_3src_a16_src0_rep_ctrl(devinfo, inst)) {
         _vert_stride = BRW_VERTICAL_STRIDE_0;
         _width = BRW_WIDTH_1;
         _horiz_stride = BRW_HORIZONTAL_STRIDE_0;
      } else {
         _vert_stride = BRW_VERTICAL_STRIDE_4;
         _width = BRW_WIDTH_4;
         _horiz_stride = BRW_HORIZONTAL_STRIDE_1;
      }
   }
   is_scalar_region = _vert_stride == BRW_VERTICAL_STRIDE_0 &&
                      _width == BRW_WIDTH_1 &&
                      _horiz_stride == BRW_HORIZONTAL_STRIDE_0;

   /* Convert the byte-based subreg number to element units for printing. */
   subreg_nr /= brw_reg_type_to_size(type);

   err |= control(file, "negate", m_negate,
                  brw_inst_3src_src0_negate(devinfo, inst), NULL);
   err |= control(file, "abs", _abs, brw_inst_3src_src0_abs(devinfo, inst), NULL);

   err |= reg(file, _file, reg_nr);
   if (err == -1)
      return 0;
   if (subreg_nr || is_scalar_region)
      format(file, ".%d", subreg_nr);
   src_align1_region(file, _vert_stride, _width, _horiz_stride);
   if (!is_scalar_region && !is_align1)
      err |= src_swizzle(file, brw_inst_3src_a16_src0_swizzle(devinfo, inst));
   string(file, brw_reg_type_to_letters(type));
   return err;
}

/* Print src1 of a 3-source instruction; same structure as src0_3src but
 * src1 has no immediate form.
 */
static int
src1_3src(FILE *file, const struct intel_device_info *devinfo,
          const brw_inst *inst)
{
   int err = 0;
   unsigned reg_nr, subreg_nr;
   enum brw_reg_file _file;
   enum brw_reg_type type;
   enum brw_vertical_stride _vert_stride;
   enum brw_width _width;
   enum brw_horizontal_stride _horiz_stride;
   bool is_scalar_region;
   bool is_align1 = brw_inst_3src_access_mode(devinfo, inst) == BRW_ALIGN_1;

   if (devinfo->ver < 10 && is_align1)
      return 0;

   if (is_align1) {
      if (devinfo->ver >= 12) {
         _file = brw_inst_3src_a1_src1_reg_file(devinfo, inst);
      } else if (brw_inst_3src_a1_src1_reg_file(devinfo, inst) ==
                 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE) {
         _file = BRW_GENERAL_REGISTER_FILE;
      } else {
         _file = BRW_ARCHITECTURE_REGISTER_FILE;
      }

      reg_nr = brw_inst_3src_src1_reg_nr(devinfo, inst);
      subreg_nr = brw_inst_3src_a1_src1_subreg_nr(devinfo, inst);
      type = brw_inst_3src_a1_src1_type(devinfo, inst);

      _vert_stride = vstride_from_align1_3src_vstride(
         devinfo, brw_inst_3src_a1_src1_vstride(devinfo, inst));
      _horiz_stride = hstride_from_align1_3src_hstride(
         brw_inst_3src_a1_src1_hstride(devinfo, inst));
      _width = implied_width(_vert_stride, _horiz_stride);
   } else {
      _file = BRW_GENERAL_REGISTER_FILE;
      reg_nr = brw_inst_3src_src1_reg_nr(devinfo, inst);
      subreg_nr = brw_inst_3src_a16_src1_subreg_nr(devinfo, inst) * 4;
      type = brw_inst_3src_a16_src_type(devinfo, inst);

      if (brw_inst_3src_a16_src1_rep_ctrl(devinfo, inst)) {
         _vert_stride = BRW_VERTICAL_STRIDE_0;
         _width = BRW_WIDTH_1;
         _horiz_stride = BRW_HORIZONTAL_STRIDE_0;
      } else {
         _vert_stride = BRW_VERTICAL_STRIDE_4;
         _width = BRW_WIDTH_4;
         _horiz_stride = BRW_HORIZONTAL_STRIDE_1;
      }
   }
   is_scalar_region = _vert_stride == BRW_VERTICAL_STRIDE_0 &&
                      _width == BRW_WIDTH_1 &&
                      _horiz_stride == BRW_HORIZONTAL_STRIDE_0;

   subreg_nr /= brw_reg_type_to_size(type);

   err |= control(file, "negate", m_negate,
                  brw_inst_3src_src1_negate(devinfo, inst), NULL);
   err |= control(file, "abs", _abs, brw_inst_3src_src1_abs(devinfo, inst), NULL);

   err |= reg(file, _file, reg_nr);
   if (err == -1)
      return 0;
   if (subreg_nr || is_scalar_region)
      format(file, ".%d", subreg_nr);
   src_align1_region(file, _vert_stride, _width, _horiz_stride);
   if (!is_scalar_region && !is_align1)
      err |= src_swizzle(file, brw_inst_3src_a16_src1_swizzle(devinfo, inst));
   string(file, brw_reg_type_to_letters(type));
   return err;
}

/* Print src2 of a 3-source instruction.  Like src0 it may be an Align1
 * immediate; unlike src0/src1 it has no vstride field (see FINISHME).
 */
static int
src2_3src(FILE *file, const struct intel_device_info *devinfo,
          const brw_inst *inst)
{
   int err = 0;
   unsigned reg_nr, subreg_nr;
   enum brw_reg_file _file;
   enum brw_reg_type type;
   enum brw_vertical_stride _vert_stride;
   enum brw_width _width;
   enum brw_horizontal_stride _horiz_stride;
   bool is_scalar_region;
   bool is_align1 = brw_inst_3src_access_mode(devinfo, inst) == BRW_ALIGN_1;

   if (devinfo->ver < 10 && is_align1)
      return 0;

   if (is_align1) {
      if (devinfo->ver >= 12 && !brw_inst_3src_a1_src2_is_imm(devinfo, inst)) {
         _file = brw_inst_3src_a1_src2_reg_file(devinfo, inst);
      } else if (brw_inst_3src_a1_src2_reg_file(devinfo, inst) ==
                 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE) {
         _file = BRW_GENERAL_REGISTER_FILE;
      } else {
         /* Immediate source: print the 16-bit value and return. */
         _file = BRW_IMMEDIATE_VALUE;
         uint16_t imm_val = brw_inst_3src_a1_src2_imm(devinfo, inst);
         enum brw_reg_type type = brw_inst_3src_a1_src2_type(devinfo, inst);

         if (type == BRW_REGISTER_TYPE_W) {
            format(file, "%dW", imm_val);
         } else if (type == BRW_REGISTER_TYPE_UW) {
            format(file, "0x%04xUW", imm_val);
         } else if (type == BRW_REGISTER_TYPE_HF) {
            format(file, "0x%04xHF", imm_val);
         }
         return 0;
      }

      reg_nr = brw_inst_3src_src2_reg_nr(devinfo, inst);
      subreg_nr = brw_inst_3src_a1_src2_subreg_nr(devinfo, inst);
      type = brw_inst_3src_a1_src2_type(devinfo, inst);
      /* FINISHME: No vertical stride on src2. Is using the hstride in place
       *           correct? Doesn't seem like it, since there's hstride=1 but
       *           no vstride=1.
       */
      _vert_stride = vstride_from_align1_3src_hstride(
         brw_inst_3src_a1_src2_hstride(devinfo, inst));
      _horiz_stride = hstride_from_align1_3src_hstride(
         brw_inst_3src_a1_src2_hstride(devinfo, inst));
      _width = implied_width(_vert_stride, _horiz_stride);
   } else {
      _file = BRW_GENERAL_REGISTER_FILE;
      reg_nr = brw_inst_3src_src2_reg_nr(devinfo, inst);
      subreg_nr = brw_inst_3src_a16_src2_subreg_nr(devinfo, inst) * 4;
      type = brw_inst_3src_a16_src_type(devinfo, inst);

      if (brw_inst_3src_a16_src2_rep_ctrl(devinfo, inst)) {
         _vert_stride = BRW_VERTICAL_STRIDE_0;
         _width = BRW_WIDTH_1;
         _horiz_stride = BRW_HORIZONTAL_STRIDE_0;
      } else {
         _vert_stride = BRW_VERTICAL_STRIDE_4;
         _width = BRW_WIDTH_4;
         _horiz_stride = BRW_HORIZONTAL_STRIDE_1;
      }
   }
   is_scalar_region = _vert_stride == BRW_VERTICAL_STRIDE_0 &&
                      _width == BRW_WIDTH_1 &&
                      _horiz_stride == BRW_HORIZONTAL_STRIDE_0;

   subreg_nr /= brw_reg_type_to_size(type);

   err |= control(file, "negate", m_negate,
                  brw_inst_3src_src2_negate(devinfo, inst), NULL);
   err |= control(file, "abs", _abs, brw_inst_3src_src2_abs(devinfo, inst), NULL);

   err |= reg(file, _file, reg_nr);
   if (err == -1)
      return 0;
   if (subreg_nr || is_scalar_region)
      format(file, ".%d", subreg_nr);
   src_align1_region(file, _vert_stride, _width, _horiz_stride);
   if (!is_scalar_region && !is_align1)
      err |= src_swizzle(file, brw_inst_3src_a16_src2_swizzle(devinfo, inst));
   string(file, brw_reg_type_to_letters(type));
   return err;
}

/* Print src0 of a DPAS instruction; the region is always <1,1,0>. */
static int
src0_dpas_3src(FILE *file, const struct intel_device_info *devinfo,
               const brw_inst *inst)
{
   uint32_t reg_file = brw_inst_dpas_3src_src0_reg_file(devinfo, inst);

   if (reg(file, reg_file, brw_inst_dpas_3src_src0_reg_nr(devinfo, inst)) == -1)
      return 0;

   unsigned subreg_nr = brw_inst_dpas_3src_src0_subreg_nr(devinfo, inst);
   enum brw_reg_type type = brw_inst_dpas_3src_src0_type(devinfo, inst);

   if (subreg_nr)
      format(file, ".%d", subreg_nr);
   src_align1_region(file, 1, 1, 0);

   string(file, brw_reg_type_to_letters(type));

   return 0;
}

/* Print src1 of a DPAS instruction; the region is always <1,1,0>. */
static int
src1_dpas_3src(FILE *file, const struct intel_device_info *devinfo,
               const brw_inst *inst)
{
   uint32_t reg_file = brw_inst_dpas_3src_src1_reg_file(devinfo, inst);

   if (reg(file, reg_file, brw_inst_dpas_3src_src1_reg_nr(devinfo, inst)) == -1)
      return 0;

   unsigned subreg_nr = brw_inst_dpas_3src_src1_subreg_nr(devinfo, inst);
   enum brw_reg_type type = brw_inst_dpas_3src_src1_type(devinfo, inst);

   if (subreg_nr)
      format(file, ".%d", subreg_nr);
   src_align1_region(file, 1, 1, 0);

   string(file, brw_reg_type_to_letters(type));

   return 0;
}

/* Print src2 of a DPAS instruction; the region is always <1,1,0>. */
static int
src2_dpas_3src(FILE *file, const struct intel_device_info *devinfo,
               const brw_inst *inst)
{
   uint32_t reg_file = brw_inst_dpas_3src_src2_reg_file(devinfo, inst);

   if (reg(file, reg_file, brw_inst_dpas_3src_src2_reg_nr(devinfo, inst)) == -1)
      return 0;

   unsigned subreg_nr = brw_inst_dpas_3src_src2_subreg_nr(devinfo, inst);
   enum brw_reg_type type = brw_inst_dpas_3src_src2_type(devinfo, inst);

   if (subreg_nr)
      format(file, ".%d", subreg_nr);
   src_align1_region(file, 1, 1, 0);

   string(file, brw_reg_type_to_letters(type));

   return 0;
}

/* Print an immediate operand according to its register type; packed vector
 * and float types additionally get a decoded comment after column 48.
 */
static int
imm(FILE *file, const struct brw_isa_info *isa, enum brw_reg_type type,
    const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   switch (type) {
   case BRW_REGISTER_TYPE_UQ:
      format(file, "0x%016"PRIx64"UQ", brw_inst_imm_uq(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_Q:
      format(file, "0x%016"PRIx64"Q", brw_inst_imm_uq(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_UD:
      format(file, "0x%08xUD", brw_inst_imm_ud(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_D:
      format(file, "%dD", brw_inst_imm_d(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_UW:
      format(file, "0x%04xUW", (uint16_t) brw_inst_imm_ud(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_W:
      format(file, "%dW", (int16_t) brw_inst_imm_d(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_UV:
      format(file, "0x%08xUV", brw_inst_imm_ud(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_VF:
      format(file, "0x%"PRIx64"VF", brw_inst_bits(inst, 127, 96));
      pad(file, 48);
      /* Decode the four packed restricted floats for readability. */
      format(file, "/* [%-gF, %-gF, %-gF, %-gF]VF */",
             brw_vf_to_float(brw_inst_imm_ud(devinfo, inst)),
             brw_vf_to_float(brw_inst_imm_ud(devinfo, inst) >> 8),
             brw_vf_to_float(brw_inst_imm_ud(devinfo, inst) >> 16),
             brw_vf_to_float(brw_inst_imm_ud(devinfo, inst) >> 24));
      break;
   case BRW_REGISTER_TYPE_V:
      format(file, "0x%08xV", brw_inst_imm_ud(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_F:
      /* The DIM instruction's src0 uses an F type but contains a
       * 64-bit immediate
       */
      if (brw_inst_opcode(isa, inst) == BRW_OPCODE_DIM) {
         format(file, "0x%"PRIx64"F", brw_inst_bits(inst, 127, 64));
         pad(file, 48);
         format(file, "/* %-gF */", brw_inst_imm_df(devinfo, inst));
      } else {
         format(file, "0x%"PRIx64"F", brw_inst_bits(inst, 127, 96));
         pad(file, 48);
         format(file, " /* %-gF */", brw_inst_imm_f(devinfo, inst));
      }
      break;
   case BRW_REGISTER_TYPE_DF:
      format(file, "0x%016"PRIx64"DF", brw_inst_imm_uq(devinfo, inst));
      pad(file, 48);
      format(file, "/* %-gDF */", brw_inst_imm_df(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_HF:
      format(file, "0x%04xHF",
             (uint16_t) brw_inst_imm_ud(devinfo, inst));
      pad(file, 48);
      format(file, "/* %-gHF */",
             _mesa_half_to_float((uint16_t) brw_inst_imm_ud(devinfo, inst)));
      break;
   case BRW_REGISTER_TYPE_NF:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_B:
      /* These types have no valid immediate encoding. */
      format(file, "*** invalid immediate type %d ", type);
   }
   return 0;
}

/* Print a direct-addressed SEND(S) payload source register. */
static int
src_sends_da(FILE *file,
             const struct intel_device_info *devinfo,
             enum brw_reg_type type,
             enum brw_reg_file _reg_file,
             unsigned _reg_nr,
             unsigned _reg_subnr)
{
   int err = 0;

   err |= reg(file, _reg_file, _reg_nr);
   if (err == -1)
      return 0;
   if (_reg_subnr)
      format(file, ".1");
   string(file, brw_reg_type_to_letters(type));

   return err;
}

/* Print a register-indirect SEND(S) payload source. */
static int
src_sends_ia(FILE *file,
             const struct intel_device_info *devinfo,
             enum brw_reg_type type,
             int _addr_imm,
             unsigned _addr_subreg_nr)
{
   string(file, "g[a0");
   if (_addr_subreg_nr)
      format(file, ".1");
   if (_addr_imm)
      format(file, " %d", _addr_imm);
   string(file, "]");
   string(file, brw_reg_type_to_letters(type));

   return 0;
}

/* Print an indirect message-descriptor source ("a0.N<0>UD"). */
static int
src_send_desc_ia(FILE *file,
                 const struct intel_device_info *devinfo,
                 unsigned _addr_subreg_nr)
{
   string(file, "a0");
   if (_addr_subreg_nr)
      format(file, ".%d", _addr_subreg_nr);
   format(file, "<0>UD");

   return 0;
}

/* Print src0, dispatching on split-send / immediate / access mode /
 * addressing mode to the appropriate helper.
 */
static int
src0(FILE *file, const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   if (is_split_send(devinfo, brw_inst_opcode(isa, inst))) {
      if (devinfo->ver >= 12) {
         return src_sends_da(file,
                             devinfo,
                             BRW_REGISTER_TYPE_UD,
                             brw_inst_send_src0_reg_file(devinfo, inst),
                             brw_inst_src0_da_reg_nr(devinfo, inst),
                             0);
      } else if (brw_inst_send_src0_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
         return src_sends_da(file,
                             devinfo,
                             BRW_REGISTER_TYPE_UD,
                             BRW_GENERAL_REGISTER_FILE,
                             brw_inst_src0_da_reg_nr(devinfo, inst),
                             brw_inst_src0_da16_subreg_nr(devinfo, inst));
      } else {
         return src_sends_ia(file,
                             devinfo,
                             BRW_REGISTER_TYPE_UD,
                             brw_inst_send_src0_ia16_addr_imm(devinfo, inst),
                             brw_inst_src0_ia_subreg_nr(devinfo, inst));
      }
   } else if (brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) {
      return imm(file, isa, brw_inst_src0_type(devinfo, inst), inst);
   } else if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      if (brw_inst_src0_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
         return src_da1(file,
                        devinfo,
                        brw_inst_opcode(isa, inst),
                        brw_inst_src0_type(devinfo, inst),
                        brw_inst_src0_reg_file(devinfo, inst),
                        brw_inst_src0_vstride(devinfo, inst),
                        brw_inst_src0_width(devinfo, inst),
                        brw_inst_src0_hstride(devinfo, inst),
                        brw_inst_src0_da_reg_nr(devinfo, inst),
                        brw_inst_src0_da1_subreg_nr(devinfo, inst),
                        brw_inst_src0_abs(devinfo, inst),
                        brw_inst_src0_negate(devinfo, inst));
      } else {
         return src_ia1(file,
                        devinfo,
                        brw_inst_opcode(isa, inst),
                        brw_inst_src0_type(devinfo, inst),
                        brw_inst_src0_ia1_addr_imm(devinfo, inst),
                        brw_inst_src0_ia_subreg_nr(devinfo, inst),
                        brw_inst_src0_negate(devinfo, inst),
                        brw_inst_src0_abs(devinfo, inst),
                        brw_inst_src0_hstride(devinfo, inst),
                        brw_inst_src0_width(devinfo, inst),
                        brw_inst_src0_vstride(devinfo, inst));
      }
   } else {
      if (brw_inst_src0_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
         return src_da16(file,
                         devinfo,
                         brw_inst_opcode(isa, inst),
                         brw_inst_src0_type(devinfo, inst),
                         brw_inst_src0_reg_file(devinfo, inst),
                         brw_inst_src0_vstride(devinfo, inst),
                         brw_inst_src0_da_reg_nr(devinfo, inst),
                         brw_inst_src0_da16_subreg_nr(devinfo, inst),
                         brw_inst_src0_abs(devinfo, inst),
                         brw_inst_src0_negate(devinfo, inst),
                         brw_inst_src0_da16_swiz_x(devinfo, inst),
                         brw_inst_src0_da16_swiz_y(devinfo, inst),
                         brw_inst_src0_da16_swiz_z(devinfo, inst),
                         brw_inst_src0_da16_swiz_w(devinfo, inst));
      } else {
         string(file, "Indirect align16 address mode not supported");
         return 1;
      }
   }
}

/* Print src1; same dispatch structure as src0 (split-send src1 is always
 * direct-addressed).
 */
static int
src1(FILE *file, const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   if (is_split_send(devinfo, brw_inst_opcode(isa, inst))) {
      return src_sends_da(file,
                          devinfo,
                          BRW_REGISTER_TYPE_UD,
                          brw_inst_send_src1_reg_file(devinfo, inst),
                          brw_inst_send_src1_reg_nr(devinfo, inst),
                          0 /* subreg_nr */);
   } else if (brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) {
      return imm(file, isa, brw_inst_src1_type(devinfo, inst), inst);
   } else if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      if (brw_inst_src1_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
         return src_da1(file,
                        devinfo,
                        brw_inst_opcode(isa, inst),
                        brw_inst_src1_type(devinfo, inst),
                        brw_inst_src1_reg_file(devinfo, inst),
                        brw_inst_src1_vstride(devinfo, inst),
                        brw_inst_src1_width(devinfo, inst),
                        brw_inst_src1_hstride(devinfo, inst),
                        brw_inst_src1_da_reg_nr(devinfo, inst),
                        brw_inst_src1_da1_subreg_nr(devinfo, inst),
                        brw_inst_src1_abs(devinfo, inst),
                        brw_inst_src1_negate(devinfo, inst));
      } else {
         return src_ia1(file,
                        devinfo,
                        brw_inst_opcode(isa, inst),
                        brw_inst_src1_type(devinfo, inst),
                        brw_inst_src1_ia1_addr_imm(devinfo, inst),
                        brw_inst_src1_ia_subreg_nr(devinfo, inst),
                        brw_inst_src1_negate(devinfo, inst),
                        brw_inst_src1_abs(devinfo, inst),
                        brw_inst_src1_hstride(devinfo, inst),
                        brw_inst_src1_width(devinfo, inst),
                        brw_inst_src1_vstride(devinfo, inst));
      }
   } else {
      if (brw_inst_src1_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
         return src_da16(file,
                         devinfo,
                         brw_inst_opcode(isa, inst),
                         brw_inst_src1_type(devinfo, inst),
                         brw_inst_src1_reg_file(devinfo, inst),
                         brw_inst_src1_vstride(devinfo, inst),
                         brw_inst_src1_da_reg_nr(devinfo, inst),
                         brw_inst_src1_da16_subreg_nr(devinfo, inst),
                         brw_inst_src1_abs(devinfo, inst),
                         brw_inst_src1_negate(devinfo, inst),
                         brw_inst_src1_da16_swiz_x(devinfo, inst),
                         brw_inst_src1_da16_swiz_y(devinfo, inst),
                         brw_inst_src1_da16_swiz_z(devinfo, inst),
                         brw_inst_src1_da16_swiz_w(devinfo, inst));
      } else {
         string(file, "Indirect align16 address mode not supported");
         return 1;
      }
   }
}

/* Print the quarter/nibble control suffix (" 1N".." 8N", " 1Q".." 4Q",
 * " 1H"/" 2H") implied by exec size and qtr/nib control.
 */
static int
qtr_ctrl(FILE *file, const struct intel_device_info *devinfo,
         const brw_inst *inst)
{
   int qtr_ctl = brw_inst_qtr_control(devinfo, inst);
   int exec_size = 1 << brw_inst_exec_size(devinfo, inst);
   /* Nibble control only exists on Gfx7..Xe; treat it as 0 elsewhere. */
   const unsigned nib_ctl = devinfo->ver < 7 || devinfo->ver >= 20 ? 0 :
                            brw_inst_nib_control(devinfo, inst);

   if (exec_size < 8 || nib_ctl) {
      format(file, " %dN", qtr_ctl * 2 + nib_ctl + 1);
   } else if (exec_size == 8) {
      switch (qtr_ctl) {
      case 0:
         string(file, " 1Q");
         break;
      case 1:
         string(file, " 2Q");
         break;
      case 2:
         string(file, " 3Q");
         break;
      case 3:
         string(file, " 4Q");
         break;
      }
   } else if (exec_size == 16) {
      if (qtr_ctl < 2)
         string(file, " 1H");
      else
         string(file, " 2H");
   }
   return 0;
}

/* Return true if the destination or any source of the instruction uses the
 * given register type.
 */
static bool
inst_has_type(const struct brw_isa_info *isa,
              const brw_inst *inst,
              enum brw_reg_type type)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   const unsigned num_sources = brw_num_sources_from_inst(isa, inst);

   if (brw_inst_dst_type(devinfo, inst) == type)
      return true;

   if (num_sources >= 3) {
      if (brw_inst_3src_access_mode(devinfo, inst) == BRW_ALIGN_1)
         return brw_inst_3src_a1_src0_type(devinfo, inst) == type ||
                brw_inst_3src_a1_src1_type(devinfo, inst) == type ||
                brw_inst_3src_a1_src2_type(devinfo, inst) == type;
      else
         return brw_inst_3src_a16_src_type(devinfo, inst) == type;
   } else if (num_sources == 2) {
      return brw_inst_src0_type(devinfo, inst) == type ||
             brw_inst_src1_type(devinfo, inst) == type;
   } else {
      return brw_inst_src0_type(devinfo, inst) == type;
   }
}

/* Print software-scoreboard (SWSB) annotations: an in-order regdist
 * dependency ("F@1", "A@2", ...) and/or an SBID token ("$5", "$5.dst",
 * "$5.src").
 */
static int
swsb(FILE *file, const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   const enum opcode opcode = brw_inst_opcode(isa, inst);
   const uint32_t x = brw_inst_swsb(devinfo, inst);
   /* Out-of-order (unordered) instructions decode the SWSB field
    * differently; DF counts when doubles go through the math pipe.
    */
   const bool is_unordered =
      opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC ||
      opcode == BRW_OPCODE_MATH || opcode == BRW_OPCODE_DPAS ||
      (devinfo->has_64bit_float_via_math_pipe &&
       inst_has_type(isa, inst, BRW_REGISTER_TYPE_DF));
   const struct tgl_swsb swsb = tgl_swsb_decode(devinfo, is_unordered, x);
   if (swsb.regdist)
      format(file, " %s@%d",
             (swsb.pipe == TGL_PIPE_FLOAT ? "F" :
              swsb.pipe == TGL_PIPE_INT ? "I" :
              swsb.pipe == TGL_PIPE_LONG ? "L" :
              swsb.pipe == TGL_PIPE_ALL ? "A" : "" ),
             swsb.regdist);
   if (swsb.mode)
      format(file, " $%d%s", swsb.sbid,
             (swsb.mode & TGL_SBID_SET ? "" :
              swsb.mode & TGL_SBID_DST ? ".dst" : ".src"));
   return 0;
}

#ifdef DEBUG
/* Debugger convenience: disassemble four raw instruction dwords to stderr. */
static __attribute__((__unused__)) int
brw_disassemble_imm(const struct brw_isa_info *isa,
                    uint32_t dw3, uint32_t dw2, uint32_t dw1, uint32_t dw0)
{
   brw_inst inst;
   inst.data[0] = (((uint64_t) dw1) << 32) | ((uint64_t) dw0);
   inst.data[1] = (((uint64_t) dw3) << 32) | ((uint64_t) dw2);
   return brw_disassemble_inst(stderr, isa, &inst, false, 0, NULL);
}
#endif

/* Print " LABELn" if a label exists at (offset + scaled jump distance). */
static void
write_label(FILE *file, const struct intel_device_info *devinfo,
            const struct brw_label *root_label,
            int offset, int jump)
{
   if (root_label != NULL) {
      /* Jump distances are in units of brw_jump_scale; convert to bytes. */
      int to_bytes_scale = sizeof(brw_inst) / brw_jump_scale(devinfo);
      const struct brw_label *label =
         brw_find_label(root_label, offset + jump * to_bytes_scale);
      if (label != NULL) {
         format(file, " LABEL%d", label->number);
      }
   }
}

/* Decode an LSC extended message descriptor according to the address
 * surface type encoded in the main descriptor.
 */
static void
lsc_disassemble_ex_desc(const struct intel_device_info *devinfo,
                        uint32_t imm_desc,
                        uint32_t imm_ex_desc,
                        FILE *file)
{
   const unsigned addr_type = lsc_msg_desc_addr_type(devinfo, imm_desc);
   switch (addr_type) {
   case LSC_ADDR_SURFTYPE_FLAT:
      format(file, " base_offset %u ",
             lsc_flat_ex_desc_base_offset(devinfo, imm_ex_desc));
      break;
   case LSC_ADDR_SURFTYPE_BSS:
   case LSC_ADDR_SURFTYPE_SS:
      format(file, " surface_state_index %u ",
             lsc_bss_ex_desc_index(devinfo, imm_ex_desc));
      break;
   case LSC_ADDR_SURFTYPE_BTI:
      format(file, " BTI %u ",
             lsc_bti_ex_desc_index(devinfo, imm_ex_desc));
      format(file, " base_offset %u ",
             lsc_bti_ex_desc_base_offset(devinfo, imm_ex_desc));
      break;
   default:
      format(file, "unsupported address surface type %d", addr_type);
      break;
   }
}

/* Return true for shared function IDs handled by the LSC (Gfx12.5+). */
static inline bool
brw_sfid_is_lsc(unsigned sfid)
{
   switch (sfid) {
   case
GFX12_SFID_UGM: + case GFX12_SFID_SLM: + case GFX12_SFID_TGM: + return true; + default: + break; + } + + return false; +} + +int +brw_disassemble_inst(FILE *file, const struct brw_isa_info *isa, + const brw_inst *inst, bool is_compacted, + int offset, const struct brw_label *root_label) +{ + const struct intel_device_info *devinfo = isa->devinfo; + + int err = 0; + int space = 0; + + const enum opcode opcode = brw_inst_opcode(isa, inst); + const struct opcode_desc *desc = brw_opcode_desc(isa, opcode); + + if (brw_inst_pred_control(devinfo, inst)) { + string(file, "("); + err |= control(file, "predicate inverse", pred_inv, + brw_inst_pred_inv(devinfo, inst), NULL); + format(file, "f%"PRIu64".%"PRIu64, + devinfo->ver >= 7 ? brw_inst_flag_reg_nr(devinfo, inst) : 0, + brw_inst_flag_subreg_nr(devinfo, inst)); + if (devinfo->ver >= 20) { + err |= control(file, "predicate control", xe2_pred_ctrl, + brw_inst_pred_control(devinfo, inst), NULL); + } else if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + err |= control(file, "predicate control align1", pred_ctrl_align1, + brw_inst_pred_control(devinfo, inst), NULL); + } else { + err |= control(file, "predicate control align16", pred_ctrl_align16, + brw_inst_pred_control(devinfo, inst), NULL); + } + string(file, ") "); + } + + err |= print_opcode(file, isa, opcode); + + if (!is_send(opcode)) + err |= control(file, "saturate", saturate, brw_inst_saturate(devinfo, inst), + NULL); + + err |= control(file, "debug control", debug_ctrl, + brw_inst_debug_control(devinfo, inst), NULL); + + if (opcode == BRW_OPCODE_MATH) { + string(file, " "); + err |= control(file, "function", math_function, + brw_inst_math_function(devinfo, inst), NULL); + + } else if (opcode == BRW_OPCODE_SYNC) { + string(file, " "); + err |= control(file, "function", sync_function, + brw_inst_cond_modifier(devinfo, inst), NULL); + + } else if (opcode == BRW_OPCODE_DPAS) { + string(file, "."); + + err |= control(file, "systolic depth", dpas_systolic_depth, 
+ brw_inst_dpas_3src_sdepth(devinfo, inst), NULL); + + const unsigned rcount = brw_inst_dpas_3src_rcount(devinfo, inst) + 1; + + format(file, "x%d", rcount); + } else if (!is_send(opcode) && + (devinfo->ver < 12 || + brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE || + type_sz(brw_inst_src0_type(devinfo, inst)) < 8)) { + err |= control(file, "conditional modifier", conditional_modifier, + brw_inst_cond_modifier(devinfo, inst), NULL); + + /* If we're using the conditional modifier, print which flags reg is + * used for it. Note that on gfx6+, the embedded-condition SEL and + * control flow doesn't update flags. + */ + if (brw_inst_cond_modifier(devinfo, inst) && + (devinfo->ver < 6 || (opcode != BRW_OPCODE_SEL && + opcode != BRW_OPCODE_CSEL && + opcode != BRW_OPCODE_IF && + opcode != BRW_OPCODE_WHILE))) { + format(file, ".f%"PRIu64".%"PRIu64, + devinfo->ver >= 7 ? brw_inst_flag_reg_nr(devinfo, inst) : 0, + brw_inst_flag_subreg_nr(devinfo, inst)); + } + } + + if (opcode != BRW_OPCODE_NOP && opcode != BRW_OPCODE_NENOP) { + string(file, "("); + err |= control(file, "execution size", exec_size, + brw_inst_exec_size(devinfo, inst), NULL); + string(file, ")"); + } + + if (opcode == BRW_OPCODE_SEND && devinfo->ver < 6) + format(file, " %"PRIu64, brw_inst_base_mrf(devinfo, inst)); + + if (brw_has_uip(devinfo, opcode)) { + /* Instructions that have UIP also have JIP. 
*/ + pad(file, 16); + string(file, "JIP: "); + write_label(file, devinfo, root_label, offset, brw_inst_jip(devinfo, inst)); + + pad(file, 38); + string(file, "UIP: "); + write_label(file, devinfo, root_label, offset, brw_inst_uip(devinfo, inst)); + } else if (brw_has_jip(devinfo, opcode)) { + int jip; + if (devinfo->ver >= 7) { + jip = brw_inst_jip(devinfo, inst); + } else { + jip = brw_inst_gfx6_jump_count(devinfo, inst); + } + + pad(file, 16); + string(file, "JIP: "); + write_label(file, devinfo, root_label, offset, jip); + } else if (devinfo->ver < 6 && (opcode == BRW_OPCODE_BREAK || + opcode == BRW_OPCODE_CONTINUE || + opcode == BRW_OPCODE_ELSE)) { + pad(file, 16); + format(file, "Jump: %d", brw_inst_gfx4_jump_count(devinfo, inst)); + pad(file, 32); + format(file, "Pop: %"PRIu64, brw_inst_gfx4_pop_count(devinfo, inst)); + } else if (devinfo->ver < 6 && (opcode == BRW_OPCODE_IF || + opcode == BRW_OPCODE_IFF || + opcode == BRW_OPCODE_HALT || + opcode == BRW_OPCODE_WHILE)) { + pad(file, 16); + format(file, "Jump: %d", brw_inst_gfx4_jump_count(devinfo, inst)); + } else if (devinfo->ver < 6 && opcode == BRW_OPCODE_ENDIF) { + pad(file, 16); + format(file, "Pop: %"PRIu64, brw_inst_gfx4_pop_count(devinfo, inst)); + } else if (opcode == BRW_OPCODE_JMPI) { + pad(file, 16); + err |= src1(file, isa, inst); + } else if (opcode == BRW_OPCODE_DPAS) { + pad(file, 16); + err |= dest_dpas_3src(file, devinfo, inst); + + pad(file, 32); + err |= src0_dpas_3src(file, devinfo, inst); + + pad(file, 48); + err |= src1_dpas_3src(file, devinfo, inst); + + pad(file, 64); + err |= src2_dpas_3src(file, devinfo, inst); + + } else if (desc && desc->nsrc == 3) { + pad(file, 16); + err |= dest_3src(file, devinfo, inst); + + pad(file, 32); + err |= src0_3src(file, devinfo, inst); + + pad(file, 48); + err |= src1_3src(file, devinfo, inst); + + pad(file, 64); + err |= src2_3src(file, devinfo, inst); + } else if (desc) { + if (desc->ndst > 0) { + pad(file, 16); + err |= dest(file, isa, inst); + } + 
+ if (desc->nsrc > 0) { + pad(file, 32); + err |= src0(file, isa, inst); + } + + if (desc->nsrc > 1) { + pad(file, 48); + err |= src1(file, isa, inst); + } + } + + if (is_send(opcode)) { + enum brw_message_target sfid = brw_inst_sfid(devinfo, inst); + + bool has_imm_desc = false, has_imm_ex_desc = false; + uint32_t imm_desc = 0, imm_ex_desc = 0; + if (is_split_send(devinfo, opcode)) { + pad(file, 64); + if (brw_inst_send_sel_reg32_desc(devinfo, inst)) { + /* show the indirect descriptor source */ + err |= src_send_desc_ia(file, devinfo, 0); + } else { + has_imm_desc = true; + imm_desc = brw_inst_send_desc(devinfo, inst); + fprintf(file, "0x%08"PRIx32, imm_desc); + } + + pad(file, 80); + if (brw_inst_send_sel_reg32_ex_desc(devinfo, inst)) { + /* show the indirect descriptor source */ + err |= src_send_desc_ia(file, devinfo, + brw_inst_send_ex_desc_ia_subreg_nr(devinfo, inst)); + } else { + has_imm_ex_desc = true; + imm_ex_desc = brw_inst_sends_ex_desc(devinfo, inst); + fprintf(file, "0x%08"PRIx32, imm_ex_desc); + } + } else { + if (brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE) { + /* show the indirect descriptor source */ + pad(file, 48); + err |= src1(file, isa, inst); + pad(file, 64); + } else { + has_imm_desc = true; + imm_desc = brw_inst_send_desc(devinfo, inst); + pad(file, 48); + } + + /* Print message descriptor as immediate source */ + fprintf(file, "0x%08"PRIx64, inst->data[1] >> 32); + } + + newline(file); + pad(file, 16); + space = 0; + + fprintf(file, " "); + err |= control(file, "SFID", devinfo->ver >= 6 ? 
gfx6_sfid : gfx4_sfid, + sfid, &space); + string(file, " MsgDesc:"); + + if (!has_imm_desc) { + format(file, " indirect"); + } else { + bool unsupported = false; + switch (sfid) { + case BRW_SFID_MATH: + err |= control(file, "math function", math_function, + brw_inst_math_msg_function(devinfo, inst), &space); + err |= control(file, "math saturate", math_saturate, + brw_inst_math_msg_saturate(devinfo, inst), &space); + err |= control(file, "math signed", math_signed, + brw_inst_math_msg_signed_int(devinfo, inst), &space); + err |= control(file, "math scalar", math_scalar, + brw_inst_math_msg_data_type(devinfo, inst), &space); + err |= control(file, "math precision", math_precision, + brw_inst_math_msg_precision(devinfo, inst), &space); + break; + case BRW_SFID_SAMPLER: + if (devinfo->ver >= 20) { + err |= control(file, "sampler message", xe2_sampler_msg_type, + brw_sampler_desc_msg_type(devinfo, imm_desc), + &space); + err |= control(file, "sampler simd mode", xe2_sampler_simd_mode, + brw_sampler_desc_simd_mode(devinfo, imm_desc), + &space); + if (brw_sampler_desc_return_format(devinfo, imm_desc)) { + string(file, " HP"); + } + format(file, " Surface = %u Sampler = %u", + brw_sampler_desc_binding_table_index(devinfo, imm_desc), + brw_sampler_desc_sampler(devinfo, imm_desc)); + } else if (devinfo->ver >= 5) { + err |= control(file, "sampler message", gfx5_sampler_msg_type, + brw_sampler_desc_msg_type(devinfo, imm_desc), + &space); + err |= control(file, "sampler simd mode", gfx5_sampler_simd_mode, + brw_sampler_desc_simd_mode(devinfo, imm_desc), + &space); + if (devinfo->ver >= 8 && + brw_sampler_desc_return_format(devinfo, imm_desc)) { + string(file, " HP"); + } + format(file, " Surface = %u Sampler = %u", + brw_sampler_desc_binding_table_index(devinfo, imm_desc), + brw_sampler_desc_sampler(devinfo, imm_desc)); + } else { + format(file, " (bti %u, sampler %u, msg_type %u, ", + brw_sampler_desc_binding_table_index(devinfo, imm_desc), + 
brw_sampler_desc_sampler(devinfo, imm_desc), + brw_sampler_desc_msg_type(devinfo, imm_desc)); + if (devinfo->verx10 != 45) { + err |= control(file, "sampler target format", + sampler_target_format, + brw_sampler_desc_return_format(devinfo, imm_desc), + NULL); + } + string(file, ")"); + } + break; + case GFX6_SFID_DATAPORT_SAMPLER_CACHE: + case GFX6_SFID_DATAPORT_CONSTANT_CACHE: + /* aka BRW_SFID_DATAPORT_READ on Gfx4-5 */ + if (devinfo->ver >= 6) { + format(file, " (bti %u, msg_ctrl %u, msg_type %u, write_commit %u)", + brw_dp_desc_binding_table_index(devinfo, imm_desc), + brw_dp_desc_msg_control(devinfo, imm_desc), + brw_dp_desc_msg_type(devinfo, imm_desc), + devinfo->ver >= 7 ? 0u : + brw_dp_write_desc_write_commit(devinfo, imm_desc)); + } else { + bool is_965 = devinfo->verx10 == 40; + err |= control(file, "DP read message type", + is_965 ? gfx4_dp_read_port_msg_type : + g45_dp_read_port_msg_type, + brw_dp_read_desc_msg_type(devinfo, imm_desc), + &space); + + format(file, " MsgCtrl = 0x%u", + brw_dp_read_desc_msg_control(devinfo, imm_desc)); + + format(file, " Surface = %u", + brw_dp_desc_binding_table_index(devinfo, imm_desc)); + } + break; + + case GFX6_SFID_DATAPORT_RENDER_CACHE: { + /* aka BRW_SFID_DATAPORT_WRITE on Gfx4-5 */ + unsigned msg_type = brw_fb_write_desc_msg_type(devinfo, imm_desc); + + err |= control(file, "DP rc message type", + dp_rc_msg_type(devinfo), msg_type, &space); + + bool is_rt_write = msg_type == + (devinfo->ver >= 6 ? 
GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE + : BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE); + + if (is_rt_write) { + err |= control(file, "RT message type", m_rt_write_subtype, + brw_inst_rt_message_type(devinfo, inst), &space); + if (devinfo->ver >= 6 && brw_inst_rt_slot_group(devinfo, inst)) + string(file, " Hi"); + if (brw_fb_write_desc_last_render_target(devinfo, imm_desc)) + string(file, " LastRT"); + if (devinfo->ver >= 10 && + brw_fb_write_desc_coarse_write(devinfo, imm_desc)) + string(file, " CoarseWrite"); + if (devinfo->ver < 7 && + brw_fb_write_desc_write_commit(devinfo, imm_desc)) + string(file, " WriteCommit"); + } else { + format(file, " MsgCtrl = 0x%u", + brw_fb_write_desc_msg_control(devinfo, imm_desc)); + } + + format(file, " Surface = %u", + brw_fb_desc_binding_table_index(devinfo, imm_desc)); + break; + } + + case BRW_SFID_URB: { + if (devinfo->ver >= 20) { + format(file, " ("); + const enum lsc_opcode op = lsc_msg_desc_opcode(devinfo, imm_desc); + err |= control(file, "operation", lsc_operation, + op, &space); + format(file, ","); + err |= control(file, "addr_size", lsc_addr_size, + lsc_msg_desc_addr_size(devinfo, imm_desc), + &space); + + format(file, ","); + err |= control(file, "data_size", lsc_data_size, + lsc_msg_desc_data_size(devinfo, imm_desc), + &space); + format(file, ","); + if (lsc_opcode_has_cmask(op)) { + err |= control(file, "component_mask", + lsc_cmask_str, + lsc_msg_desc_cmask(devinfo, imm_desc), + &space); + } else { + err |= control(file, "vector_size", + lsc_vect_size_str, + lsc_msg_desc_vect_size(devinfo, imm_desc), + &space); + if (lsc_msg_desc_transpose(devinfo, imm_desc)) + format(file, ", transpose"); + } + switch(op) { + case LSC_OP_LOAD_CMASK: + case LSC_OP_LOAD: + format(file, ","); + err |= control(file, "cache_load", + lsc_cache_load, + lsc_msg_desc_cache_ctrl(devinfo, imm_desc), + &space); + break; + default: + format(file, ","); + err |= control(file, "cache_store", + lsc_cache_store, + 
lsc_msg_desc_cache_ctrl(devinfo, imm_desc), + &space); + break; + } + + format(file, " dst_len = %u,", lsc_msg_desc_dest_len(devinfo, imm_desc)); + format(file, " src0_len = %u,", lsc_msg_desc_src0_len(devinfo, imm_desc)); + format(file, " src1_len = %d", brw_message_ex_desc_ex_mlen(devinfo, imm_ex_desc)); + err |= control(file, "address_type", lsc_addr_surface_type, + lsc_msg_desc_addr_type(devinfo, imm_desc), &space); + format(file, " )"); + } else { + unsigned urb_opcode = brw_inst_urb_opcode(devinfo, inst); + + format(file, " offset %"PRIu64, brw_inst_urb_global_offset(devinfo, inst)); + + space = 1; + + err |= control(file, "urb opcode", + devinfo->ver >= 7 ? gfx7_urb_opcode + : gfx5_urb_opcode, + urb_opcode, &space); + + if (devinfo->ver >= 7 && + brw_inst_urb_per_slot_offset(devinfo, inst)) { + string(file, " per-slot"); + } + + if (urb_opcode == GFX8_URB_OPCODE_SIMD8_WRITE || + urb_opcode == GFX8_URB_OPCODE_SIMD8_READ) { + if (brw_inst_urb_channel_mask_present(devinfo, inst)) + string(file, " masked"); + } else if (urb_opcode != GFX125_URB_OPCODE_FENCE) { + err |= control(file, "urb swizzle", urb_swizzle, + brw_inst_urb_swizzle_control(devinfo, inst), + &space); + } + + if (devinfo->ver < 7) { + err |= control(file, "urb allocate", urb_allocate, + brw_inst_urb_allocate(devinfo, inst), &space); + err |= control(file, "urb used", urb_used, + brw_inst_urb_used(devinfo, inst), &space); + } + if (devinfo->ver < 8) { + err |= control(file, "urb complete", urb_complete, + brw_inst_urb_complete(devinfo, inst), &space); + } + } + break; + } + case BRW_SFID_THREAD_SPAWNER: + break; + + case BRW_SFID_MESSAGE_GATEWAY: + format(file, " (%s)", + gfx7_gateway_subfuncid[brw_inst_gateway_subfuncid(devinfo, inst)]); + break; + + case GFX12_SFID_SLM: + case GFX12_SFID_TGM: + case GFX12_SFID_UGM: { + assert(devinfo->has_lsc); + format(file, " ("); + const enum lsc_opcode op = lsc_msg_desc_opcode(devinfo, imm_desc); + err |= control(file, "operation", lsc_operation, + op, 
&space); + format(file, ","); + err |= control(file, "addr_size", lsc_addr_size, + lsc_msg_desc_addr_size(devinfo, imm_desc), + &space); + + if (op == LSC_OP_FENCE) { + format(file, ","); + err |= control(file, "scope", lsc_fence_scope, + lsc_fence_msg_desc_scope(devinfo, imm_desc), + &space); + format(file, ","); + err |= control(file, "flush_type", lsc_flush_type, + lsc_fence_msg_desc_flush_type(devinfo, imm_desc), + &space); + format(file, ","); + err |= control(file, "backup_mode_fence_routing", + lsc_backup_fence_routing, + lsc_fence_msg_desc_backup_routing(devinfo, imm_desc), + &space); + } else { + format(file, ","); + err |= control(file, "data_size", lsc_data_size, + lsc_msg_desc_data_size(devinfo, imm_desc), + &space); + format(file, ","); + if (lsc_opcode_has_cmask(op)) { + err |= control(file, "component_mask", + lsc_cmask_str, + lsc_msg_desc_cmask(devinfo, imm_desc), + &space); + } else { + err |= control(file, "vector_size", + lsc_vect_size_str, + lsc_msg_desc_vect_size(devinfo, imm_desc), + &space); + if (lsc_msg_desc_transpose(devinfo, imm_desc)) + format(file, ", transpose"); + } + switch(op) { + case LSC_OP_LOAD_CMASK: + case LSC_OP_LOAD: + format(file, ","); + err |= control(file, "cache_load", + devinfo->ver >= 20 ? + xe2_lsc_cache_load : + lsc_cache_load, + lsc_msg_desc_cache_ctrl(devinfo, imm_desc), + &space); + break; + default: + format(file, ","); + err |= control(file, "cache_store", + devinfo->ver >= 20 ? 
+ xe2_lsc_cache_store : + lsc_cache_store, + lsc_msg_desc_cache_ctrl(devinfo, imm_desc), + &space); + break; + } + } + format(file, " dst_len = %u,", lsc_msg_desc_dest_len(devinfo, imm_desc)); + format(file, " src0_len = %u,", lsc_msg_desc_src0_len(devinfo, imm_desc)); + + if (!brw_inst_send_sel_reg32_ex_desc(devinfo, inst)) + format(file, " src1_len = %d", + brw_message_ex_desc_ex_mlen(devinfo, imm_ex_desc)); + + err |= control(file, "address_type", lsc_addr_surface_type, + lsc_msg_desc_addr_type(devinfo, imm_desc), &space); + format(file, " )"); + break; + } + + case GFX7_SFID_DATAPORT_DATA_CACHE: + if (devinfo->ver >= 7) { + format(file, " ("); + space = 0; + + err |= control(file, "DP DC0 message type", + dp_dc0_msg_type_gfx7, + brw_dp_desc_msg_type(devinfo, imm_desc), &space); + + format(file, ", bti %u, ", + brw_dp_desc_binding_table_index(devinfo, imm_desc)); + + switch (brw_inst_dp_msg_type(devinfo, inst)) { + case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP: + control(file, "atomic op", aop, + brw_dp_desc_msg_control(devinfo, imm_desc) & 0xf, + &space); + break; + case GFX7_DATAPORT_DC_OWORD_BLOCK_READ: + case GFX7_DATAPORT_DC_OWORD_BLOCK_WRITE: { + unsigned msg_ctrl = brw_dp_desc_msg_control(devinfo, imm_desc); + assert(dp_oword_block_rw[msg_ctrl & 7]); + format(file, "owords = %s, aligned = %d", + dp_oword_block_rw[msg_ctrl & 7], (msg_ctrl >> 3) & 3); + break; + } + default: + format(file, "%u", + brw_dp_desc_msg_control(devinfo, imm_desc)); + } + format(file, ")"); + } else { + unsupported = true; + } + break; + + case HSW_SFID_DATAPORT_DATA_CACHE_1: { + if (devinfo->ver >= 7) { + format(file, " ("); + space = 0; + + unsigned msg_ctrl = brw_dp_desc_msg_control(devinfo, imm_desc); + + err |= control(file, "DP DC1 message type", + dp_dc1_msg_type_hsw, + brw_dp_desc_msg_type(devinfo, imm_desc), &space); + + format(file, ", Surface = %u, ", + brw_dp_desc_binding_table_index(devinfo, imm_desc)); + + switch (brw_inst_dp_msg_type(devinfo, inst)) { + case 
HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP: + case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP: + case HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP: + format(file, "SIMD%d,", (msg_ctrl & (1 << 4)) ? 8 : 16); + FALLTHROUGH; + case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2: + case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2: + case HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2: + case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP: + case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_INT_OP: + control(file, "atomic op", aop, msg_ctrl & 0xf, &space); + break; + case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ: + case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE: + case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ: + case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE: + case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE: + case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ: { + static const char *simd_modes[] = { "4x2", "16", "8" }; + format(file, "SIMD%s, Mask = 0x%x", + simd_modes[msg_ctrl >> 4], msg_ctrl & 0xf); + break; + } + case GFX9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP: + case GFX9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP: + case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_FLOAT_OP: + format(file, "SIMD%d,", (msg_ctrl & (1 << 4)) ? 8 : 16); + control(file, "atomic float op", aop_float, msg_ctrl & 0xf, + &space); + break; + case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE: + case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ: + assert(dp_oword_block_rw[msg_ctrl & 7]); + format(file, "owords = %s, aligned = %d", + dp_oword_block_rw[msg_ctrl & 7], (msg_ctrl >> 3) & 3); + break; + default: + format(file, "0x%x", msg_ctrl); + } + format(file, ")"); + } else { + unsupported = true; + } + break; + } + + case GFX7_SFID_PIXEL_INTERPOLATOR: + if (devinfo->ver >= 7) { + format(file, " (%s, %s, 0x%02"PRIx64")", + brw_inst_pi_nopersp(devinfo, inst) ? 
"linear" : "persp", + pixel_interpolator_msg_types[brw_inst_pi_message_type(devinfo, inst)], + brw_inst_pi_message_data(devinfo, inst)); + } else { + unsupported = true; + } + break; + + case GEN_RT_SFID_RAY_TRACE_ACCELERATOR: + if (devinfo->has_ray_tracing) { + format(file, " SIMD%d,", + brw_rt_trace_ray_desc_exec_size(devinfo, imm_desc)); + } else { + unsupported = true; + } + break; + + default: + unsupported = true; + break; + } + + if (unsupported) + format(file, "unsupported shared function ID %d", sfid); + + if (space) + string(file, " "); + } + if (devinfo->verx10 >= 125 && + brw_inst_send_sel_reg32_ex_desc(devinfo, inst) && + brw_inst_send_ex_bso(devinfo, inst)) { + format(file, " src1_len = %u", + (unsigned) brw_inst_send_src1_len(devinfo, inst)); + + format(file, " ex_bso"); + } + if (brw_sfid_is_lsc(sfid) || + (sfid == BRW_SFID_URB && devinfo->ver >= 20)) { + lsc_disassemble_ex_desc(devinfo, imm_desc, imm_ex_desc, file); + } else { + if (has_imm_desc) + format(file, " mlen %u", brw_message_desc_mlen(devinfo, imm_desc)); + if (has_imm_ex_desc) { + format(file, " ex_mlen %u", + brw_message_ex_desc_ex_mlen(devinfo, imm_ex_desc)); + } + if (has_imm_desc) + format(file, " rlen %u", brw_message_desc_rlen(devinfo, imm_desc)); + } + } + pad(file, 64); + if (opcode != BRW_OPCODE_NOP && opcode != BRW_OPCODE_NENOP) { + string(file, "{"); + space = 1; + err |= control(file, "access mode", access_mode, + brw_inst_access_mode(devinfo, inst), &space); + if (devinfo->ver >= 6) { + err |= control(file, "write enable control", wectrl, + brw_inst_mask_control(devinfo, inst), &space); + } else { + err |= control(file, "mask control", mask_ctrl, + brw_inst_mask_control(devinfo, inst), &space); + } + + if (devinfo->ver < 12) { + err |= control(file, "dependency control", dep_ctrl, + ((brw_inst_no_dd_check(devinfo, inst) << 1) | + brw_inst_no_dd_clear(devinfo, inst)), &space); + } + + if (devinfo->ver >= 6) + err |= qtr_ctrl(file, devinfo, inst); + else { + if 
(brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_COMPRESSED && + desc && desc->ndst > 0 && + brw_inst_dst_reg_file(devinfo, inst) == BRW_MESSAGE_REGISTER_FILE && + brw_inst_dst_da_reg_nr(devinfo, inst) & BRW_MRF_COMPR4) { + format(file, " compr4"); + } else { + err |= control(file, "compression control", compr_ctrl, + brw_inst_qtr_control(devinfo, inst), &space); + } + } + + if (devinfo->ver >= 12) + err |= swsb(file, isa, inst); + + err |= control(file, "compaction", cmpt_ctrl, is_compacted, &space); + err |= control(file, "thread control", thread_ctrl, + (devinfo->ver >= 12 ? brw_inst_atomic_control(devinfo, inst) : + brw_inst_thread_control(devinfo, inst)), + &space); + if (has_branch_ctrl(devinfo, opcode)) { + err |= control(file, "branch ctrl", branch_ctrl, + brw_inst_branch_control(devinfo, inst), &space); + } else if (devinfo->ver >= 6 && devinfo->ver < 20) { + err |= control(file, "acc write control", accwr, + brw_inst_acc_wr_control(devinfo, inst), &space); + } + if (is_send(opcode)) + err |= control(file, "end of thread", end_of_thread, + brw_inst_eot(devinfo, inst), &space); + if (space) + string(file, " "); + string(file, "}"); + } + string(file, ";"); + newline(file); + return err; +} + +int +brw_disassemble_find_end(const struct brw_isa_info *isa, + const void *assembly, int start) +{ + const struct intel_device_info *devinfo = isa->devinfo; + int offset = start; + + /* This loop exits when send-with-EOT or when opcode is 0 */ + while (true) { + const brw_inst *insn = assembly + offset; + + if (brw_inst_cmpt_control(devinfo, insn)) { + offset += 8; + } else { + offset += 16; + } + + /* Simplistic, but efficient way to terminate disasm */ + uint32_t opcode = brw_inst_opcode(isa, insn); + if (opcode == 0 || (is_send(opcode) && brw_inst_eot(devinfo, insn))) { + break; + } + } + + return offset; +} + +void +brw_disassemble_with_errors(const struct brw_isa_info *isa, + const void *assembly, int start, FILE *out) +{ + int end = 
brw_disassemble_find_end(isa, assembly, start); + + /* Make a dummy disasm structure that brw_validate_instructions + * can work from. + */ + struct disasm_info *disasm_info = disasm_initialize(isa, NULL); + disasm_new_inst_group(disasm_info, start); + disasm_new_inst_group(disasm_info, end); + + brw_validate_instructions(isa, assembly, start, end, disasm_info); + + void *mem_ctx = ralloc_context(NULL); + const struct brw_label *root_label = + brw_label_assembly(isa, assembly, start, end, mem_ctx); + + foreach_list_typed(struct inst_group, group, link, + &disasm_info->group_list) { + struct exec_node *next_node = exec_node_get_next(&group->link); + if (exec_node_is_tail_sentinel(next_node)) + break; + + struct inst_group *next = + exec_node_data(struct inst_group, next_node, link); + + int start_offset = group->offset; + int end_offset = next->offset; + + brw_disassemble(isa, assembly, start_offset, end_offset, + root_label, out); + + if (group->error) { + fputs(group->error, out); + } + } + + ralloc_free(mem_ctx); + ralloc_free(disasm_info); +} diff --git a/src/intel/compiler/elk/brw_disasm.h b/src/intel/compiler/elk/brw_disasm.h new file mode 100644 index 00000000000..3ebfcfd3051 --- /dev/null +++ b/src/intel/compiler/elk/brw_disasm.h @@ -0,0 +1,42 @@ +/* + * Copyright 2024 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#ifndef BRW_DISASM_H +#define BRW_DISASM_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct brw_isa_info; +struct brw_inst; + +const struct brw_label *brw_find_label(const struct brw_label *root, int offset); +void brw_create_label(struct brw_label **labels, int offset, void *mem_ctx); +int brw_disassemble_inst(FILE *file, const struct brw_isa_info *isa, + const struct brw_inst *inst, bool is_compacted, + int offset, const struct brw_label *root_label); +const struct +brw_label *brw_label_assembly(const struct brw_isa_info *isa, + const void *assembly, int start, int end, + void *mem_ctx); +void 
brw_disassemble_with_labels(const struct brw_isa_info *isa, + const void *assembly, int start, int end, FILE *out); +void brw_disassemble(const struct brw_isa_info *isa, + const void *assembly, int start, int end, + const struct brw_label *root_label, FILE *out); +int brw_disassemble_find_end(const struct brw_isa_info *isa, + const void *assembly, int start); +void brw_disassemble_with_errors(const struct brw_isa_info *isa, + const void *assembly, int start, FILE *out); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* BRW_DISASM_H */ diff --git a/src/intel/compiler/elk/brw_disasm_info.c b/src/intel/compiler/elk/brw_disasm_info.c new file mode 100644 index 00000000000..cb9a2e42233 --- /dev/null +++ b/src/intel/compiler/elk/brw_disasm_info.c @@ -0,0 +1,207 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_cfg.h" +#include "brw_eu.h" +#include "brw_disasm.h" +#include "brw_disasm_info.h" +#include "dev/intel_debug.h" +#include "compiler/nir/nir.h" + +__attribute__((weak)) void nir_print_instr(UNUSED const nir_instr *instr, + UNUSED FILE *fp) {} + +void +dump_assembly(void *assembly, int start_offset, int end_offset, + struct disasm_info *disasm, const unsigned *block_latency) +{ + const struct brw_isa_info *isa = disasm->isa; + const char *last_annotation_string = NULL; + const void *last_annotation_ir = NULL; + + void *mem_ctx = ralloc_context(NULL); + const struct brw_label *root_label = + brw_label_assembly(isa, assembly, start_offset, end_offset, mem_ctx); + + foreach_list_typed(struct inst_group, group, link, &disasm->group_list) { + struct exec_node *next_node = exec_node_get_next(&group->link); + if (exec_node_is_tail_sentinel(next_node)) + break; + + struct inst_group *next = + exec_node_data(struct inst_group, next_node, link); + + int start_offset = group->offset; + int end_offset = next->offset; + + if (group->block_start) { + fprintf(stderr, " START B%d", group->block_start->num); + foreach_list_typed(struct bblock_link, predecessor_link, link, + &group->block_start->parents) { + struct bblock_t *predecessor_block = predecessor_link->block; + fprintf(stderr, " <-B%d", predecessor_block->num); + } + if (block_latency) + fprintf(stderr, " (%u cycles)", + block_latency[group->block_start->num]); + fprintf(stderr, "\n"); + } + + if (last_annotation_ir != group->ir) { + last_annotation_ir = group->ir; + if (last_annotation_ir) { + fprintf(stderr, " "); + nir_print_instr(group->ir, stderr); + fprintf(stderr, "\n"); + } + } + + if (last_annotation_string != group->annotation) { + last_annotation_string = group->annotation; + if (last_annotation_string) + fprintf(stderr, " %s\n", last_annotation_string); + } + + brw_disassemble(isa, assembly, start_offset, end_offset, + root_label, stderr); + + if (group->error) { + fputs(group->error, 
stderr); + } + + if (group->block_end) { + fprintf(stderr, " END B%d", group->block_end->num); + foreach_list_typed(struct bblock_link, successor_link, link, + &group->block_end->children) { + struct bblock_t *successor_block = successor_link->block; + fprintf(stderr, " ->B%d", successor_block->num); + } + fprintf(stderr, "\n"); + } + } + fprintf(stderr, "\n"); + + ralloc_free(mem_ctx); +} + +struct disasm_info * +disasm_initialize(const struct brw_isa_info *isa, + const struct cfg_t *cfg) +{ + struct disasm_info *disasm = ralloc(NULL, struct disasm_info); + exec_list_make_empty(&disasm->group_list); + disasm->isa = isa; + disasm->cfg = cfg; + disasm->cur_block = 0; + disasm->use_tail = false; + return disasm; +} + +struct inst_group * +disasm_new_inst_group(struct disasm_info *disasm, unsigned next_inst_offset) +{ + struct inst_group *tail = rzalloc(disasm, struct inst_group); + tail->offset = next_inst_offset; + exec_list_push_tail(&disasm->group_list, &tail->link); + return tail; +} + +void +disasm_annotate(struct disasm_info *disasm, + struct backend_instruction *inst, unsigned offset) +{ + const struct intel_device_info *devinfo = disasm->isa->devinfo; + const struct cfg_t *cfg = disasm->cfg; + + struct inst_group *group; + if (!disasm->use_tail) { + group = disasm_new_inst_group(disasm, offset); + } else { + disasm->use_tail = false; + group = exec_node_data(struct inst_group, + exec_list_get_tail_raw(&disasm->group_list), link); + } + + if (INTEL_DEBUG(DEBUG_ANNOTATION)) { + group->ir = inst->ir; + group->annotation = inst->annotation; + } + + if (bblock_start(cfg->blocks[disasm->cur_block]) == inst) { + group->block_start = cfg->blocks[disasm->cur_block]; + } + + /* There is no hardware DO instruction on Gfx6+, so since DO always + * starts a basic block, we need to set the .block_start of the next + * instruction's annotation with a pointer to the bblock started by + * the DO. 
+ * + * There's also only complication from emitting an annotation without + * a corresponding hardware instruction to disassemble. + */ + if (devinfo->ver >= 6 && inst->opcode == BRW_OPCODE_DO) { + disasm->use_tail = true; + } + + if (bblock_end(cfg->blocks[disasm->cur_block]) == inst) { + group->block_end = cfg->blocks[disasm->cur_block]; + disasm->cur_block++; + } +} + +void +disasm_insert_error(struct disasm_info *disasm, unsigned offset, + unsigned inst_size, const char *error) +{ + foreach_list_typed(struct inst_group, cur, link, &disasm->group_list) { + struct exec_node *next_node = exec_node_get_next(&cur->link); + if (exec_node_is_tail_sentinel(next_node)) + break; + + struct inst_group *next = + exec_node_data(struct inst_group, next_node, link); + + if (next->offset <= offset) + continue; + + if (offset + inst_size != next->offset) { + struct inst_group *new = ralloc(disasm, struct inst_group); + memcpy(new, cur, sizeof(struct inst_group)); + + cur->error = NULL; + cur->error_length = 0; + cur->block_end = NULL; + + new->offset = offset + inst_size; + new->block_start = NULL; + + exec_node_insert_after(&cur->link, &new->link); + } + + if (cur->error) + ralloc_strcat(&cur->error, error); + else + cur->error = ralloc_strdup(disasm, error); + return; + } +} diff --git a/src/intel/compiler/elk/brw_disasm_info.h b/src/intel/compiler/elk/brw_disasm_info.h new file mode 100644 index 00000000000..937180b7e2e --- /dev/null +++ b/src/intel/compiler/elk/brw_disasm_info.h @@ -0,0 +1,90 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the 
following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _INTEL_ASM_ANNOTATION_H +#define _INTEL_ASM_ANNOTATION_H + +#include "compiler/glsl/list.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct cfg_t; +struct backend_instruction; +struct intel_device_info; + +struct inst_group { + struct exec_node link; + + int offset; + + size_t error_length; + char *error; + + /* Pointers to the basic block in the CFG if the instruction group starts + * or ends a basic block. + */ + struct bblock_t *block_start; + struct bblock_t *block_end; + + /* Annotation for the generated IR. One of the two can be set. */ + const void *ir; + const char *annotation; +}; + +struct disasm_info { + struct exec_list group_list; + + const struct brw_isa_info *isa; + const struct cfg_t *cfg; + + /** Block index in the cfg. 
*/ + int cur_block; + bool use_tail; +}; + +void +dump_assembly(void *assembly, int start_offset, int end_offset, + struct disasm_info *disasm, const unsigned *block_latency); + +struct disasm_info * +disasm_initialize(const struct brw_isa_info *isa, + const struct cfg_t *cfg); + +struct inst_group * +disasm_new_inst_group(struct disasm_info *disasm, unsigned offset); + +void +disasm_annotate(struct disasm_info *disasm, + struct backend_instruction *inst, unsigned offset); + +void +disasm_insert_error(struct disasm_info *disasm, unsigned offset, + unsigned inst_size, const char *error); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* _INTEL_ASM_ANNOTATION_H */ diff --git a/src/intel/compiler/elk/brw_disasm_tool.c b/src/intel/compiler/elk/brw_disasm_tool.c new file mode 100644 index 00000000000..1771b2e369c --- /dev/null +++ b/src/intel/compiler/elk/brw_disasm_tool.c @@ -0,0 +1,242 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include + +#include "compiler/brw_disasm.h" +#include "compiler/brw_isa_info.h" +#include "dev/intel_device_info.h" +#include "util/u_dynarray.h" + +enum opt_input_type { + OPT_INPUT_BINARY, + OPT_INPUT_C_LITERAL, +}; + +static enum opt_input_type input_type = OPT_INPUT_BINARY; + +/* Return size of file in bytes pointed by fp */ +static long +i965_disasm_get_file_size(FILE *fp) +{ + long size; + + fseek(fp, 0L, SEEK_END); + size = ftell(fp); + fseek(fp, 0L, SEEK_SET); + + return size; +} + +/* Read hex file which should be in following format: + * for example : + * { 0x00000000, 0x00000000, 0x00000000, 0x00000000 } + */ +static void * +i965_disasm_read_c_literal_file(FILE *fp, size_t *end) +{ + struct util_dynarray assembly = {}; + uint32_t temp[2]; + + if (fscanf(fp, " { ") == EOF) { + fprintf(stderr, "Couldn't find opening `{`\n"); + return NULL; + } + + if (fscanf(fp, "0x%x , 0x%x", &temp[0], &temp[1]) == 2) { + util_dynarray_append(&assembly, uint32_t, temp[0]); + util_dynarray_append(&assembly, uint32_t, temp[1]); + } else { + fprintf(stderr, "Couldn't read hex values\n"); + return NULL; + } + + while (fscanf(fp, " , 0x%x , 0x%x ", &temp[0], &temp[1]) == 2) { + util_dynarray_append(&assembly, uint32_t, temp[0]); + util_dynarray_append(&assembly, uint32_t, temp[1]); + } + + if (fscanf(fp, "}") == EOF) { + fprintf(stderr, "Couldn't find closing `}`\n"); + return NULL; + } + + *end = assembly.size; + return assembly.data; +} + +static void * +i965_disasm_read_binary(FILE *fp, size_t *end) +{ + size_t size; + void *assembly; + + long sz = i965_disasm_get_file_size(fp); + if (sz < 0) + return NULL; + + *end = (size_t)sz; + if (!*end) + return NULL; + + 
assembly = malloc(*end + 1); + if (assembly == NULL) + return NULL; + + size = fread(assembly, *end, 1, fp); + if (!size) { + free(assembly); + return NULL; + } + return assembly; +} + +static void +print_help(const char *progname, FILE *file) +{ + fprintf(file, + "Usage: %s [OPTION]...\n" + "Disassemble i965 instructions from binary file.\n\n" + " --help display this help and exit\n" + " --input-path=PATH read binary file from binary file PATH\n" + " --type=INPUT_TYPE INPUT_TYPE can be 'bin' (default if omitted),\n" + " 'c_literal'.\n" + " --gen=platform disassemble instructions for given \n" + " platform (3 letter platform name)\n", + progname); +} + +int main(int argc, char *argv[]) +{ + FILE *fp = NULL; + void *assembly = NULL; + char *file_path = NULL; + size_t start = 0, end = 0; + uint16_t pci_id = 0; + int c; + int result = EXIT_FAILURE; + + bool help = false; + const struct option i965_disasm_opts[] = { + { "help", no_argument, (int *) &help, true }, + { "input-path", required_argument, NULL, 'i' }, + { "type", required_argument, NULL, 't' }, + { "gen", required_argument, NULL, 'g'}, + { NULL, 0, NULL, 0 } + }; + + while ((c = getopt_long(argc, argv, ":i:t:g:h", i965_disasm_opts, NULL)) != -1) { + switch (c) { + case 'g': { + const int id = intel_device_name_to_pci_device_id(optarg); + if (id < 0) { + fprintf(stderr, "can't parse gen: '%s', expected 3 letter " + "platform name\n", optarg); + goto end; + } else { + pci_id = id; + } + break; + } + case 'i': + file_path = strdup(optarg); + fp = fopen(file_path, "r"); + if (!fp) { + fprintf(stderr, "Unable to read input file : %s\n", + file_path); + goto end; + } + break; + case 't': + if (strcmp(optarg, "c_literal") == 0) { + input_type = OPT_INPUT_C_LITERAL; + } else if (strcmp(optarg, "bin") == 0) { + input_type = OPT_INPUT_BINARY; + } else { + fprintf(stderr, "invalid value for --type: %s\n", optarg); + goto end; + } + break; + case 'h': + help = true; + print_help(argv[0], stderr); + goto end; + case 0: + 
break; + case ':': + fprintf(stderr, "%s: option `-%c' requires an argument\n", + argv[0], optopt); + goto end; + case '?': + default: + fprintf(stderr, "%s: option `-%c' is invalid: ignored\n", + argv[0], optopt); + goto end; + } + } + + if (help || !file_path || !pci_id) { + print_help(argv[0], stderr); + exit(0); + } + + struct intel_device_info devinfo; + if (!intel_get_device_info_from_pci_id(pci_id, &devinfo)) { + fprintf(stderr, "can't find device information: pci_id=0x%x\n", pci_id); + exit(EXIT_FAILURE); + } + + struct brw_isa_info isa; + brw_init_isa_info(&isa, &devinfo); + + if (input_type == OPT_INPUT_BINARY) + assembly = i965_disasm_read_binary(fp, &end); + else if (input_type == OPT_INPUT_C_LITERAL) + assembly = i965_disasm_read_c_literal_file(fp, &end); + + if (!assembly) { + if (end) + fprintf(stderr, "Unable to allocate buffer to read input file\n"); + else + fprintf(stderr, "Failed to read input file\n"); + + goto end; + } + + /* Disassemble i965 instructions from buffer assembly */ + brw_disassemble_with_labels(&isa, assembly, start, end, stdout); + + result = EXIT_SUCCESS; + +end: + if (fp) + fclose(fp); + + free(file_path); + free(assembly); + + exit(result); +} diff --git a/src/intel/compiler/elk/brw_eu.c b/src/intel/compiler/elk/brw_eu.c new file mode 100644 index 00000000000..d6b94f3441d --- /dev/null +++ b/src/intel/compiler/elk/brw_eu.c @@ -0,0 +1,856 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include +#include + +#include "brw_disasm.h" +#include "brw_eu_defines.h" +#include "brw_eu.h" +#include "brw_shader.h" +#include "intel_gfx_ver_enum.h" +#include "dev/intel_debug.h" + +#include "util/u_debug.h" +#include "util/ralloc.h" + +/* Returns a conditional modifier that negates the condition. 
*/ +enum brw_conditional_mod +brw_negate_cmod(enum brw_conditional_mod cmod) +{ + switch (cmod) { + case BRW_CONDITIONAL_Z: + return BRW_CONDITIONAL_NZ; + case BRW_CONDITIONAL_NZ: + return BRW_CONDITIONAL_Z; + case BRW_CONDITIONAL_G: + return BRW_CONDITIONAL_LE; + case BRW_CONDITIONAL_GE: + return BRW_CONDITIONAL_L; + case BRW_CONDITIONAL_L: + return BRW_CONDITIONAL_GE; + case BRW_CONDITIONAL_LE: + return BRW_CONDITIONAL_G; + default: + unreachable("Can't negate this cmod"); + } +} + +/* Returns the corresponding conditional mod for swapping src0 and + * src1 in e.g. CMP. + */ +enum brw_conditional_mod +brw_swap_cmod(enum brw_conditional_mod cmod) +{ + switch (cmod) { + case BRW_CONDITIONAL_Z: + case BRW_CONDITIONAL_NZ: + return cmod; + case BRW_CONDITIONAL_G: + return BRW_CONDITIONAL_L; + case BRW_CONDITIONAL_GE: + return BRW_CONDITIONAL_LE; + case BRW_CONDITIONAL_L: + return BRW_CONDITIONAL_G; + case BRW_CONDITIONAL_LE: + return BRW_CONDITIONAL_GE; + default: + return BRW_CONDITIONAL_NONE; + } +} + +/** + * Get the least significant bit offset of the i+1-th component of immediate + * type \p type. For \p i equal to the two's complement of j, return the + * offset of the j-th component starting from the end of the vector. For + * scalar register types return zero. + */ +static unsigned +imm_shift(enum brw_reg_type type, unsigned i) +{ + assert(type != BRW_REGISTER_TYPE_UV && type != BRW_REGISTER_TYPE_V && + "Not implemented."); + + if (type == BRW_REGISTER_TYPE_VF) + return 8 * (i & 3); + else + return 0; +} + +/** + * Swizzle an arbitrary immediate \p x of the given type according to the + * permutation specified as \p swz. 
+ */ +uint32_t +brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz) +{ + if (imm_shift(type, 1)) { + const unsigned n = 32 / imm_shift(type, 1); + uint32_t y = 0; + + for (unsigned i = 0; i < n; i++) { + /* Shift the specified component all the way to the right and left to + * discard any undesired L/MSBs, then shift it right into component i. + */ + y |= x >> imm_shift(type, (i & ~3) + BRW_GET_SWZ(swz, i & 3)) + << imm_shift(type, ~0u) + >> imm_shift(type, ~0u - i); + } + + return y; + } else { + return x; + } +} + +unsigned +brw_get_default_exec_size(struct brw_codegen *p) +{ + return p->current->exec_size; +} + +unsigned +brw_get_default_group(struct brw_codegen *p) +{ + return p->current->group; +} + +unsigned +brw_get_default_access_mode(struct brw_codegen *p) +{ + return p->current->access_mode; +} + +struct tgl_swsb +brw_get_default_swsb(struct brw_codegen *p) +{ + return p->current->swsb; +} + +void +brw_set_default_exec_size(struct brw_codegen *p, unsigned value) +{ + p->current->exec_size = value; +} + +void brw_set_default_predicate_control(struct brw_codegen *p, enum brw_predicate pc) +{ + p->current->predicate = pc; +} + +void brw_set_default_predicate_inverse(struct brw_codegen *p, bool predicate_inverse) +{ + p->current->pred_inv = predicate_inverse; +} + +void brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg) +{ + assert(subreg < 2); + p->current->flag_subreg = reg * 2 + subreg; +} + +void brw_set_default_access_mode( struct brw_codegen *p, unsigned access_mode ) +{ + p->current->access_mode = access_mode; +} + +void +brw_set_default_compression_control(struct brw_codegen *p, + enum brw_compression compression_control) +{ + switch (compression_control) { + case BRW_COMPRESSION_NONE: + /* This is the "use the first set of bits of dmask/vmask/arf + * according to execsize" option. + */ + p->current->group = 0; + break; + case BRW_COMPRESSION_2NDHALF: + /* For SIMD8, this is "use the second set of 8 bits." 
*/ + p->current->group = 8; + break; + case BRW_COMPRESSION_COMPRESSED: + /* For SIMD16 instruction compression, use the first set of 16 bits + * since we don't do SIMD32 dispatch. + */ + p->current->group = 0; + break; + default: + unreachable("not reached"); + } + + if (p->devinfo->ver <= 6) { + p->current->compressed = + (compression_control == BRW_COMPRESSION_COMPRESSED); + } +} + +/** + * Enable or disable instruction compression on the given instruction leaving + * the currently selected channel enable group untouched. + */ +void +brw_inst_set_compression(const struct intel_device_info *devinfo, + brw_inst *inst, bool on) +{ + if (devinfo->ver >= 6) { + /* No-op, the EU will figure out for us whether the instruction needs to + * be compressed. + */ + } else { + /* The channel group and compression controls are non-orthogonal, there + * are two possible representations for uncompressed instructions and we + * may need to preserve the current one to avoid changing the selected + * channel group inadvertently. + */ + if (on) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_COMPRESSED); + else if (brw_inst_qtr_control(devinfo, inst) + == BRW_COMPRESSION_COMPRESSED) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE); + } +} + +void +brw_set_default_compression(struct brw_codegen *p, bool on) +{ + p->current->compressed = on; +} + +/** + * Apply the range of channel enable signals given by + * [group, group + exec_size) to the instruction passed as argument. 
+ */ +void +brw_inst_set_group(const struct intel_device_info *devinfo, + brw_inst *inst, unsigned group) +{ + if (devinfo->ver >= 20) { + assert(group % 8 == 0 && group < 32); + brw_inst_set_qtr_control(devinfo, inst, group / 8); + + } else if (devinfo->ver >= 7) { + assert(group % 4 == 0 && group < 32); + brw_inst_set_qtr_control(devinfo, inst, group / 8); + brw_inst_set_nib_control(devinfo, inst, (group / 4) % 2); + + } else if (devinfo->ver == 6) { + assert(group % 8 == 0 && group < 32); + brw_inst_set_qtr_control(devinfo, inst, group / 8); + + } else { + assert(group % 8 == 0 && group < 16); + /* The channel group and compression controls are non-orthogonal, there + * are two possible representations for group zero and we may need to + * preserve the current one to avoid changing the selected compression + * enable inadvertently. + */ + if (group == 8) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_2NDHALF); + else if (brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_2NDHALF) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE); + } +} + +void +brw_set_default_group(struct brw_codegen *p, unsigned group) +{ + p->current->group = group; +} + +void brw_set_default_mask_control( struct brw_codegen *p, unsigned value ) +{ + p->current->mask_control = value; +} + +void brw_set_default_saturate( struct brw_codegen *p, bool enable ) +{ + p->current->saturate = enable; +} + +void brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value) +{ + p->current->acc_wr_control = value; +} + +void brw_set_default_swsb(struct brw_codegen *p, struct tgl_swsb value) +{ + p->current->swsb = value; +} + +void brw_push_insn_state( struct brw_codegen *p ) +{ + assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]); + *(p->current + 1) = *p->current; + p->current++; +} + +void brw_pop_insn_state( struct brw_codegen *p ) +{ + assert(p->current != p->stack); + p->current--; +} + + 
+/*********************************************************************** + */ +void +brw_init_codegen(const struct brw_isa_info *isa, + struct brw_codegen *p, void *mem_ctx) +{ + memset(p, 0, sizeof(*p)); + + p->isa = isa; + p->devinfo = isa->devinfo; + p->automatic_exec_sizes = true; + /* + * Set the initial instruction store array size to 1024, if found that + * isn't enough, then it will double the store size at brw_next_insn() + * until out of memory. + */ + p->store_size = 1024; + p->store = rzalloc_array(mem_ctx, brw_inst, p->store_size); + p->nr_insn = 0; + p->current = p->stack; + memset(p->current, 0, sizeof(p->current[0])); + + p->mem_ctx = mem_ctx; + + /* Some defaults? + */ + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */ + brw_set_default_saturate(p, 0); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + + /* Set up control flow stack */ + p->if_stack_depth = 0; + p->if_stack_array_size = 16; + p->if_stack = rzalloc_array(mem_ctx, int, p->if_stack_array_size); + + p->loop_stack_depth = 0; + p->loop_stack_array_size = 16; + p->loop_stack = rzalloc_array(mem_ctx, int, p->loop_stack_array_size); + p->if_depth_in_loop = rzalloc_array(mem_ctx, int, p->loop_stack_array_size); +} + + +const unsigned *brw_get_program( struct brw_codegen *p, + unsigned *sz ) +{ + *sz = p->next_insn_offset; + return (const unsigned *)p->store; +} + +const struct brw_shader_reloc * +brw_get_shader_relocs(struct brw_codegen *p, unsigned *num_relocs) +{ + *num_relocs = p->num_relocs; + return p->relocs; +} + +DEBUG_GET_ONCE_OPTION(shader_bin_dump_path, "INTEL_SHADER_BIN_DUMP_PATH", NULL); + +bool brw_should_dump_shader_bin(void) +{ + return debug_get_option_shader_bin_dump_path() != NULL; +} + +void brw_dump_shader_bin(void *assembly, int start_offset, int end_offset, + const char *identifier) +{ + char *name = ralloc_asprintf(NULL, "%s/%s.bin", + debug_get_option_shader_bin_dump_path(), + 
identifier); + + int fd = open(name, O_CREAT | O_WRONLY, 0777); + ralloc_free(name); + + if (fd < 0) + return; + + struct stat sb; + if (fstat(fd, &sb) != 0 || (!S_ISREG(sb.st_mode))) { + close(fd); + return; + } + + size_t to_write = end_offset - start_offset; + void *write_ptr = assembly + start_offset; + + while (to_write) { + ssize_t ret = write(fd, write_ptr, to_write); + + if (ret <= 0) { + close(fd); + return; + } + + to_write -= ret; + write_ptr += ret; + } + + close(fd); +} + +bool brw_try_override_assembly(struct brw_codegen *p, int start_offset, + const char *identifier) +{ + const char *read_path = getenv("INTEL_SHADER_ASM_READ_PATH"); + if (!read_path) { + return false; + } + + char *name = ralloc_asprintf(NULL, "%s/%s.bin", read_path, identifier); + + int fd = open(name, O_RDONLY); + ralloc_free(name); + + if (fd == -1) { + return false; + } + + struct stat sb; + if (fstat(fd, &sb) != 0 || (!S_ISREG(sb.st_mode))) { + close(fd); + return false; + } + + p->nr_insn -= (p->next_insn_offset - start_offset) / sizeof(brw_inst); + p->nr_insn += sb.st_size / sizeof(brw_inst); + + p->next_insn_offset = start_offset + sb.st_size; + p->store_size = (start_offset + sb.st_size) / sizeof(brw_inst); + p->store = (brw_inst *)reralloc_size(p->mem_ctx, p->store, p->next_insn_offset); + assert(p->store); + + ssize_t ret = read(fd, (char *)p->store + start_offset, sb.st_size); + close(fd); + if (ret != sb.st_size) { + return false; + } + + ASSERTED bool valid = + brw_validate_instructions(p->isa, p->store, + start_offset, p->next_insn_offset, + NULL); + assert(valid); + + return true; +} + +const struct brw_label * +brw_find_label(const struct brw_label *root, int offset) +{ + const struct brw_label *curr = root; + + if (curr != NULL) + { + do { + if (curr->offset == offset) + return curr; + + curr = curr->next; + } while (curr != NULL); + } + + return curr; +} + +void +brw_create_label(struct brw_label **labels, int offset, void *mem_ctx) +{ + if (*labels != NULL) { + 
struct brw_label *curr = *labels; + struct brw_label *prev; + + do { + prev = curr; + + if (curr->offset == offset) + return; + + curr = curr->next; + } while (curr != NULL); + + curr = ralloc(mem_ctx, struct brw_label); + curr->offset = offset; + curr->number = prev->number + 1; + curr->next = NULL; + prev->next = curr; + } else { + struct brw_label *root = ralloc(mem_ctx, struct brw_label); + root->number = 0; + root->offset = offset; + root->next = NULL; + *labels = root; + } +} + +const struct brw_label * +brw_label_assembly(const struct brw_isa_info *isa, + const void *assembly, int start, int end, void *mem_ctx) +{ + const struct intel_device_info *const devinfo = isa->devinfo; + + struct brw_label *root_label = NULL; + + int to_bytes_scale = sizeof(brw_inst) / brw_jump_scale(devinfo); + + for (int offset = start; offset < end;) { + const brw_inst *inst = (const brw_inst *) ((const char *) assembly + offset); + brw_inst uncompacted; + + bool is_compact = brw_inst_cmpt_control(devinfo, inst); + + if (is_compact) { + brw_compact_inst *compacted = (brw_compact_inst *)inst; + brw_uncompact_instruction(isa, &uncompacted, compacted); + inst = &uncompacted; + } + + if (brw_has_uip(devinfo, brw_inst_opcode(isa, inst))) { + /* Instructions that have UIP also have JIP. 
*/ + brw_create_label(&root_label, + offset + brw_inst_uip(devinfo, inst) * to_bytes_scale, mem_ctx); + brw_create_label(&root_label, + offset + brw_inst_jip(devinfo, inst) * to_bytes_scale, mem_ctx); + } else if (brw_has_jip(devinfo, brw_inst_opcode(isa, inst))) { + int jip; + if (devinfo->ver >= 7) { + jip = brw_inst_jip(devinfo, inst); + } else { + jip = brw_inst_gfx6_jump_count(devinfo, inst); + } + + brw_create_label(&root_label, offset + jip * to_bytes_scale, mem_ctx); + } + + if (is_compact) { + offset += sizeof(brw_compact_inst); + } else { + offset += sizeof(brw_inst); + } + } + + return root_label; +} + +void +brw_disassemble_with_labels(const struct brw_isa_info *isa, + const void *assembly, int start, int end, FILE *out) +{ + void *mem_ctx = ralloc_context(NULL); + const struct brw_label *root_label = + brw_label_assembly(isa, assembly, start, end, mem_ctx); + + brw_disassemble(isa, assembly, start, end, root_label, out); + + ralloc_free(mem_ctx); +} + +void +brw_disassemble(const struct brw_isa_info *isa, + const void *assembly, int start, int end, + const struct brw_label *root_label, FILE *out) +{ + const struct intel_device_info *devinfo = isa->devinfo; + + bool dump_hex = INTEL_DEBUG(DEBUG_HEX); + + for (int offset = start; offset < end;) { + const brw_inst *insn = (const brw_inst *)((char *)assembly + offset); + brw_inst uncompacted; + + if (root_label != NULL) { + const struct brw_label *label = brw_find_label(root_label, offset); + if (label != NULL) { + fprintf(out, "\nLABEL%d:\n", label->number); + } + } + + bool compacted = brw_inst_cmpt_control(devinfo, insn); + if (0) + fprintf(out, "0x%08x: ", offset); + + if (compacted) { + brw_compact_inst *compacted = (brw_compact_inst *)insn; + if (dump_hex) { + unsigned char * insn_ptr = ((unsigned char *)&insn[0]); + const unsigned int blank_spaces = 24; + for (int i = 0 ; i < 8; i = i + 4) { + fprintf(out, "%02x %02x %02x %02x ", + insn_ptr[i], + insn_ptr[i + 1], + insn_ptr[i + 2], + insn_ptr[i + 
3]); + } + /* Make compacted instructions hex value output vertically aligned + * with uncompacted instructions hex value + */ + fprintf(out, "%*c", blank_spaces, ' '); + } + + brw_uncompact_instruction(isa, &uncompacted, compacted); + insn = &uncompacted; + } else { + if (dump_hex) { + unsigned char * insn_ptr = ((unsigned char *)&insn[0]); + for (int i = 0 ; i < 16; i = i + 4) { + fprintf(out, "%02x %02x %02x %02x ", + insn_ptr[i], + insn_ptr[i + 1], + insn_ptr[i + 2], + insn_ptr[i + 3]); + } + } + } + + brw_disassemble_inst(out, isa, insn, compacted, offset, root_label); + + if (compacted) { + offset += sizeof(brw_compact_inst); + } else { + offset += sizeof(brw_inst); + } + } +} + +static const struct opcode_desc opcode_descs[] = { + /* IR, HW, name, nsrc, ndst, gfx_vers */ + { BRW_OPCODE_ILLEGAL, 0, "illegal", 0, 0, GFX_ALL }, + { BRW_OPCODE_SYNC, 1, "sync", 1, 0, GFX_GE(GFX12) }, + { BRW_OPCODE_MOV, 1, "mov", 1, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_MOV, 97, "mov", 1, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_SEL, 2, "sel", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_SEL, 98, "sel", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_MOVI, 3, "movi", 2, 1, GFX_GE(GFX45) & GFX_LT(GFX12) }, + { BRW_OPCODE_MOVI, 99, "movi", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_NOT, 4, "not", 1, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_NOT, 100, "not", 1, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_AND, 5, "and", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_AND, 101, "and", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_OR, 6, "or", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_OR, 102, "or", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_XOR, 7, "xor", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_XOR, 103, "xor", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_SHR, 8, "shr", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_SHR, 104, "shr", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_SHL, 9, "shl", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_SHL, 105, "shl", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_DIM, 10, "dim", 1, 1, GFX75 }, + { BRW_OPCODE_SMOV, 10, "smov", 0, 0, GFX_GE(GFX8) & GFX_LT(GFX12) 
}, + { BRW_OPCODE_SMOV, 106, "smov", 0, 0, GFX_GE(GFX12) }, + { BRW_OPCODE_ASR, 12, "asr", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_ASR, 108, "asr", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_ROR, 14, "ror", 2, 1, GFX11 }, + { BRW_OPCODE_ROR, 110, "ror", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_ROL, 15, "rol", 2, 1, GFX11 }, + { BRW_OPCODE_ROL, 111, "rol", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_CMP, 16, "cmp", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_CMP, 112, "cmp", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_CMPN, 17, "cmpn", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_CMPN, 113, "cmpn", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_CSEL, 18, "csel", 3, 1, GFX_GE(GFX8) & GFX_LT(GFX12) }, + { BRW_OPCODE_CSEL, 114, "csel", 3, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_F32TO16, 19, "f32to16", 1, 1, GFX7 | GFX75 }, + { BRW_OPCODE_F16TO32, 20, "f16to32", 1, 1, GFX7 | GFX75 }, + { BRW_OPCODE_BFREV, 23, "bfrev", 1, 1, GFX_GE(GFX7) & GFX_LT(GFX12) }, + { BRW_OPCODE_BFREV, 119, "bfrev", 1, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_BFE, 24, "bfe", 3, 1, GFX_GE(GFX7) & GFX_LT(GFX12) }, + { BRW_OPCODE_BFE, 120, "bfe", 3, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_BFI1, 25, "bfi1", 2, 1, GFX_GE(GFX7) & GFX_LT(GFX12) }, + { BRW_OPCODE_BFI1, 121, "bfi1", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_BFI2, 26, "bfi2", 3, 1, GFX_GE(GFX7) & GFX_LT(GFX12) }, + { BRW_OPCODE_BFI2, 122, "bfi2", 3, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_JMPI, 32, "jmpi", 0, 0, GFX_ALL }, + { BRW_OPCODE_BRD, 33, "brd", 0, 0, GFX_GE(GFX7) }, + { BRW_OPCODE_IF, 34, "if", 0, 0, GFX_ALL }, + { BRW_OPCODE_IFF, 35, "iff", 0, 0, GFX_LE(GFX5) }, + { BRW_OPCODE_BRC, 35, "brc", 0, 0, GFX_GE(GFX7) }, + { BRW_OPCODE_ELSE, 36, "else", 0, 0, GFX_ALL }, + { BRW_OPCODE_ENDIF, 37, "endif", 0, 0, GFX_ALL }, + { BRW_OPCODE_DO, 38, "do", 0, 0, GFX_LE(GFX5) }, + { BRW_OPCODE_CASE, 38, "case", 0, 0, GFX6 }, + { BRW_OPCODE_WHILE, 39, "while", 0, 0, GFX_ALL }, + { BRW_OPCODE_BREAK, 40, "break", 0, 0, GFX_ALL }, + { BRW_OPCODE_CONTINUE, 41, "cont", 0, 0, GFX_ALL }, + { BRW_OPCODE_HALT, 42, 
"halt", 0, 0, GFX_ALL }, + { BRW_OPCODE_CALLA, 43, "calla", 0, 0, GFX_GE(GFX75) }, + { BRW_OPCODE_MSAVE, 44, "msave", 0, 0, GFX_LE(GFX5) }, + { BRW_OPCODE_CALL, 44, "call", 0, 0, GFX_GE(GFX6) }, + { BRW_OPCODE_MREST, 45, "mrest", 0, 0, GFX_LE(GFX5) }, + { BRW_OPCODE_RET, 45, "ret", 0, 0, GFX_GE(GFX6) }, + { BRW_OPCODE_PUSH, 46, "push", 0, 0, GFX_LE(GFX5) }, + { BRW_OPCODE_FORK, 46, "fork", 0, 0, GFX6 }, + { BRW_OPCODE_GOTO, 46, "goto", 0, 0, GFX_GE(GFX8) }, + { BRW_OPCODE_POP, 47, "pop", 2, 0, GFX_LE(GFX5) }, + { BRW_OPCODE_WAIT, 48, "wait", 0, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_SEND, 49, "send", 1, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_SENDC, 50, "sendc", 1, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_SEND, 49, "send", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_SENDC, 50, "sendc", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_SENDS, 51, "sends", 2, 1, GFX_GE(GFX9) & GFX_LT(GFX12) }, + { BRW_OPCODE_SENDSC, 52, "sendsc", 2, 1, GFX_GE(GFX9) & GFX_LT(GFX12) }, + { BRW_OPCODE_MATH, 56, "math", 2, 1, GFX_GE(GFX6) }, + { BRW_OPCODE_ADD, 64, "add", 2, 1, GFX_ALL }, + { BRW_OPCODE_MUL, 65, "mul", 2, 1, GFX_ALL }, + { BRW_OPCODE_AVG, 66, "avg", 2, 1, GFX_ALL }, + { BRW_OPCODE_FRC, 67, "frc", 1, 1, GFX_ALL }, + { BRW_OPCODE_RNDU, 68, "rndu", 1, 1, GFX_ALL }, + { BRW_OPCODE_RNDD, 69, "rndd", 1, 1, GFX_ALL }, + { BRW_OPCODE_RNDE, 70, "rnde", 1, 1, GFX_ALL }, + { BRW_OPCODE_RNDZ, 71, "rndz", 1, 1, GFX_ALL }, + { BRW_OPCODE_MAC, 72, "mac", 2, 1, GFX_ALL }, + { BRW_OPCODE_MACH, 73, "mach", 2, 1, GFX_ALL }, + { BRW_OPCODE_LZD, 74, "lzd", 1, 1, GFX_ALL }, + { BRW_OPCODE_FBH, 75, "fbh", 1, 1, GFX_GE(GFX7) }, + { BRW_OPCODE_FBL, 76, "fbl", 1, 1, GFX_GE(GFX7) }, + { BRW_OPCODE_CBIT, 77, "cbit", 1, 1, GFX_GE(GFX7) }, + { BRW_OPCODE_ADDC, 78, "addc", 2, 1, GFX_GE(GFX7) }, + { BRW_OPCODE_SUBB, 79, "subb", 2, 1, GFX_GE(GFX7) }, + { BRW_OPCODE_SAD2, 80, "sad2", 2, 1, GFX_ALL }, + { BRW_OPCODE_SADA2, 81, "sada2", 2, 1, GFX_ALL }, + { BRW_OPCODE_ADD3, 82, "add3", 3, 1, GFX_GE(GFX125) }, + { BRW_OPCODE_DP4, 84, 
"dp4", 2, 1, GFX_LT(GFX11) }, + { BRW_OPCODE_DPH, 85, "dph", 2, 1, GFX_LT(GFX11) }, + { BRW_OPCODE_DP3, 86, "dp3", 2, 1, GFX_LT(GFX11) }, + { BRW_OPCODE_DP2, 87, "dp2", 2, 1, GFX_LT(GFX11) }, + { BRW_OPCODE_DP4A, 88, "dp4a", 3, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_LINE, 89, "line", 2, 1, GFX_LE(GFX10) }, + { BRW_OPCODE_DPAS, 89, "dpas", 3, 1, GFX_GE(GFX125) }, + { BRW_OPCODE_PLN, 90, "pln", 2, 1, GFX_GE(GFX45) & GFX_LE(GFX10) }, + { BRW_OPCODE_MAD, 91, "mad", 3, 1, GFX_GE(GFX6) }, + { BRW_OPCODE_LRP, 92, "lrp", 3, 1, GFX_GE(GFX6) & GFX_LE(GFX10) }, + { BRW_OPCODE_MADM, 93, "madm", 3, 1, GFX_GE(GFX8) }, + { BRW_OPCODE_NENOP, 125, "nenop", 0, 0, GFX45 }, + { BRW_OPCODE_NOP, 126, "nop", 0, 0, GFX_LT(GFX12) }, + { BRW_OPCODE_NOP, 96, "nop", 0, 0, GFX_GE(GFX12) } +}; + +void +brw_init_isa_info(struct brw_isa_info *isa, + const struct intel_device_info *devinfo) +{ + isa->devinfo = devinfo; + + enum gfx_ver ver = gfx_ver_from_devinfo(devinfo); + + memset(isa->ir_to_descs, 0, sizeof(isa->ir_to_descs)); + memset(isa->hw_to_descs, 0, sizeof(isa->hw_to_descs)); + + for (unsigned i = 0; i < ARRAY_SIZE(opcode_descs); i++) { + if (opcode_descs[i].gfx_vers & ver) { + const unsigned e = opcode_descs[i].ir; + const unsigned h = opcode_descs[i].hw; + assert(e < ARRAY_SIZE(isa->ir_to_descs) && !isa->ir_to_descs[e]); + assert(h < ARRAY_SIZE(isa->hw_to_descs) && !isa->hw_to_descs[h]); + isa->ir_to_descs[e] = &opcode_descs[i]; + isa->hw_to_descs[h] = &opcode_descs[i]; + } + } +} + +/** + * Return the matching opcode_desc for the specified IR opcode and hardware + * generation, or NULL if the opcode is not supported by the device. + */ +const struct opcode_desc * +brw_opcode_desc(const struct brw_isa_info *isa, enum opcode op) +{ + return op < ARRAY_SIZE(isa->ir_to_descs) ? isa->ir_to_descs[op] : NULL; +} + +/** + * Return the matching opcode_desc for the specified HW opcode and hardware + * generation, or NULL if the opcode is not supported by the device. 
+ */ +const struct opcode_desc * +brw_opcode_desc_from_hw(const struct brw_isa_info *isa, unsigned hw) +{ + return hw < ARRAY_SIZE(isa->hw_to_descs) ? isa->hw_to_descs[hw] : NULL; +} + +unsigned +brw_num_sources_from_inst(const struct brw_isa_info *isa, + const brw_inst *inst) +{ + const struct intel_device_info *devinfo = isa->devinfo; + const struct opcode_desc *desc = + brw_opcode_desc(isa, brw_inst_opcode(isa, inst)); + unsigned math_function; + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_MATH) { + math_function = brw_inst_math_function(devinfo, inst); + } else if (devinfo->ver < 6 && + brw_inst_opcode(isa, inst) == BRW_OPCODE_SEND) { + if (brw_inst_sfid(devinfo, inst) == BRW_SFID_MATH) { + /* src1 must be a descriptor (including the information to determine + * that the SEND is doing an extended math operation), but src0 can + * actually be null since it serves as the source of the implicit GRF + * to MRF move. + * + * If we stop using that functionality, we'll have to revisit this. + */ + return 2; + } else { + /* Send instructions are allowed to have null sources since they use + * the base_mrf field to specify which message register source. 
+ */ + return 0; + } + } else { + assert(desc->nsrc < 4); + return desc->nsrc; + } + + switch (math_function) { + case BRW_MATH_FUNCTION_INV: + case BRW_MATH_FUNCTION_LOG: + case BRW_MATH_FUNCTION_EXP: + case BRW_MATH_FUNCTION_SQRT: + case BRW_MATH_FUNCTION_RSQ: + case BRW_MATH_FUNCTION_SIN: + case BRW_MATH_FUNCTION_COS: + case BRW_MATH_FUNCTION_SINCOS: + case GFX8_MATH_FUNCTION_INVM: + case GFX8_MATH_FUNCTION_RSQRTM: + return 1; + case BRW_MATH_FUNCTION_FDIV: + case BRW_MATH_FUNCTION_POW: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT: + case BRW_MATH_FUNCTION_INT_DIV_REMAINDER: + return 2; + default: + unreachable("not reached"); + } +} diff --git a/src/intel/compiler/elk/brw_eu.h b/src/intel/compiler/elk/brw_eu.h new file mode 100644 index 00000000000..e62e6e1c9e9 --- /dev/null +++ b/src/intel/compiler/elk/brw_eu.h @@ -0,0 +1,2089 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + + +#ifndef BRW_EU_H +#define BRW_EU_H + +#include +#include +#include "brw_inst.h" +#include "brw_compiler.h" +#include "brw_eu_defines.h" +#include "brw_isa_info.h" +#include "brw_reg.h" + +#include "util/bitset.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct disasm_info; + +#define BRW_EU_MAX_INSN_STACK 5 + +struct brw_insn_state { + /* One of BRW_EXECUTE_* */ + unsigned exec_size:3; + + /* Group in units of channels */ + unsigned group:5; + + /* Compression control on gfx4-5 */ + bool compressed:1; + + /* One of BRW_MASK_* */ + unsigned mask_control:1; + + /* Scheduling info for Gfx12+ */ + struct tgl_swsb swsb; + + bool saturate:1; + + /* One of BRW_ALIGN_* */ + unsigned access_mode:1; + + /* One of BRW_PREDICATE_* */ + enum brw_predicate predicate:4; + + bool pred_inv:1; + + /* Flag subreg. Bottom bit is subreg, top bit is reg */ + unsigned flag_subreg:2; + + bool acc_wr_control:1; +}; + + +/* A helper for accessing the last instruction emitted. This makes it easy + * to set various bits on an instruction without having to create temporary + * variable and assign the emitted instruction to those. 
+ */ +#define brw_last_inst (&p->store[p->nr_insn - 1]) + +struct brw_codegen { + brw_inst *store; + int store_size; + unsigned nr_insn; + unsigned int next_insn_offset; + + void *mem_ctx; + + /* Allow clients to push/pop instruction state: + */ + struct brw_insn_state stack[BRW_EU_MAX_INSN_STACK]; + struct brw_insn_state *current; + + /** Whether or not the user wants automatic exec sizes + * + * If true, codegen will try to automatically infer the exec size of an + * instruction from the width of the destination register. If false, it + * will take whatever is set by brw_set_default_exec_size verbatim. + * + * This is set to true by default in brw_init_codegen. + */ + bool automatic_exec_sizes; + + bool single_program_flow; + const struct brw_isa_info *isa; + const struct intel_device_info *devinfo; + + /* Control flow stacks: + * - if_stack contains IF and ELSE instructions which must be patched + * (and popped) once the matching ENDIF instruction is encountered. + * + * Just store the instruction pointer(an index). + */ + int *if_stack; + int if_stack_depth; + int if_stack_array_size; + + /** + * loop_stack contains the instruction pointers of the starts of loops which + * must be patched (and popped) once the matching WHILE instruction is + * encountered. + */ + int *loop_stack; + /** + * pre-gfx6, the BREAK and CONT instructions had to tell how many IF/ENDIF + * blocks they were popping out of, to fix up the mask stack. This tracks + * the IF/ENDIF nesting in each current nested loop level. 
+ */ + int *if_depth_in_loop; + int loop_stack_depth; + int loop_stack_array_size; + + struct brw_shader_reloc *relocs; + int num_relocs; + int reloc_array_size; +}; + +struct brw_label { + int offset; + int number; + struct brw_label *next; +}; + +void brw_pop_insn_state( struct brw_codegen *p ); +void brw_push_insn_state( struct brw_codegen *p ); +unsigned brw_get_default_exec_size(struct brw_codegen *p); +unsigned brw_get_default_group(struct brw_codegen *p); +unsigned brw_get_default_access_mode(struct brw_codegen *p); +struct tgl_swsb brw_get_default_swsb(struct brw_codegen *p); +void brw_set_default_exec_size(struct brw_codegen *p, unsigned value); +void brw_set_default_mask_control( struct brw_codegen *p, unsigned value ); +void brw_set_default_saturate( struct brw_codegen *p, bool enable ); +void brw_set_default_access_mode( struct brw_codegen *p, unsigned access_mode ); +void brw_inst_set_compression(const struct intel_device_info *devinfo, + brw_inst *inst, bool on); +void brw_set_default_compression(struct brw_codegen *p, bool on); +void brw_inst_set_group(const struct intel_device_info *devinfo, + brw_inst *inst, unsigned group); +void brw_set_default_group(struct brw_codegen *p, unsigned group); +void brw_set_default_compression_control(struct brw_codegen *p, enum brw_compression c); +void brw_set_default_predicate_control(struct brw_codegen *p, enum brw_predicate pc); +void brw_set_default_predicate_inverse(struct brw_codegen *p, bool predicate_inverse); +void brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg); +void brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value); +void brw_set_default_swsb(struct brw_codegen *p, struct tgl_swsb value); + +void brw_init_codegen(const struct brw_isa_info *isa, + struct brw_codegen *p, void *mem_ctx); +bool brw_has_jip(const struct intel_device_info *devinfo, enum opcode opcode); +bool brw_has_uip(const struct intel_device_info *devinfo, enum opcode opcode); +const struct 
brw_shader_reloc *brw_get_shader_relocs(struct brw_codegen *p, + unsigned *num_relocs); +const unsigned *brw_get_program( struct brw_codegen *p, unsigned *sz ); + +bool brw_should_dump_shader_bin(void); +void brw_dump_shader_bin(void *assembly, int start_offset, int end_offset, + const char *identifier); + +bool brw_try_override_assembly(struct brw_codegen *p, int start_offset, + const char *identifier); + +void brw_realign(struct brw_codegen *p, unsigned alignment); +int brw_append_data(struct brw_codegen *p, void *data, + unsigned size, unsigned alignment); +brw_inst *brw_next_insn(struct brw_codegen *p, unsigned opcode); +void brw_add_reloc(struct brw_codegen *p, uint32_t id, + enum brw_shader_reloc_type type, + uint32_t offset, uint32_t delta); +void brw_set_dest(struct brw_codegen *p, brw_inst *insn, struct brw_reg dest); +void brw_set_src0(struct brw_codegen *p, brw_inst *insn, struct brw_reg reg); + +void gfx6_resolve_implied_move(struct brw_codegen *p, + struct brw_reg *src, + unsigned msg_reg_nr); + +/* Helpers for regular instructions: + */ +#define ALU1(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0); + +#define ALU2(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1); + +#define ALU3(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1, \ + struct brw_reg src2); + +ALU1(MOV) +ALU2(SEL) +ALU1(NOT) +ALU2(AND) +ALU2(OR) +ALU2(XOR) +ALU2(SHR) +ALU2(SHL) +ALU1(DIM) +ALU2(ASR) +ALU2(ROL) +ALU2(ROR) +ALU3(CSEL) +ALU1(F32TO16) +ALU1(F16TO32) +ALU2(ADD) +ALU3(ADD3) +ALU2(AVG) +ALU2(MUL) +ALU1(FRC) +ALU1(RNDD) +ALU1(RNDE) +ALU1(RNDU) +ALU1(RNDZ) +ALU2(MAC) +ALU2(MACH) +ALU1(LZD) +ALU2(DP4) +ALU2(DPH) +ALU2(DP3) +ALU2(DP2) +ALU3(DP4A) +ALU2(LINE) +ALU2(PLN) +ALU3(MAD) +ALU3(LRP) +ALU1(BFREV) +ALU3(BFE) +ALU2(BFI1) +ALU3(BFI2) +ALU1(FBH) +ALU1(FBL) +ALU1(CBIT) +ALU2(ADDC) 
+ALU2(SUBB)
+
+#undef ALU1
+#undef ALU2
+#undef ALU3
+
+/* Scaling factor between logical message-length units and physical
+ * registers: 2 on gfx ver >= 20, 1 otherwise.
+ */
+static inline unsigned
+reg_unit(const struct intel_device_info *devinfo)
+{
+   return devinfo->ver >= 20 ? 2 : 1;
+}
+
+
+/* Helpers for SEND instruction:
+ */
+
+/**
+ * Construct a message descriptor immediate with the specified common
+ * descriptor controls.
+ *
+ * On gfx5+ the lengths are encoded in reg_unit() units and must be
+ * multiples of it; pre-gfx5 uses a narrower field layout.
+ */
+static inline uint32_t
+brw_message_desc(const struct intel_device_info *devinfo,
+                 unsigned msg_length,
+                 unsigned response_length,
+                 bool header_present)
+{
+   if (devinfo->ver >= 5) {
+      assert(msg_length % reg_unit(devinfo) == 0);
+      assert(response_length % reg_unit(devinfo) == 0);
+      return (SET_BITS(msg_length / reg_unit(devinfo), 28, 25) |
+              SET_BITS(response_length / reg_unit(devinfo), 24, 20) |
+              SET_BITS(header_present, 19, 19));
+   } else {
+      return (SET_BITS(msg_length, 23, 20) |
+              SET_BITS(response_length, 19, 16));
+   }
+}
+
+/* Extract the message length (in registers) from a message descriptor. */
+static inline unsigned
+brw_message_desc_mlen(const struct intel_device_info *devinfo, uint32_t desc)
+{
+   if (devinfo->ver >= 5)
+      return GET_BITS(desc, 28, 25) * reg_unit(devinfo);
+   else
+      return GET_BITS(desc, 23, 20);
+}
+
+/* Extract the response length (in registers) from a message descriptor. */
+static inline unsigned
+brw_message_desc_rlen(const struct intel_device_info *devinfo, uint32_t desc)
+{
+   if (devinfo->ver >= 5)
+      return GET_BITS(desc, 24, 20) * reg_unit(devinfo);
+   else
+      return GET_BITS(desc, 19, 16);
+}
+
+/* Whether the descriptor indicates a header is present (gfx5+ only). */
+static inline bool
+brw_message_desc_header_present(ASSERTED
+                                const struct intel_device_info *devinfo,
+                                uint32_t desc)
+{
+   assert(devinfo->ver >= 5);
+   return GET_BITS(desc, 19, 19);
+}
+
+/* Construct an extended message descriptor with the given extended message
+ * length (encoded in reg_unit() units, bits 9:6).
+ */
+static inline unsigned
+brw_message_ex_desc(const struct intel_device_info *devinfo,
+                    unsigned ex_msg_length)
+{
+   assert(ex_msg_length % reg_unit(devinfo) == 0);
+   return SET_BITS(ex_msg_length / reg_unit(devinfo), 9, 6);
+}
+
+/* Extract the extended message length (in registers) from an extended
+ * message descriptor.
+ */
+static inline unsigned
+brw_message_ex_desc_ex_mlen(const struct intel_device_info *devinfo,
+                            uint32_t ex_desc)
+{
+   return GET_BITS(ex_desc, 9, 6) * reg_unit(devinfo);
+}
+
+static inline uint32_t
+brw_urb_desc(const struct intel_device_info *devinfo,
+ unsigned msg_type, + bool per_slot_offset_present, + bool channel_mask_present, + unsigned global_offset) +{ + if (devinfo->ver >= 8) { + return (SET_BITS(per_slot_offset_present, 17, 17) | + SET_BITS(channel_mask_present, 15, 15) | + SET_BITS(global_offset, 14, 4) | + SET_BITS(msg_type, 3, 0)); + } else if (devinfo->ver >= 7) { + assert(!channel_mask_present); + return (SET_BITS(per_slot_offset_present, 16, 16) | + SET_BITS(global_offset, 13, 3) | + SET_BITS(msg_type, 3, 0)); + } else { + unreachable("unhandled URB write generation"); + } +} + +static inline uint32_t +brw_urb_desc_msg_type(ASSERTED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->ver >= 7); + return GET_BITS(desc, 3, 0); +} + +static inline uint32_t +brw_urb_fence_desc(const struct intel_device_info *devinfo) +{ + assert(devinfo->has_lsc); + return brw_urb_desc(devinfo, GFX125_URB_OPCODE_FENCE, false, false, 0); +} + +/** + * Construct a message descriptor immediate with the specified sampler + * function controls. + */ +static inline uint32_t +brw_sampler_desc(const struct intel_device_info *devinfo, + unsigned binding_table_index, + unsigned sampler, + unsigned msg_type, + unsigned simd_mode, + unsigned return_format) +{ + const unsigned desc = (SET_BITS(binding_table_index, 7, 0) | + SET_BITS(sampler, 11, 8)); + + /* From GFX20 Bspec: Shared Functions - Message Descriptor - + * Sampling Engine: + * + * Message Type[5] 31 This bit represents the upper bit of message type + * 6-bit encoding (c.f. [16:12]). This bit is set + * for messages with programmable offsets. + */ + if (devinfo->ver >= 20) + return desc | SET_BITS(msg_type & 0x1F, 16, 12) | + SET_BITS(simd_mode & 0x3, 18, 17) | + SET_BITS(simd_mode >> 2, 29, 29) | + SET_BITS(return_format, 30, 30) | + SET_BITS(msg_type >> 5, 31, 31); + + /* From the CHV Bspec: Shared Functions - Message Descriptor - + * Sampling Engine: + * + * SIMD Mode[2] 29 This field is the upper bit of the 3-bit + * SIMD Mode field. 
+ */ + if (devinfo->ver >= 8) + return desc | SET_BITS(msg_type, 16, 12) | + SET_BITS(simd_mode & 0x3, 18, 17) | + SET_BITS(simd_mode >> 2, 29, 29) | + SET_BITS(return_format, 30, 30); + if (devinfo->ver >= 7) + return (desc | SET_BITS(msg_type, 16, 12) | + SET_BITS(simd_mode, 18, 17)); + else if (devinfo->ver >= 5) + return (desc | SET_BITS(msg_type, 15, 12) | + SET_BITS(simd_mode, 17, 16)); + else if (devinfo->verx10 >= 45) + return desc | SET_BITS(msg_type, 15, 12); + else + return (desc | SET_BITS(return_format, 13, 12) | + SET_BITS(msg_type, 15, 14)); +} + +static inline unsigned +brw_sampler_desc_binding_table_index(UNUSED + const struct intel_device_info *devinfo, + uint32_t desc) +{ + return GET_BITS(desc, 7, 0); +} + +static inline unsigned +brw_sampler_desc_sampler(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + return GET_BITS(desc, 11, 8); +} + +static inline unsigned +brw_sampler_desc_msg_type(const struct intel_device_info *devinfo, uint32_t desc) +{ + if (devinfo->ver >= 20) + return GET_BITS(desc, 31, 31) << 5 | GET_BITS(desc, 16, 12); + else if (devinfo->ver >= 7) + return GET_BITS(desc, 16, 12); + else if (devinfo->verx10 >= 45) + return GET_BITS(desc, 15, 12); + else + return GET_BITS(desc, 15, 14); +} + +static inline unsigned +brw_sampler_desc_simd_mode(const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->ver >= 5); + if (devinfo->ver >= 8) + return GET_BITS(desc, 18, 17) | GET_BITS(desc, 29, 29) << 2; + else if (devinfo->ver >= 7) + return GET_BITS(desc, 18, 17); + else + return GET_BITS(desc, 17, 16); +} + +static inline unsigned +brw_sampler_desc_return_format(ASSERTED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->verx10 == 40 || devinfo->ver >= 8); + if (devinfo->ver >= 8) + return GET_BITS(desc, 30, 30); + else + return GET_BITS(desc, 13, 12); +} + +/** + * Construct a message descriptor for the dataport + */ +static inline uint32_t +brw_dp_desc(const struct 
intel_device_info *devinfo, + unsigned binding_table_index, + unsigned msg_type, + unsigned msg_control) +{ + /* Prior to gfx6, things are too inconsistent; use the dp_read/write_desc + * helpers instead. + */ + assert(devinfo->ver >= 6); + const unsigned desc = SET_BITS(binding_table_index, 7, 0); + if (devinfo->ver >= 8) { + return (desc | SET_BITS(msg_control, 13, 8) | + SET_BITS(msg_type, 18, 14)); + } else if (devinfo->ver >= 7) { + return (desc | SET_BITS(msg_control, 13, 8) | + SET_BITS(msg_type, 17, 14)); + } else { + return (desc | SET_BITS(msg_control, 12, 8) | + SET_BITS(msg_type, 16, 13)); + } +} + +static inline unsigned +brw_dp_desc_binding_table_index(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + return GET_BITS(desc, 7, 0); +} + +static inline unsigned +brw_dp_desc_msg_type(const struct intel_device_info *devinfo, uint32_t desc) +{ + assert(devinfo->ver >= 6); + if (devinfo->ver >= 8) + return GET_BITS(desc, 18, 14); + else if (devinfo->ver >= 7) + return GET_BITS(desc, 17, 14); + else + return GET_BITS(desc, 16, 13); +} + +static inline unsigned +brw_dp_desc_msg_control(const struct intel_device_info *devinfo, uint32_t desc) +{ + assert(devinfo->ver >= 6); + if (devinfo->ver >= 7) + return GET_BITS(desc, 13, 8); + else + return GET_BITS(desc, 12, 8); +} + +/** + * Construct a message descriptor immediate with the specified dataport read + * function controls. 
+ */ +static inline uint32_t +brw_dp_read_desc(const struct intel_device_info *devinfo, + unsigned binding_table_index, + unsigned msg_control, + unsigned msg_type, + unsigned target_cache) +{ + if (devinfo->ver >= 6) + return brw_dp_desc(devinfo, binding_table_index, msg_type, msg_control); + else if (devinfo->verx10 >= 45) + return (SET_BITS(binding_table_index, 7, 0) | + SET_BITS(msg_control, 10, 8) | + SET_BITS(msg_type, 13, 11) | + SET_BITS(target_cache, 15, 14)); + else + return (SET_BITS(binding_table_index, 7, 0) | + SET_BITS(msg_control, 11, 8) | + SET_BITS(msg_type, 13, 12) | + SET_BITS(target_cache, 15, 14)); +} + +static inline unsigned +brw_dp_read_desc_msg_type(const struct intel_device_info *devinfo, + uint32_t desc) +{ + if (devinfo->ver >= 6) + return brw_dp_desc_msg_type(devinfo, desc); + else if (devinfo->verx10 >= 45) + return GET_BITS(desc, 13, 11); + else + return GET_BITS(desc, 13, 12); +} + +static inline unsigned +brw_dp_read_desc_msg_control(const struct intel_device_info *devinfo, + uint32_t desc) +{ + if (devinfo->ver >= 6) + return brw_dp_desc_msg_control(devinfo, desc); + else if (devinfo->verx10 >= 45) + return GET_BITS(desc, 10, 8); + else + return GET_BITS(desc, 11, 8); +} + +/** + * Construct a message descriptor immediate with the specified dataport write + * function controls. 
+ */ +static inline uint32_t +brw_dp_write_desc(const struct intel_device_info *devinfo, + unsigned binding_table_index, + unsigned msg_control, + unsigned msg_type, + unsigned send_commit_msg) +{ + assert(devinfo->ver <= 6 || !send_commit_msg); + if (devinfo->ver >= 6) { + return brw_dp_desc(devinfo, binding_table_index, msg_type, msg_control) | + SET_BITS(send_commit_msg, 17, 17); + } else { + return (SET_BITS(binding_table_index, 7, 0) | + SET_BITS(msg_control, 11, 8) | + SET_BITS(msg_type, 14, 12) | + SET_BITS(send_commit_msg, 15, 15)); + } +} + +static inline unsigned +brw_dp_write_desc_msg_type(const struct intel_device_info *devinfo, + uint32_t desc) +{ + if (devinfo->ver >= 6) + return brw_dp_desc_msg_type(devinfo, desc); + else + return GET_BITS(desc, 14, 12); +} + +static inline unsigned +brw_dp_write_desc_msg_control(const struct intel_device_info *devinfo, + uint32_t desc) +{ + if (devinfo->ver >= 6) + return brw_dp_desc_msg_control(devinfo, desc); + else + return GET_BITS(desc, 11, 8); +} + +static inline bool +brw_dp_write_desc_write_commit(const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->ver <= 6); + if (devinfo->ver >= 6) + return GET_BITS(desc, 17, 17); + else + return GET_BITS(desc, 15, 15); +} + +/** + * Construct a message descriptor immediate with the specified dataport + * surface function controls. 
+ */ +static inline uint32_t +brw_dp_surface_desc(const struct intel_device_info *devinfo, + unsigned msg_type, + unsigned msg_control) +{ + assert(devinfo->ver >= 7); + /* We'll OR in the binding table index later */ + return brw_dp_desc(devinfo, 0, msg_type, msg_control); +} + +static inline uint32_t +brw_dp_untyped_atomic_desc(const struct intel_device_info *devinfo, + unsigned exec_size, /**< 0 for SIMD4x2 */ + unsigned atomic_op, + bool response_expected) +{ + assert(exec_size <= 8 || exec_size == 16); + + unsigned msg_type; + if (devinfo->verx10 >= 75) { + if (exec_size > 0) { + msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP; + } else { + msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2; + } + } else { + msg_type = GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP; + } + + const unsigned msg_control = + SET_BITS(atomic_op, 3, 0) | + SET_BITS(0 < exec_size && exec_size <= 8, 4, 4) | + SET_BITS(response_expected, 5, 5); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline uint32_t +brw_dp_untyped_atomic_float_desc(const struct intel_device_info *devinfo, + unsigned exec_size, + unsigned atomic_op, + bool response_expected) +{ + assert(exec_size <= 8 || exec_size == 16); + assert(devinfo->ver >= 9); + + assert(exec_size > 0); + const unsigned msg_type = GFX9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP; + + const unsigned msg_control = + SET_BITS(atomic_op, 1, 0) | + SET_BITS(exec_size <= 8, 4, 4) | + SET_BITS(response_expected, 5, 5); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline unsigned +brw_mdc_cmask(unsigned num_channels) +{ + /* See also MDC_CMASK in the SKL PRM Vol 2d. 
*/ + return 0xf & (0xf << num_channels); +} + +static inline unsigned +lsc_cmask(unsigned num_channels) +{ + assert(num_channels > 0 && num_channels <= 4); + return BITSET_MASK(num_channels); +} + +static inline uint32_t +brw_dp_untyped_surface_rw_desc(const struct intel_device_info *devinfo, + unsigned exec_size, /**< 0 for SIMD4x2 */ + unsigned num_channels, + bool write) +{ + assert(exec_size <= 8 || exec_size == 16); + + unsigned msg_type; + if (write) { + if (devinfo->verx10 >= 75) { + msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE; + } else { + msg_type = GFX7_DATAPORT_DC_UNTYPED_SURFACE_WRITE; + } + } else { + /* Read */ + if (devinfo->verx10 >= 75) { + msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ; + } else { + msg_type = GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ; + } + } + + /* SIMD4x2 is only valid for read messages on IVB; use SIMD8 instead */ + if (write && devinfo->verx10 == 70 && exec_size == 0) + exec_size = 8; + + /* See also MDC_SM3 in the SKL PRM Vol 2d. */ + const unsigned simd_mode = exec_size == 0 ? 0 : /* SIMD4x2 */ + exec_size <= 8 ? 2 : 1; + + const unsigned msg_control = + SET_BITS(brw_mdc_cmask(num_channels), 3, 0) | + SET_BITS(simd_mode, 5, 4); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline unsigned +brw_mdc_ds(unsigned bit_size) +{ + switch (bit_size) { + case 8: + return GFX7_BYTE_SCATTERED_DATA_ELEMENT_BYTE; + case 16: + return GFX7_BYTE_SCATTERED_DATA_ELEMENT_WORD; + case 32: + return GFX7_BYTE_SCATTERED_DATA_ELEMENT_DWORD; + default: + unreachable("Unsupported bit_size for byte scattered messages"); + } +} + +static inline uint32_t +brw_dp_byte_scattered_rw_desc(const struct intel_device_info *devinfo, + unsigned exec_size, + unsigned bit_size, + bool write) +{ + assert(exec_size <= 8 || exec_size == 16); + + assert(devinfo->verx10 >= 75); + const unsigned msg_type = + write ? 
HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE : + HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ; + + assert(exec_size > 0); + const unsigned msg_control = + SET_BITS(exec_size == 16, 0, 0) | + SET_BITS(brw_mdc_ds(bit_size), 3, 2); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline uint32_t +brw_dp_dword_scattered_rw_desc(const struct intel_device_info *devinfo, + unsigned exec_size, + bool write) +{ + assert(exec_size == 8 || exec_size == 16); + + unsigned msg_type; + if (write) { + if (devinfo->ver >= 6) { + msg_type = GFX6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE; + } else { + msg_type = BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE; + } + } else { + if (devinfo->ver >= 7) { + msg_type = GFX7_DATAPORT_DC_DWORD_SCATTERED_READ; + } else if (devinfo->verx10 >= 45) { + msg_type = G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ; + } else { + msg_type = BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ; + } + } + + const unsigned msg_control = + SET_BITS(1, 1, 1) | /* Legacy SIMD Mode */ + SET_BITS(exec_size == 16, 0, 0); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline uint32_t +brw_dp_oword_block_rw_desc(const struct intel_device_info *devinfo, + bool align_16B, + unsigned num_dwords, + bool write) +{ + /* Writes can only have addresses aligned by OWORDs (16 Bytes). */ + assert(!write || align_16B); + + const unsigned msg_type = + write ? GFX7_DATAPORT_DC_OWORD_BLOCK_WRITE : + align_16B ? 
GFX7_DATAPORT_DC_OWORD_BLOCK_READ : + GFX7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ; + + const unsigned msg_control = + SET_BITS(BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_dwords), 2, 0); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline uint32_t +brw_dp_a64_untyped_surface_rw_desc(const struct intel_device_info *devinfo, + unsigned exec_size, /**< 0 for SIMD4x2 */ + unsigned num_channels, + bool write) +{ + assert(exec_size <= 8 || exec_size == 16); + assert(devinfo->ver >= 8); + + unsigned msg_type = + write ? GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE : + GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ; + + /* See also MDC_SM3 in the SKL PRM Vol 2d. */ + const unsigned simd_mode = exec_size == 0 ? 0 : /* SIMD4x2 */ + exec_size <= 8 ? 2 : 1; + + const unsigned msg_control = + SET_BITS(brw_mdc_cmask(num_channels), 3, 0) | + SET_BITS(simd_mode, 5, 4); + + return brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT, + msg_type, msg_control); +} + +static inline uint32_t +brw_dp_a64_oword_block_rw_desc(const struct intel_device_info *devinfo, + bool align_16B, + unsigned num_dwords, + bool write) +{ + /* Writes can only have addresses aligned by OWORDs (16 Bytes). */ + assert(!write || align_16B); + + unsigned msg_type = + write ? GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE : + GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ; + + unsigned msg_control = + SET_BITS(!align_16B, 4, 3) | + SET_BITS(BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_dwords), 2, 0); + + return brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT, + msg_type, msg_control); +} + +/** + * Calculate the data size (see MDC_A64_DS in the "Structures" volume of the + * Skylake PRM). 
+ */
+static inline uint32_t
+brw_mdc_a64_ds(unsigned elems)
+{
+   switch (elems) {
+   case 1: return 0;
+   case 2: return 1;
+   case 4: return 2;
+   case 8: return 3;
+   default:
+      /* Fixed typo: "elmeent" -> "element". */
+      unreachable("Unsupported element count for A64 scattered message");
+   }
+}
+
+/**
+ * Construct a message descriptor for an A64 (stateless) byte scattered
+ * read or write of the given element bit size.
+ */
+static inline uint32_t
+brw_dp_a64_byte_scattered_rw_desc(const struct intel_device_info *devinfo,
+                                  unsigned exec_size, /**< 0 for SIMD4x2 */
+                                  unsigned bit_size,
+                                  bool write)
+{
+   assert(exec_size <= 8 || exec_size == 16);
+   assert(devinfo->ver >= 8);
+
+   unsigned msg_type =
+      write ? GFX8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE :
+              GFX9_DATAPORT_DC_PORT1_A64_SCATTERED_READ;
+
+   const unsigned msg_control =
+      SET_BITS(GFX8_A64_SCATTERED_SUBTYPE_BYTE, 1, 0) |
+      SET_BITS(brw_mdc_a64_ds(bit_size / 8), 3, 2) |
+      SET_BITS(exec_size == 16, 4, 4);
+
+   return brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT,
+                      msg_type, msg_control);
+}
+
+/**
+ * Construct a message descriptor for an A64 (stateless) untyped atomic
+ * integer operation.
+ */
+static inline uint32_t
+brw_dp_a64_untyped_atomic_desc(const struct intel_device_info *devinfo,
+                               ASSERTED unsigned exec_size, /**< 0 for SIMD4x2 */
+                               unsigned bit_size,
+                               unsigned atomic_op,
+                               bool response_expected)
+{
+   assert(exec_size == 8);
+   assert(devinfo->ver >= 8);
+   assert(bit_size == 16 || bit_size == 32 || bit_size == 64);
+   assert(devinfo->ver >= 12 || bit_size >= 32);
+
+   /* 16-bit atomics (gfx12+) use a distinct half-int message type. */
+   const unsigned msg_type = bit_size == 16 ?
+ GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_INT_OP : + GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP; + + const unsigned msg_control = + SET_BITS(atomic_op, 3, 0) | + SET_BITS(bit_size == 64, 4, 4) | + SET_BITS(response_expected, 5, 5); + + return brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT, + msg_type, msg_control); +} + +static inline uint32_t +brw_dp_a64_untyped_atomic_float_desc(const struct intel_device_info *devinfo, + ASSERTED unsigned exec_size, + unsigned bit_size, + unsigned atomic_op, + bool response_expected) +{ + assert(exec_size == 8); + assert(devinfo->ver >= 9); + assert(bit_size == 16 || bit_size == 32); + assert(devinfo->ver >= 12 || bit_size == 32); + + assert(exec_size > 0); + const unsigned msg_type = bit_size == 32 ? + GFX9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP : + GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_FLOAT_OP; + + const unsigned msg_control = + SET_BITS(atomic_op, 1, 0) | + SET_BITS(response_expected, 5, 5); + + return brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT, + msg_type, msg_control); +} + +static inline uint32_t +brw_dp_typed_atomic_desc(const struct intel_device_info *devinfo, + unsigned exec_size, + unsigned exec_group, + unsigned atomic_op, + bool response_expected) +{ + assert(exec_size > 0 || exec_group == 0); + assert(exec_group % 8 == 0); + + unsigned msg_type; + if (devinfo->verx10 >= 75) { + if (exec_size == 0) { + msg_type = HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2; + } else { + msg_type = HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP; + } + } else { + /* SIMD4x2 typed surface R/W messages only exist on HSW+ */ + assert(exec_size > 0); + msg_type = GFX7_DATAPORT_RC_TYPED_ATOMIC_OP; + } + + const bool high_sample_mask = (exec_group / 8) % 2 == 1; + + const unsigned msg_control = + SET_BITS(atomic_op, 3, 0) | + SET_BITS(high_sample_mask, 4, 4) | + SET_BITS(response_expected, 5, 5); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline uint32_t 
+brw_dp_typed_surface_rw_desc(const struct intel_device_info *devinfo, + unsigned exec_size, + unsigned exec_group, + unsigned num_channels, + bool write) +{ + assert(exec_size > 0 || exec_group == 0); + assert(exec_group % 8 == 0); + + /* Typed surface reads and writes don't support SIMD16 */ + assert(exec_size <= 8); + + unsigned msg_type; + if (write) { + if (devinfo->verx10 >= 75) { + msg_type = HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE; + } else { + msg_type = GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE; + } + } else { + if (devinfo->verx10 >= 75) { + msg_type = HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ; + } else { + msg_type = GFX7_DATAPORT_RC_TYPED_SURFACE_READ; + } + } + + /* See also MDC_SG3 in the SKL PRM Vol 2d. */ + unsigned msg_control; + if (devinfo->verx10 >= 75) { + /* See also MDC_SG3 in the SKL PRM Vol 2d. */ + const unsigned slot_group = exec_size == 0 ? 0 : /* SIMD4x2 */ + 1 + ((exec_group / 8) % 2); + + msg_control = + SET_BITS(brw_mdc_cmask(num_channels), 3, 0) | + SET_BITS(slot_group, 5, 4); + } else { + /* SIMD4x2 typed surface R/W messages only exist on HSW+ */ + assert(exec_size > 0); + const unsigned slot_group = ((exec_group / 8) % 2); + + msg_control = + SET_BITS(brw_mdc_cmask(num_channels), 3, 0) | + SET_BITS(slot_group, 5, 5); + } + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline uint32_t +brw_fb_desc(const struct intel_device_info *devinfo, + unsigned binding_table_index, + unsigned msg_type, + unsigned msg_control) +{ + /* Prior to gen6, things are too inconsistent; use the fb_(read|write)_desc + * helpers instead. 
+ */ + assert(devinfo->ver >= 6); + const unsigned desc = SET_BITS(binding_table_index, 7, 0); + if (devinfo->ver >= 7) { + return (desc | SET_BITS(msg_control, 13, 8) | + SET_BITS(msg_type, 17, 14)); + } else { + return (desc | SET_BITS(msg_control, 12, 8) | + SET_BITS(msg_type, 16, 13)); + } +} + +static inline unsigned +brw_fb_desc_binding_table_index(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + return GET_BITS(desc, 7, 0); +} + +static inline uint32_t +brw_fb_desc_msg_control(const struct intel_device_info *devinfo, uint32_t desc) +{ + assert(devinfo->ver >= 6); + if (devinfo->ver >= 7) + return GET_BITS(desc, 13, 8); + else + return GET_BITS(desc, 12, 8); +} + +static inline unsigned +brw_fb_desc_msg_type(const struct intel_device_info *devinfo, uint32_t desc) +{ + assert(devinfo->ver >= 6); + if (devinfo->ver >= 7) + return GET_BITS(desc, 17, 14); + else + return GET_BITS(desc, 16, 13); +} + +static inline uint32_t +brw_fb_read_desc(const struct intel_device_info *devinfo, + unsigned binding_table_index, + unsigned msg_control, + unsigned exec_size, + bool per_sample) +{ + assert(devinfo->ver >= 9); + assert(exec_size == 8 || exec_size == 16); + + return brw_fb_desc(devinfo, binding_table_index, + GFX9_DATAPORT_RC_RENDER_TARGET_READ, msg_control) | + SET_BITS(per_sample, 13, 13) | + SET_BITS(exec_size == 8, 8, 8) /* Render Target Message Subtype */; +} + +static inline uint32_t +brw_fb_write_desc(const struct intel_device_info *devinfo, + unsigned binding_table_index, + unsigned msg_control, + bool last_render_target, + bool coarse_write) +{ + const unsigned msg_type = + devinfo->ver >= 6 ? 
+ GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE : + BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE; + + assert(devinfo->ver >= 10 || !coarse_write); + + if (devinfo->ver >= 6) { + return brw_fb_desc(devinfo, binding_table_index, msg_type, msg_control) | + SET_BITS(last_render_target, 12, 12) | + SET_BITS(coarse_write, 18, 18); + } else { + return (SET_BITS(binding_table_index, 7, 0) | + SET_BITS(msg_control, 11, 8) | + SET_BITS(last_render_target, 11, 11) | + SET_BITS(msg_type, 14, 12)); + } +} + +static inline unsigned +brw_fb_write_desc_msg_type(const struct intel_device_info *devinfo, + uint32_t desc) +{ + if (devinfo->ver >= 6) + return brw_fb_desc_msg_type(devinfo, desc); + else + return GET_BITS(desc, 14, 12); +} + +static inline unsigned +brw_fb_write_desc_msg_control(const struct intel_device_info *devinfo, + uint32_t desc) +{ + if (devinfo->ver >= 6) + return brw_fb_desc_msg_control(devinfo, desc); + else + return GET_BITS(desc, 11, 8); +} + +static inline bool +brw_fb_write_desc_last_render_target(const struct intel_device_info *devinfo, + uint32_t desc) +{ + if (devinfo->ver >= 6) + return GET_BITS(desc, 12, 12); + else + return GET_BITS(desc, 11, 11); +} + +static inline bool +brw_fb_write_desc_write_commit(const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->ver <= 6); + if (devinfo->ver >= 6) + return GET_BITS(desc, 17, 17); + else + return GET_BITS(desc, 15, 15); +} + +static inline bool +brw_fb_write_desc_coarse_write(const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->ver >= 10); + return GET_BITS(desc, 18, 18); +} + +static inline bool +lsc_opcode_has_cmask(enum lsc_opcode opcode) +{ + return opcode == LSC_OP_LOAD_CMASK || opcode == LSC_OP_STORE_CMASK; +} + +static inline bool +lsc_opcode_has_transpose(enum lsc_opcode opcode) +{ + return opcode == LSC_OP_LOAD || opcode == LSC_OP_STORE; +} + +static inline bool +lsc_opcode_is_store(enum lsc_opcode opcode) +{ + return opcode == LSC_OP_STORE || + 
opcode == LSC_OP_STORE_CMASK; +} + +static inline bool +lsc_opcode_is_atomic(enum lsc_opcode opcode) +{ + switch (opcode) { + case LSC_OP_ATOMIC_INC: + case LSC_OP_ATOMIC_DEC: + case LSC_OP_ATOMIC_LOAD: + case LSC_OP_ATOMIC_STORE: + case LSC_OP_ATOMIC_ADD: + case LSC_OP_ATOMIC_SUB: + case LSC_OP_ATOMIC_MIN: + case LSC_OP_ATOMIC_MAX: + case LSC_OP_ATOMIC_UMIN: + case LSC_OP_ATOMIC_UMAX: + case LSC_OP_ATOMIC_CMPXCHG: + case LSC_OP_ATOMIC_FADD: + case LSC_OP_ATOMIC_FSUB: + case LSC_OP_ATOMIC_FMIN: + case LSC_OP_ATOMIC_FMAX: + case LSC_OP_ATOMIC_FCMPXCHG: + case LSC_OP_ATOMIC_AND: + case LSC_OP_ATOMIC_OR: + case LSC_OP_ATOMIC_XOR: + return true; + + default: + return false; + } +} + +static inline bool +lsc_opcode_is_atomic_float(enum lsc_opcode opcode) +{ + switch (opcode) { + case LSC_OP_ATOMIC_FADD: + case LSC_OP_ATOMIC_FSUB: + case LSC_OP_ATOMIC_FMIN: + case LSC_OP_ATOMIC_FMAX: + case LSC_OP_ATOMIC_FCMPXCHG: + return true; + + default: + return false; + } +} + +static inline unsigned +lsc_op_num_data_values(unsigned _op) +{ + enum lsc_opcode op = (enum lsc_opcode) _op; + + switch (op) { + case LSC_OP_ATOMIC_CMPXCHG: + case LSC_OP_ATOMIC_FCMPXCHG: + return 2; + case LSC_OP_ATOMIC_INC: + case LSC_OP_ATOMIC_DEC: + case LSC_OP_LOAD: + case LSC_OP_LOAD_CMASK: + case LSC_OP_FENCE: + /* XXX: actually check docs */ + return 0; + default: + return 1; + } +} + +static inline unsigned +lsc_op_to_legacy_atomic(unsigned _op) +{ + enum lsc_opcode op = (enum lsc_opcode) _op; + + switch (op) { + case LSC_OP_ATOMIC_INC: + return BRW_AOP_INC; + case LSC_OP_ATOMIC_DEC: + return BRW_AOP_DEC; + case LSC_OP_ATOMIC_STORE: + return BRW_AOP_MOV; + case LSC_OP_ATOMIC_ADD: + return BRW_AOP_ADD; + case LSC_OP_ATOMIC_SUB: + return BRW_AOP_SUB; + case LSC_OP_ATOMIC_MIN: + return BRW_AOP_IMIN; + case LSC_OP_ATOMIC_MAX: + return BRW_AOP_IMAX; + case LSC_OP_ATOMIC_UMIN: + return BRW_AOP_UMIN; + case LSC_OP_ATOMIC_UMAX: + return BRW_AOP_UMAX; + case LSC_OP_ATOMIC_CMPXCHG: + return BRW_AOP_CMPWR; + 
case LSC_OP_ATOMIC_FADD: + return BRW_AOP_FADD; + case LSC_OP_ATOMIC_FMIN: + return BRW_AOP_FMIN; + case LSC_OP_ATOMIC_FMAX: + return BRW_AOP_FMAX; + case LSC_OP_ATOMIC_FCMPXCHG: + return BRW_AOP_FCMPWR; + case LSC_OP_ATOMIC_AND: + return BRW_AOP_AND; + case LSC_OP_ATOMIC_OR: + return BRW_AOP_OR; + case LSC_OP_ATOMIC_XOR: + return BRW_AOP_XOR; + /* No LSC op maps to BRW_AOP_PREDEC */ + case LSC_OP_ATOMIC_LOAD: + case LSC_OP_ATOMIC_FSUB: + unreachable("no corresponding legacy atomic operation"); + case LSC_OP_LOAD: + case LSC_OP_LOAD_CMASK: + case LSC_OP_STORE: + case LSC_OP_STORE_CMASK: + case LSC_OP_FENCE: + unreachable("not an atomic op"); + } + + unreachable("invalid LSC op"); +} + +static inline uint32_t +lsc_data_size_bytes(enum lsc_data_size data_size) +{ + switch (data_size) { + case LSC_DATA_SIZE_D8: + return 1; + case LSC_DATA_SIZE_D16: + return 2; + case LSC_DATA_SIZE_D32: + case LSC_DATA_SIZE_D8U32: + case LSC_DATA_SIZE_D16U32: + case LSC_DATA_SIZE_D16BF32: + return 4; + case LSC_DATA_SIZE_D64: + return 8; + default: + unreachable("Unsupported data payload size."); + } +} + +static inline uint32_t +lsc_addr_size_bytes(enum lsc_addr_size addr_size) +{ + switch (addr_size) { + case LSC_ADDR_SIZE_A16: return 2; + case LSC_ADDR_SIZE_A32: return 4; + case LSC_ADDR_SIZE_A64: return 8; + default: + unreachable("Unsupported address size."); + } +} + +static inline uint32_t +lsc_vector_length(enum lsc_vect_size vect_size) +{ + switch (vect_size) { + case LSC_VECT_SIZE_V1: return 1; + case LSC_VECT_SIZE_V2: return 2; + case LSC_VECT_SIZE_V3: return 3; + case LSC_VECT_SIZE_V4: return 4; + case LSC_VECT_SIZE_V8: return 8; + case LSC_VECT_SIZE_V16: return 16; + case LSC_VECT_SIZE_V32: return 32; + case LSC_VECT_SIZE_V64: return 64; + default: + unreachable("Unsupported size of vector"); + } +} + +static inline enum lsc_vect_size +lsc_vect_size(unsigned vect_size) +{ + switch(vect_size) { + case 1: return LSC_VECT_SIZE_V1; + case 2: return LSC_VECT_SIZE_V2; + case 3: 
return LSC_VECT_SIZE_V3; + case 4: return LSC_VECT_SIZE_V4; + case 8: return LSC_VECT_SIZE_V8; + case 16: return LSC_VECT_SIZE_V16; + case 32: return LSC_VECT_SIZE_V32; + case 64: return LSC_VECT_SIZE_V64; + default: + unreachable("Unsupported vector size for dataport"); + } +} + +static inline uint32_t +lsc_msg_desc_wcmask(UNUSED const struct intel_device_info *devinfo, + enum lsc_opcode opcode, unsigned simd_size, + enum lsc_addr_surface_type addr_type, + enum lsc_addr_size addr_sz, unsigned num_coordinates, + enum lsc_data_size data_sz, unsigned num_channels, + bool transpose, unsigned cache_ctrl, bool has_dest, unsigned cmask) +{ + assert(devinfo->has_lsc); + + unsigned dest_length = !has_dest ? 0 : + DIV_ROUND_UP(lsc_data_size_bytes(data_sz) * num_channels * simd_size, + reg_unit(devinfo) * REG_SIZE); + + unsigned src0_length = + DIV_ROUND_UP(lsc_addr_size_bytes(addr_sz) * num_coordinates * simd_size, + reg_unit(devinfo) * REG_SIZE); + + assert(!transpose || lsc_opcode_has_transpose(opcode)); + + unsigned msg_desc = + SET_BITS(opcode, 5, 0) | + SET_BITS(addr_sz, 8, 7) | + SET_BITS(data_sz, 11, 9) | + SET_BITS(transpose, 15, 15) | + SET_BITS(cache_ctrl, 19, 17) | + SET_BITS(dest_length, 24, 20) | + SET_BITS(src0_length, 28, 25) | + SET_BITS(addr_type, 30, 29); + + if (lsc_opcode_has_cmask(opcode)) + msg_desc |= SET_BITS(cmask ? 
cmask : lsc_cmask(num_channels), 15, 12); + else + msg_desc |= SET_BITS(lsc_vect_size(num_channels), 14, 12); + + return msg_desc; +} + +static inline uint32_t +lsc_msg_desc(UNUSED const struct intel_device_info *devinfo, + enum lsc_opcode opcode, unsigned simd_size, + enum lsc_addr_surface_type addr_type, + enum lsc_addr_size addr_sz, unsigned num_coordinates, + enum lsc_data_size data_sz, unsigned num_channels, + bool transpose, unsigned cache_ctrl, bool has_dest) +{ + return lsc_msg_desc_wcmask(devinfo, opcode, simd_size, addr_type, addr_sz, + num_coordinates, data_sz, num_channels, transpose, cache_ctrl, + has_dest, 0); +} + +static inline enum lsc_opcode +lsc_msg_desc_opcode(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return (enum lsc_opcode) GET_BITS(desc, 5, 0); +} + +static inline enum lsc_addr_size +lsc_msg_desc_addr_size(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return (enum lsc_addr_size) GET_BITS(desc, 8, 7); +} + +static inline enum lsc_data_size +lsc_msg_desc_data_size(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return (enum lsc_data_size) GET_BITS(desc, 11, 9); +} + +static inline enum lsc_vect_size +lsc_msg_desc_vect_size(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + assert(!lsc_opcode_has_cmask(lsc_msg_desc_opcode(devinfo, desc))); + return (enum lsc_vect_size) GET_BITS(desc, 14, 12); +} + +static inline enum lsc_cmask +lsc_msg_desc_cmask(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + assert(lsc_opcode_has_cmask(lsc_msg_desc_opcode(devinfo, desc))); + return (enum lsc_cmask) GET_BITS(desc, 15, 12); +} + +static inline bool +lsc_msg_desc_transpose(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(desc, 15, 15); +} + +static 
inline unsigned +lsc_msg_desc_cache_ctrl(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(desc, 19, 17); +} + +static inline unsigned +lsc_msg_desc_dest_len(const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(desc, 24, 20) * reg_unit(devinfo); +} + +static inline unsigned +lsc_msg_desc_src0_len(const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(desc, 28, 25) * reg_unit(devinfo); +} + +static inline enum lsc_addr_surface_type +lsc_msg_desc_addr_type(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return (enum lsc_addr_surface_type) GET_BITS(desc, 30, 29); +} + +static inline uint32_t +lsc_fence_msg_desc(UNUSED const struct intel_device_info *devinfo, + enum lsc_fence_scope scope, + enum lsc_flush_type flush_type, + bool route_to_lsc) +{ + assert(devinfo->has_lsc); + return SET_BITS(LSC_OP_FENCE, 5, 0) | + SET_BITS(LSC_ADDR_SIZE_A32, 8, 7) | + SET_BITS(scope, 11, 9) | + SET_BITS(flush_type, 14, 12) | + SET_BITS(route_to_lsc, 18, 18) | + SET_BITS(LSC_ADDR_SURFTYPE_FLAT, 30, 29); +} + +static inline enum lsc_fence_scope +lsc_fence_msg_desc_scope(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return (enum lsc_fence_scope) GET_BITS(desc, 11, 9); +} + +static inline enum lsc_flush_type +lsc_fence_msg_desc_flush_type(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return (enum lsc_flush_type) GET_BITS(desc, 14, 12); +} + +static inline enum lsc_backup_fence_routing +lsc_fence_msg_desc_backup_routing(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return (enum lsc_backup_fence_routing) GET_BITS(desc, 18, 18); +} + +static inline uint32_t +lsc_bti_ex_desc(const struct intel_device_info *devinfo, unsigned 
bti) +{ + assert(devinfo->has_lsc); + return SET_BITS(bti, 31, 24) | + SET_BITS(0, 23, 12); /* base offset */ +} + +static inline unsigned +lsc_bti_ex_desc_base_offset(const struct intel_device_info *devinfo, + uint32_t ex_desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(ex_desc, 23, 12); +} + +static inline unsigned +lsc_bti_ex_desc_index(const struct intel_device_info *devinfo, + uint32_t ex_desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(ex_desc, 31, 24); +} + +static inline unsigned +lsc_flat_ex_desc_base_offset(const struct intel_device_info *devinfo, + uint32_t ex_desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(ex_desc, 31, 12); +} + +static inline uint32_t +lsc_bss_ex_desc(const struct intel_device_info *devinfo, + unsigned surface_state_index) +{ + assert(devinfo->has_lsc); + return SET_BITS(surface_state_index, 31, 6); +} + +static inline unsigned +lsc_bss_ex_desc_index(const struct intel_device_info *devinfo, + uint32_t ex_desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(ex_desc, 31, 6); +} + +static inline uint32_t +brw_mdc_sm2(unsigned exec_size) +{ + assert(exec_size == 8 || exec_size == 16); + return exec_size > 8; +} + +static inline uint32_t +brw_mdc_sm2_exec_size(uint32_t sm2) +{ + assert(sm2 <= 1); + return 8 << sm2; +} + +static inline uint32_t +brw_btd_spawn_desc(ASSERTED const struct intel_device_info *devinfo, + unsigned exec_size, unsigned msg_type) +{ + assert(devinfo->has_ray_tracing); + assert(devinfo->ver < 20 || exec_size == 16); + + return SET_BITS(0, 19, 19) | /* No header */ + SET_BITS(msg_type, 17, 14) | + SET_BITS(brw_mdc_sm2(exec_size), 8, 8); +} + +static inline uint32_t +brw_btd_spawn_msg_type(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + return GET_BITS(desc, 17, 14); +} + +static inline uint32_t +brw_btd_spawn_exec_size(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + return brw_mdc_sm2_exec_size(GET_BITS(desc, 8, 8)); +} + +static inline uint32_t 
+brw_rt_trace_ray_desc(ASSERTED const struct intel_device_info *devinfo, + unsigned exec_size) +{ + assert(devinfo->has_ray_tracing); + assert(devinfo->ver < 20 || exec_size == 16); + + return SET_BITS(0, 19, 19) | /* No header */ + SET_BITS(0, 17, 14) | /* Message type */ + SET_BITS(brw_mdc_sm2(exec_size), 8, 8); +} + +static inline uint32_t +brw_rt_trace_ray_desc_exec_size(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + return brw_mdc_sm2_exec_size(GET_BITS(desc, 8, 8)); +} + +/** + * Construct a message descriptor immediate with the specified pixel + * interpolator function controls. + */ +static inline uint32_t +brw_pixel_interp_desc(UNUSED const struct intel_device_info *devinfo, + unsigned msg_type, + bool noperspective, + bool coarse_pixel_rate, + unsigned exec_size, + unsigned group) +{ + assert(exec_size == 8 || exec_size == 16); + const bool simd_mode = exec_size == 16; + const bool slot_group = group >= 16; + + assert(devinfo->ver >= 10 || !coarse_pixel_rate); + return (SET_BITS(slot_group, 11, 11) | + SET_BITS(msg_type, 13, 12) | + SET_BITS(!!noperspective, 14, 14) | + SET_BITS(coarse_pixel_rate, 15, 15) | + SET_BITS(simd_mode, 16, 16)); +} + +void brw_urb_WRITE(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + enum brw_urb_write_flags flags, + unsigned msg_length, + unsigned response_length, + unsigned offset, + unsigned swizzle); + +/** + * Send message to shared unit \p sfid with a possibly indirect descriptor \p + * desc. If \p desc is not an immediate it will be transparently loaded to an + * address register using an OR instruction. 
+ */ +void +brw_send_indirect_message(struct brw_codegen *p, + unsigned sfid, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg desc, + unsigned desc_imm, + bool eot); + +void +brw_send_indirect_split_message(struct brw_codegen *p, + unsigned sfid, + struct brw_reg dst, + struct brw_reg payload0, + struct brw_reg payload1, + struct brw_reg desc, + unsigned desc_imm, + struct brw_reg ex_desc, + unsigned ex_desc_imm, + bool ex_desc_scratch, + bool ex_bso, + bool eot); + +void brw_ff_sync(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + bool allocate, + unsigned response_length, + bool eot); + +void brw_svb_write(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + bool send_commit_msg); + +brw_inst *brw_fb_WRITE(struct brw_codegen *p, + struct brw_reg payload, + struct brw_reg implied_header, + unsigned msg_control, + unsigned binding_table_index, + unsigned msg_length, + unsigned response_length, + bool eot, + bool last_render_target, + bool header_present); + +brw_inst *gfx9_fb_READ(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + unsigned binding_table_index, + unsigned msg_length, + unsigned response_length, + bool per_sample); + +void brw_SAMPLE(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + unsigned sampler, + unsigned msg_type, + unsigned response_length, + unsigned msg_length, + unsigned header_present, + unsigned simd_mode, + unsigned return_format); + +void brw_adjust_sampler_state_pointer(struct brw_codegen *p, + struct brw_reg header, + struct brw_reg sampler_index); + +void gfx4_math(struct brw_codegen *p, + struct brw_reg dest, + unsigned function, + unsigned msg_reg_nr, + struct brw_reg src, + unsigned precision ); + +void gfx6_math(struct brw_codegen *p, + struct brw_reg dest, + unsigned function, + struct brw_reg src0, 
+ struct brw_reg src1); + +void brw_oword_block_read(struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg mrf, + uint32_t offset, + uint32_t bind_table_index); + +unsigned brw_scratch_surface_idx(const struct brw_codegen *p); + +void brw_oword_block_read_scratch(struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg mrf, + int num_regs, + unsigned offset); + +void brw_oword_block_write_scratch(struct brw_codegen *p, + struct brw_reg mrf, + int num_regs, + unsigned offset); + +void gfx7_block_read_scratch(struct brw_codegen *p, + struct brw_reg dest, + int num_regs, + unsigned offset); + +/** + * Return the generation-specific jump distance scaling factor. + * + * Given the number of instructions to jump, we need to scale by + * some number to obtain the actual jump distance to program in an + * instruction. + */ +static inline unsigned +brw_jump_scale(const struct intel_device_info *devinfo) +{ + /* Broadwell measures jump targets in bytes. */ + if (devinfo->ver >= 8) + return 16; + + /* Ironlake and later measure jump targets in 64-bit data chunks (in order + * (to support compaction), so each 128-bit instruction requires 2 chunks. + */ + if (devinfo->ver >= 5) + return 2; + + /* Gfx4 simply uses the number of 128-bit instructions. */ + return 1; +} + +void brw_barrier(struct brw_codegen *p, struct brw_reg src); + +/* If/else/endif. Works by manipulating the execution flags on each + * channel. 
+ */ +brw_inst *brw_IF(struct brw_codegen *p, unsigned execute_size); +brw_inst *gfx6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional, + struct brw_reg src0, struct brw_reg src1); + +void brw_ELSE(struct brw_codegen *p); +void brw_ENDIF(struct brw_codegen *p); + +/* DO/WHILE loops: + */ +brw_inst *brw_DO(struct brw_codegen *p, unsigned execute_size); + +brw_inst *brw_WHILE(struct brw_codegen *p); + +brw_inst *brw_BREAK(struct brw_codegen *p); +brw_inst *brw_CONT(struct brw_codegen *p); +brw_inst *brw_HALT(struct brw_codegen *p); + +/* Forward jumps: + */ +void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx); + +brw_inst *brw_JMPI(struct brw_codegen *p, struct brw_reg index, + unsigned predicate_control); + +void brw_NOP(struct brw_codegen *p); + +void brw_WAIT(struct brw_codegen *p); + +void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func); + +/* Special case: there is never a destination, execution size will be + * taken from src0: + */ +void brw_CMP(struct brw_codegen *p, + struct brw_reg dest, + unsigned conditional, + struct brw_reg src0, + struct brw_reg src1); + +void brw_CMPN(struct brw_codegen *p, + struct brw_reg dest, + unsigned conditional, + struct brw_reg src0, + struct brw_reg src1); + +brw_inst *brw_DPAS(struct brw_codegen *p, enum gfx12_systolic_depth sdepth, + unsigned rcount, struct brw_reg dest, struct brw_reg src0, + struct brw_reg src1, struct brw_reg src2); + +void +brw_untyped_atomic(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned atomic_op, + unsigned msg_length, + bool response_expected, + bool header_present); + +void +brw_untyped_surface_read(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels); + +void +brw_untyped_surface_write(struct brw_codegen *p, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels, + 
bool header_present); + +void +brw_memory_fence(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + enum opcode send_op, + enum brw_message_target sfid, + uint32_t desc, + bool commit_enable, + unsigned bti); + +void +brw_pixel_interpolator_query(struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg mrf, + bool noperspective, + bool coarse_pixel_rate, + unsigned mode, + struct brw_reg data, + unsigned msg_length, + unsigned response_length); + +void +brw_find_live_channel(struct brw_codegen *p, + struct brw_reg dst, + bool last); + +void +brw_broadcast(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg idx); + +void +brw_float_controls_mode(struct brw_codegen *p, + unsigned mode, unsigned mask); + +void +brw_update_reloc_imm(const struct brw_isa_info *isa, + brw_inst *inst, + uint32_t value); + +void +brw_MOV_reloc_imm(struct brw_codegen *p, + struct brw_reg dst, + enum brw_reg_type src_type, + uint32_t id); + +unsigned +brw_num_sources_from_inst(const struct brw_isa_info *isa, + const brw_inst *inst); + +/*********************************************************************** + * brw_eu_util.c: + */ + +void brw_copy_indirect_to_indirect(struct brw_codegen *p, + struct brw_indirect dst_ptr, + struct brw_indirect src_ptr, + unsigned count); + +void brw_copy_from_indirect(struct brw_codegen *p, + struct brw_reg dst, + struct brw_indirect ptr, + unsigned count); + +void brw_copy4(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count); + +void brw_copy8(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count); + +void brw_math_invert( struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src); + +void brw_set_src1(struct brw_codegen *p, brw_inst *insn, struct brw_reg reg); + +void brw_set_desc_ex(struct brw_codegen *p, brw_inst *insn, + unsigned desc, unsigned ex_desc); + +static inline void +brw_set_desc(struct brw_codegen *p, brw_inst *insn, 
unsigned desc) +{ + brw_set_desc_ex(p, insn, desc, 0); +} + +void brw_set_uip_jip(struct brw_codegen *p, int start_offset); + +enum brw_conditional_mod brw_negate_cmod(enum brw_conditional_mod cmod); +enum brw_conditional_mod brw_swap_cmod(enum brw_conditional_mod cmod); + +/* brw_eu_compact.c */ +void brw_compact_instructions(struct brw_codegen *p, int start_offset, + struct disasm_info *disasm); +void brw_uncompact_instruction(const struct brw_isa_info *isa, + brw_inst *dst, brw_compact_inst *src); +bool brw_try_compact_instruction(const struct brw_isa_info *isa, + brw_compact_inst *dst, const brw_inst *src); + +void brw_debug_compact_uncompact(const struct brw_isa_info *isa, + brw_inst *orig, brw_inst *uncompacted); + +/* brw_eu_validate.c */ +bool brw_validate_instruction(const struct brw_isa_info *isa, + const brw_inst *inst, int offset, + unsigned inst_size, + struct disasm_info *disasm); +bool brw_validate_instructions(const struct brw_isa_info *isa, + const void *assembly, int start_offset, int end_offset, + struct disasm_info *disasm); + +static inline int +next_offset(const struct intel_device_info *devinfo, void *store, int offset) +{ + brw_inst *insn = (brw_inst *)((char *)store + offset); + + if (brw_inst_cmpt_control(devinfo, insn)) + return offset + 8; + else + return offset + 16; +} + +/** Maximum SEND message length */ +#define BRW_MAX_MSG_LENGTH 15 + +/** First MRF register used by pull loads */ +#define FIRST_SPILL_MRF(gen) ((gen) == 6 ? 21 : 13) + +/** First MRF register used by spills */ +#define FIRST_PULL_LOAD_MRF(gen) ((gen) == 6 ? 
16 : 13) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/intel/compiler/elk/brw_eu_compact.c b/src/intel/compiler/elk/brw_eu_compact.c new file mode 100644 index 00000000000..356650ffd20 --- /dev/null +++ b/src/intel/compiler/elk/brw_eu_compact.c @@ -0,0 +1,3081 @@ +/* + * Copyright © 2012-2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_eu_compact.c + * + * Instruction compaction is a feature of G45 and newer hardware that allows + * for a smaller instruction encoding. + * + * The instruction cache is on the order of 32KB, and many programs generate + * far more instructions than that. The instruction cache is built to barely + * keep up with instruction dispatch ability in cache hit cases -- L1 + * instruction cache misses that still hit in the next level could limit + * throughput by around 50%. 
+ * + * The idea of instruction compaction is that most instructions use a tiny + * subset of the GPU functionality, so we can encode what would be a 16 byte + * instruction in 8 bytes using some lookup tables for various fields. + * + * + * Instruction compaction capabilities vary subtly by generation. + * + * G45's support for instruction compaction is very limited. Jump counts on + * this generation are in units of 16-byte uncompacted instructions. As such, + * all jump targets must be 16-byte aligned. Also, all instructions must be + * naturally aligned, i.e. uncompacted instructions must be 16-byte aligned. + * A G45-only instruction, NENOP, must be used to provide padding to align + * uncompacted instructions. + * + * Gfx5 removes these restrictions and changes jump counts to be in units of + * 8-byte compacted instructions, allowing jump targets to be only 8-byte + * aligned. Uncompacted instructions can also be placed on 8-byte boundaries. + * + * Gfx6 adds the ability to compact instructions with a limited range of + * immediate values. Compactable immediates have 12 unrestricted bits, and a + * 13th bit that's replicated through the high 20 bits, to create the 32-bit + * value of DW3 in the uncompacted instruction word. + * + * On Gfx7 we can compact some control flow instructions with a small positive + * immediate in the low bits of DW3, like ENDIF with the JIP field. Other + * control flow instructions with UIP cannot be compacted, because of the + * replicated 13th bit. No control flow instructions can be compacted on Gfx6 + * since the jump count field is not in DW3. + * + * break JIP/UIP + * cont JIP/UIP + * halt JIP/UIP + * if JIP/UIP + * else JIP (plus UIP on BDW+) + * endif JIP + * while JIP (must be negative) + * + * Gen 8 adds support for compacting 3-src instructions. 
+ * + * Gfx12 reduces the number of bits that available to compacted immediates from + * 13 to 12, but improves the compaction of floating-point immediates by + * allowing the high bits to be encoded (the sign, 8-bit exponent, and the + * three most significant bits of the mantissa), rather than the lowest bits of + * the mantissa. + */ + +#include "brw_eu.h" +#include "brw_disasm.h" +#include "brw_shader.h" +#include "brw_disasm_info.h" +#include "dev/intel_debug.h" + +static const uint32_t g45_control_index_table[32] = { + 0b00000000000000000, + 0b01000000000000000, + 0b00110000000000000, + 0b00000000000000010, + 0b00100000000000000, + 0b00010000000000000, + 0b01000000000100000, + 0b01000000100000000, + 0b01010000000100000, + 0b00000000100000010, + 0b11000000000000000, + 0b00001000100000010, + 0b01001000100000000, + 0b00000000100000000, + 0b11000000000100000, + 0b00001000100000000, + 0b10110000000000000, + 0b11010000000100000, + 0b00110000100000000, + 0b00100000100000000, + 0b01000000000001000, + 0b01000000000000100, + 0b00111100000000000, + 0b00101011000000000, + 0b00110000000010000, + 0b00010000100000000, + 0b01000000000100100, + 0b01000000000101000, + 0b00110000000000110, + 0b00000000000001010, + 0b01010000000101000, + 0b01010000000100100, +}; + +static const uint32_t g45_datatype_table[32] = { + 0b001000000000100001, + 0b001011010110101101, + 0b001000001000110001, + 0b001111011110111101, + 0b001011010110101100, + 0b001000000110101101, + 0b001000000000100000, + 0b010100010110110001, + 0b001100011000101101, + 0b001000000000100010, + 0b001000001000110110, + 0b010000001000110001, + 0b001000001000110010, + 0b011000001000110010, + 0b001111011110111100, + 0b001000000100101000, + 0b010100011000110001, + 0b001010010100101001, + 0b001000001000101001, + 0b010000001000110110, + 0b101000001000110001, + 0b001011011000101101, + 0b001000000100001001, + 0b001011011000101100, + 0b110100011000110001, + 0b001000001110111101, + 0b110000001000110001, + 0b011000000100101010, + 
0b101000001000101001, + 0b001011010110001100, + 0b001000000110100001, + 0b001010010100001000, +}; + +static const uint16_t g45_subreg_table[32] = { + 0b000000000000000, + 0b000000010000000, + 0b000001000000000, + 0b000100000000000, + 0b000000000100000, + 0b100000000000000, + 0b000000000010000, + 0b001100000000000, + 0b001010000000000, + 0b000000100000000, + 0b001000000000000, + 0b000000000001000, + 0b000000001000000, + 0b000000000000001, + 0b000010000000000, + 0b000000010100000, + 0b000000000000111, + 0b000001000100000, + 0b011000000000000, + 0b000000110000000, + 0b000000000000010, + 0b000000000000100, + 0b000000001100000, + 0b000100000000010, + 0b001110011000110, + 0b001110100001000, + 0b000110011000110, + 0b000001000011000, + 0b000110010000100, + 0b001100000000110, + 0b000000010000110, + 0b000001000110000, +}; + +static const uint16_t g45_src_index_table[32] = { + 0b000000000000, + 0b010001101000, + 0b010110001000, + 0b011010010000, + 0b001101001000, + 0b010110001010, + 0b010101110000, + 0b011001111000, + 0b001000101000, + 0b000000101000, + 0b010001010000, + 0b111101101100, + 0b010110001100, + 0b010001101100, + 0b011010010100, + 0b010001001100, + 0b001100101000, + 0b000000000010, + 0b111101001100, + 0b011001101000, + 0b010101001000, + 0b000000000100, + 0b000000101100, + 0b010001101010, + 0b000000111000, + 0b010101011000, + 0b000100100000, + 0b010110000000, + 0b010000000100, + 0b010000111000, + 0b000101100000, + 0b111101110100, +}; + +static const uint32_t gfx6_control_index_table[32] = { + 0b00000000000000000, + 0b01000000000000000, + 0b00110000000000000, + 0b00000000100000000, + 0b00010000000000000, + 0b00001000100000000, + 0b00000000100000010, + 0b00000000000000010, + 0b01000000100000000, + 0b01010000000000000, + 0b10110000000000000, + 0b00100000000000000, + 0b11010000000000000, + 0b11000000000000000, + 0b01001000100000000, + 0b01000000000001000, + 0b01000000000000100, + 0b00000000000001000, + 0b00000000000000100, + 0b00111000100000000, + 0b00001000100000010, + 
0b00110000100000000, + 0b00110000000000001, + 0b00100000000000001, + 0b00110000000000010, + 0b00110000000000101, + 0b00110000000001001, + 0b00110000000010000, + 0b00110000000000011, + 0b00110000000000100, + 0b00110000100001000, + 0b00100000000001001, +}; + +static const uint32_t gfx6_datatype_table[32] = { + 0b001001110000000000, + 0b001000110000100000, + 0b001001110000000001, + 0b001000000001100000, + 0b001010110100101001, + 0b001000000110101101, + 0b001100011000101100, + 0b001011110110101101, + 0b001000000111101100, + 0b001000000001100001, + 0b001000110010100101, + 0b001000000001000001, + 0b001000001000110001, + 0b001000001000101001, + 0b001000000000100000, + 0b001000001000110010, + 0b001010010100101001, + 0b001011010010100101, + 0b001000000110100101, + 0b001100011000101001, + 0b001011011000101100, + 0b001011010110100101, + 0b001011110110100101, + 0b001111011110111101, + 0b001111011110111100, + 0b001111011110111101, + 0b001111011110011101, + 0b001111011110111110, + 0b001000000000100001, + 0b001000000000100010, + 0b001001111111011101, + 0b001000001110111110, +}; + +static const uint16_t gfx6_subreg_table[32] = { + 0b000000000000000, + 0b000000000000100, + 0b000000110000000, + 0b111000000000000, + 0b011110000001000, + 0b000010000000000, + 0b000000000010000, + 0b000110000001100, + 0b001000000000000, + 0b000001000000000, + 0b000001010010100, + 0b000000001010110, + 0b010000000000000, + 0b110000000000000, + 0b000100000000000, + 0b000000010000000, + 0b000000000001000, + 0b100000000000000, + 0b000001010000000, + 0b001010000000000, + 0b001100000000000, + 0b000000001010100, + 0b101101010010100, + 0b010100000000000, + 0b000000010001111, + 0b011000000000000, + 0b111110000000000, + 0b101000000000000, + 0b000000000001111, + 0b000100010001111, + 0b001000010001111, + 0b000110000000000, +}; + +static const uint16_t gfx6_src_index_table[32] = { + 0b000000000000, + 0b010110001000, + 0b010001101000, + 0b001000101000, + 0b011010010000, + 0b000100100000, + 0b010001101100, + 
0b010101110000, + 0b011001111000, + 0b001100101000, + 0b010110001100, + 0b001000100000, + 0b010110001010, + 0b000000000010, + 0b010101010000, + 0b010101101000, + 0b111101001100, + 0b111100101100, + 0b011001110000, + 0b010110001001, + 0b010101011000, + 0b001101001000, + 0b010000101100, + 0b010000000000, + 0b001101110000, + 0b001100010000, + 0b001100000000, + 0b010001101010, + 0b001101111000, + 0b000001110000, + 0b001100100000, + 0b001101010000, +}; + +static const uint32_t gfx7_control_index_table[32] = { + 0b0000000000000000010, + 0b0000100000000000000, + 0b0000100000000000001, + 0b0000100000000000010, + 0b0000100000000000011, + 0b0000100000000000100, + 0b0000100000000000101, + 0b0000100000000000111, + 0b0000100000000001000, + 0b0000100000000001001, + 0b0000100000000001101, + 0b0000110000000000000, + 0b0000110000000000001, + 0b0000110000000000010, + 0b0000110000000000011, + 0b0000110000000000100, + 0b0000110000000000101, + 0b0000110000000000111, + 0b0000110000000001001, + 0b0000110000000001101, + 0b0000110000000010000, + 0b0000110000100000000, + 0b0001000000000000000, + 0b0001000000000000010, + 0b0001000000000000100, + 0b0001000000100000000, + 0b0010110000000000000, + 0b0010110000000010000, + 0b0011000000000000000, + 0b0011000000100000000, + 0b0101000000000000000, + 0b0101000000100000000, +}; + +static const uint32_t gfx7_datatype_table[32] = { + 0b001000000000000001, + 0b001000000000100000, + 0b001000000000100001, + 0b001000000001100001, + 0b001000000010111101, + 0b001000001011111101, + 0b001000001110100001, + 0b001000001110100101, + 0b001000001110111101, + 0b001000010000100001, + 0b001000110000100000, + 0b001000110000100001, + 0b001001010010100101, + 0b001001110010100100, + 0b001001110010100101, + 0b001111001110111101, + 0b001111011110011101, + 0b001111011110111100, + 0b001111011110111101, + 0b001111111110111100, + 0b000000001000001100, + 0b001000000000111101, + 0b001000000010100101, + 0b001000010000100000, + 0b001001010010100100, + 0b001001110010000100, + 
0b001010010100001001, + 0b001101111110111101, + 0b001111111110111101, + 0b001011110110101100, + 0b001010010100101000, + 0b001010110100101000, +}; + +static const uint16_t gfx7_subreg_table[32] = { + 0b000000000000000, + 0b000000000000001, + 0b000000000001000, + 0b000000000001111, + 0b000000000010000, + 0b000000010000000, + 0b000000100000000, + 0b000000110000000, + 0b000001000000000, + 0b000001000010000, + 0b000010100000000, + 0b001000000000000, + 0b001000000000001, + 0b001000010000001, + 0b001000010000010, + 0b001000010000011, + 0b001000010000100, + 0b001000010000111, + 0b001000010001000, + 0b001000010001110, + 0b001000010001111, + 0b001000110000000, + 0b001000111101000, + 0b010000000000000, + 0b010000110000000, + 0b011000000000000, + 0b011110010000111, + 0b100000000000000, + 0b101000000000000, + 0b110000000000000, + 0b111000000000000, + 0b111000000011100, +}; + +static const uint16_t gfx7_src_index_table[32] = { + 0b000000000000, + 0b000000000010, + 0b000000010000, + 0b000000010010, + 0b000000011000, + 0b000000100000, + 0b000000101000, + 0b000001001000, + 0b000001010000, + 0b000001110000, + 0b000001111000, + 0b001100000000, + 0b001100000010, + 0b001100001000, + 0b001100010000, + 0b001100010010, + 0b001100100000, + 0b001100101000, + 0b001100111000, + 0b001101000000, + 0b001101000010, + 0b001101001000, + 0b001101010000, + 0b001101100000, + 0b001101101000, + 0b001101110000, + 0b001101110001, + 0b001101111000, + 0b010001101000, + 0b010001101001, + 0b010001101010, + 0b010110001000, +}; + +static const uint32_t gfx8_control_index_table[32] = { + 0b0000000000000000010, + 0b0000100000000000000, + 0b0000100000000000001, + 0b0000100000000000010, + 0b0000100000000000011, + 0b0000100000000000100, + 0b0000100000000000101, + 0b0000100000000000111, + 0b0000100000000001000, + 0b0000100000000001001, + 0b0000100000000001101, + 0b0000110000000000000, + 0b0000110000000000001, + 0b0000110000000000010, + 0b0000110000000000011, + 0b0000110000000000100, + 0b0000110000000000101, + 
0b0000110000000000111, + 0b0000110000000001001, + 0b0000110000000001101, + 0b0000110000000010000, + 0b0000110000100000000, + 0b0001000000000000000, + 0b0001000000000000010, + 0b0001000000000000100, + 0b0001000000100000000, + 0b0010110000000000000, + 0b0010110000000010000, + 0b0011000000000000000, + 0b0011000000100000000, + 0b0101000000000000000, + 0b0101000000100000000, +}; + +static const uint32_t gfx8_datatype_table[32] = { + 0b001000000000000000001, + 0b001000000000001000000, + 0b001000000000001000001, + 0b001000000000011000001, + 0b001000000000101011101, + 0b001000000010111011101, + 0b001000000011101000001, + 0b001000000011101000101, + 0b001000000011101011101, + 0b001000001000001000001, + 0b001000011000001000000, + 0b001000011000001000001, + 0b001000101000101000101, + 0b001000111000101000100, + 0b001000111000101000101, + 0b001011100011101011101, + 0b001011101011100011101, + 0b001011101011101011100, + 0b001011101011101011101, + 0b001011111011101011100, + 0b000000000010000001100, + 0b001000000000001011101, + 0b001000000000101000101, + 0b001000001000001000000, + 0b001000101000101000100, + 0b001000111000100000100, + 0b001001001001000001001, + 0b001010111011101011101, + 0b001011111011101011101, + 0b001001111001101001100, + 0b001001001001001001000, + 0b001001011001001001000, +}; + +static const uint16_t gfx8_subreg_table[32] = { + 0b000000000000000, + 0b000000000000001, + 0b000000000001000, + 0b000000000001111, + 0b000000000010000, + 0b000000010000000, + 0b000000100000000, + 0b000000110000000, + 0b000001000000000, + 0b000001000010000, + 0b000001010000000, + 0b001000000000000, + 0b001000000000001, + 0b001000010000001, + 0b001000010000010, + 0b001000010000011, + 0b001000010000100, + 0b001000010000111, + 0b001000010001000, + 0b001000010001110, + 0b001000010001111, + 0b001000110000000, + 0b001000111101000, + 0b010000000000000, + 0b010000110000000, + 0b011000000000000, + 0b011110010000111, + 0b100000000000000, + 0b101000000000000, + 0b110000000000000, + 0b111000000000000, 
+ 0b111000000011100, +}; + +static const uint16_t gfx8_src_index_table[32] = { + 0b000000000000, + 0b000000000010, + 0b000000010000, + 0b000000010010, + 0b000000011000, + 0b000000100000, + 0b000000101000, + 0b000001001000, + 0b000001010000, + 0b000001110000, + 0b000001111000, + 0b001100000000, + 0b001100000010, + 0b001100001000, + 0b001100010000, + 0b001100010010, + 0b001100100000, + 0b001100101000, + 0b001100111000, + 0b001101000000, + 0b001101000010, + 0b001101001000, + 0b001101010000, + 0b001101100000, + 0b001101101000, + 0b001101110000, + 0b001101110001, + 0b001101111000, + 0b010001101000, + 0b010001101001, + 0b010001101010, + 0b010110001000, +}; + +static const uint32_t gfx11_datatype_table[32] = { + 0b001000000000000000001, + 0b001000000000001000000, + 0b001000000000001000001, + 0b001000000000011000001, + 0b001000000000101100101, + 0b001000000101111100101, + 0b001000000100101000001, + 0b001000000100101000101, + 0b001000000100101100101, + 0b001000001000001000001, + 0b001000011000001000000, + 0b001000011000001000001, + 0b001000101000101000101, + 0b001000111000101000100, + 0b001000111000101000101, + 0b001100100100101100101, + 0b001100101100100100101, + 0b001100101100101100100, + 0b001100101100101100101, + 0b001100111100101100100, + 0b000000000010000001100, + 0b001000000000001100101, + 0b001000000000101000101, + 0b001000001000001000000, + 0b001000101000101000100, + 0b001000111000100000100, + 0b001001001001000001001, + 0b001101111100101100101, + 0b001100111100101100101, + 0b001001111001101001100, + 0b001001001001001001000, + 0b001001011001001001000, +}; + +static const uint32_t gfx12_control_index_table[32] = { + 0b000000000000000000100, /* (16|M0) */ + 0b000000000000000000011, /* (8|M0) */ + 0b000000010000000000000, /* (W) (1|M0) */ + 0b000000010000000000100, /* (W) (16|M0) */ + 0b000000010000000000011, /* (W) (8|M0) */ + 0b010000000000000000100, /* (16|M0) (ge)f0.0 */ + 0b000000000000000100100, /* (16|M16) */ + 0b010100000000000000100, /* (16|M0) (lt)f0.0 */ + 
0b000000000000000000000, /* (1|M0) */ + 0b000010000000000000100, /* (16|M0) (sat) */ + 0b000000000000000010011, /* (8|M8) */ + 0b001100000000000000100, /* (16|M0) (gt)f0.0 */ + 0b000100000000000000100, /* (16|M0) (eq)f0.0 */ + 0b000100010000000000100, /* (W) (16|M0) (eq)f0.0 */ + 0b001000000000000000100, /* (16|M0) (ne)f0.0 */ + 0b000000000000100000100, /* (f0.0) (16|M0) */ + 0b010100000000000000011, /* (8|M0) (lt)f0.0 */ + 0b000000000000110000100, /* (f1.0) (16|M0) */ + 0b000000010000000000001, /* (W) (2|M0) */ + 0b000000000000101000100, /* (f0.1) (16|M0) */ + 0b000000000000111000100, /* (f1.1) (16|M0) */ + 0b010000010000000000100, /* (W) (16|M0) (ge)f0.0 */ + 0b000000000000000100011, /* (8|M16) */ + 0b000000000000000110011, /* (8|M24) */ + 0b010100010000000000100, /* (W) (16|M0) (lt)f0.0 */ + 0b010000000000000000011, /* (8|M0) (ge)f0.0 */ + 0b000100010000000000000, /* (W) (1|M0) (eq)f0.0 */ + 0b000010000000000000011, /* (8|M0) (sat) */ + 0b010100000000010000100, /* (16|M0) (lt)f1.0 */ + 0b000100000000000000011, /* (8|M0) (eq)f0.0 */ + 0b000001000000000000011, /* (8|M0) {AccWrEn} */ + 0b000000010000000100100, /* (W) (16|M16) */ +}; + +static const uint32_t gfx12_datatype_table[32] = { + 0b11010110100101010100, /* grf<1>:f grf:f grf:f */ + 0b00000110100101010100, /* grf<1>:f grf:f arf:ub */ + 0b00000010101101010100, /* grf<1>:f imm:f arf:ub */ + 0b01010110110101010100, /* grf<1>:f grf:f imm:f */ + 0b11010100100101010100, /* arf<1>:f grf:f grf:f */ + 0b11010010100101010100, /* grf<1>:f arf:f grf:f */ + 0b01010100110101010100, /* arf<1>:f grf:f imm:f */ + 0b00000000100000000000, /* arf<1>:ub arf:ub arf:ub */ + 0b11010000100101010100, /* arf<1>:f arf:f grf:f */ + 0b00101110110011001100, /* grf<1>:d grf:d imm:w */ + 0b10110110100011001100, /* grf<1>:d grf:d grf:d */ + 0b01010010110101010100, /* grf<1>:f arf:f imm:f */ + 0b10010110100001000100, /* grf<1>:ud grf:ud grf:ud */ + 0b01010000110101010100, /* arf<1>:f arf:f imm:f */ + 0b00110110110011001100, /* grf<1>:d grf:d 
imm:d */ + 0b00010110110001000100, /* grf<1>:ud grf:ud imm:ud */ + 0b00000111000101010100, /* grf<2>:f grf:f arf:ub */ + 0b00101100110011001100, /* arf<1>:d grf:d imm:w */ + 0b00000000100000100010, /* arf<1>:uw arf:uw arf:ub */ + 0b00000010100001000100, /* grf<1>:ud arf:ud arf:ub */ + 0b00100110110000101010, /* grf<1>:w grf:uw imm:uv */ + 0b00001110110000100010, /* grf<1>:uw grf:uw imm:uw */ + 0b10010111000001000100, /* grf<2>:ud grf:ud grf:ud */ + 0b00000110100101001100, /* grf<1>:d grf:f arf:ub */ + 0b10001100100011001100, /* arf<1>:d grf:d grf:uw */ + 0b00000110100001010100, /* grf<1>:f grf:ud arf:ub */ + 0b00101110110001001100, /* grf<1>:d grf:ud imm:w */ + 0b00000010100000100010, /* grf<1>:uw arf:uw arf:ub */ + 0b00000110100000110100, /* grf<1>:f grf:uw arf:ub */ + 0b00000110100000010100, /* grf<1>:f grf:ub arf:ub */ + 0b00000110100011010100, /* grf<1>:f grf:d arf:ub */ + 0b00000010100101010100, /* grf<1>:f arf:f arf:ub */ +}; + +static const uint16_t gfx12_subreg_table[32] = { + 0b000000000000000, /* .0 .0 .0 */ + 0b100000000000000, /* .0 .0 .16 */ + 0b001000000000000, /* .0 .0 .4 */ + 0b011000000000000, /* .0 .0 .12 */ + 0b000000010000000, /* .0 .4 .0 */ + 0b010000000000000, /* .0 .0 .8 */ + 0b101000000000000, /* .0 .0 .20 */ + 0b000000000001000, /* .8 .0 .0 */ + 0b000000100000000, /* .0 .8 .0 */ + 0b110000000000000, /* .0 .0 .24 */ + 0b111000000000000, /* .0 .0 .28 */ + 0b000001000000000, /* .0 .16 .0 */ + 0b000000000000100, /* .4 .0 .0 */ + 0b000001100000000, /* .0 .24 .0 */ + 0b000001010000000, /* .0 .20 .0 */ + 0b000000110000000, /* .0 .12 .0 */ + 0b000001110000000, /* .0 .28 .0 */ + 0b000000000011100, /* .28 .0 .0 */ + 0b000000000010000, /* .16 .0 .0 */ + 0b000000000001100, /* .12 .0 .0 */ + 0b000000000011000, /* .24 .0 .0 */ + 0b000000000010100, /* .20 .0 .0 */ + 0b000000000000010, /* .2 .0 .0 */ + 0b000000101000000, /* .0 .10 .0 */ + 0b000000001000000, /* .0 .2 .0 */ + 0b000000010000100, /* .4 .4 .0 */ + 0b000000001011100, /* .28 .2 .0 */ + 
0b000000001000010, /* .2 .2 .0 */ + 0b000000110001100, /* .12 .12 .0 */ + 0b000000000100000, /* .0 .1 .0 */ + 0b000000001100000, /* .0 .3 .0 */ + 0b110001100000000, /* .0 .24 .24 */ +}; + +static const uint16_t gfx12_src0_index_table[16] = { + 0b010001100100, /* r<8;8,1> */ + 0b000000000000, /* r<0;1,0> */ + 0b010001100110, /* -r<8;8,1> */ + 0b010001100101, /* (abs)r<8;8,1> */ + 0b000000000010, /* -r<0;1,0> */ + 0b001000000000, /* r<2;1,0> */ + 0b001001000000, /* r<2;4,0> */ + 0b001101000000, /* r<4;4,0> */ + 0b001000100100, /* r<2;2,1> */ + 0b001100000000, /* r<4;1,0> */ + 0b001000100110, /* -r<2;2,1> */ + 0b001101000100, /* r<4;4,1> */ + 0b010001100111, /* -(abs)r<8;8,1> */ + 0b000100000000, /* r<1;1,0> */ + 0b000000000001, /* (abs)r<0;1,0> */ + 0b111100010000, /* r[a]<1,0> */ +}; + +static const uint16_t gfx12_src1_index_table[16] = { + 0b000100011001, /* r<8;8,1> */ + 0b000000000000, /* r<0;1,0> */ + 0b100100011001, /* -r<8;8,1> */ + 0b100000000000, /* -r<0;1,0> */ + 0b010100011001, /* (abs)r<8;8,1> */ + 0b100011010000, /* -r<4;4,0> */ + 0b000010000000, /* r<2;1,0> */ + 0b000010001001, /* r<2;2,1> */ + 0b100010001001, /* -r<2;2,1> */ + 0b000011010000, /* r<4;4,0> */ + 0b000011010001, /* r<4;4,1> */ + 0b000011000000, /* r<4;1,0> */ + 0b110100011001, /* -(abs)r<8;8,1> */ + 0b010000000000, /* (abs)r<0;1,0> */ + 0b110000000000, /* -(abs)r<0;1,0> */ + 0b100011010001, /* -r<4;4,1> */ +}; + +static const uint16_t xehp_src0_index_table[16] = { + 0b000100000000, /* r<1;1,0> */ + 0b000000000000, /* r<0;1,0> */ + 0b000100000010, /* -r<1;1,0> */ + 0b000100000001, /* (abs)r<1;1,0> */ + 0b000000000010, /* -r<0;1,0> */ + 0b001000000000, /* r<2;1,0> */ + 0b001001000000, /* r<2;4,0> */ + 0b001101000000, /* r<4;4,0> */ + 0b001100000000, /* r<4;1,0> */ + 0b000100000011, /* -(abs)r<1;1,0> */ + 0b000000000001, /* (abs)r<0;1,0> */ + 0b111100010000, /* r[a]<1,0> */ + 0b010001100000, /* r<8;8,0> */ + 0b000101000000, /* r<1;4,0> */ + 0b010001001000, /* r<8;4,2> */ + 0b001000000010, /* 
-r<2;1,0> */ +}; + +static const uint16_t xehp_src1_index_table[16] = { + 0b000001000000, /* r<1;1,0> */ + 0b000000000000, /* r<0;1,0> */ + 0b100001000000, /* -r<1;1,0> */ + 0b100000000000, /* -r<0;1,0> */ + 0b010001000000, /* (abs)r<1;1,0> */ + 0b100011010000, /* -r<4;4,0> */ + 0b000010000000, /* r<2;1,0> */ + 0b000011010000, /* r<4;4,0> */ + 0b000011000000, /* r<4;1,0> */ + 0b110001000000, /* -(abs)r<1;1,0> */ + 0b010000000000, /* (abs)r<0;1,0> */ + 0b110000000000, /* -(abs)r<0;1,0> */ + 0b000100011000, /* r<8;8,0> */ + 0b100010000000, /* -r<2;1,0> */ + 0b100000001001, /* -r<0;2,1> */ + 0b100001000100, /* -r[a]<1;1,0> */ +}; + +static const uint32_t xe2_control_index_table[32] = { + 0b000000000000000100, /* (16|M0) */ + 0b000000100000000000, /* (W) (1|M0) */ + 0b000000000010000100, /* (16|M16) */ + 0b000000000000000000, /* (1|M0) */ + 0b000000100000000100, /* (W) (16|M0) */ + 0b010000000000000100, /* (16|M0) (.ge)f0.0 */ + 0b010100000000000100, /* (16|M0) (.lt)f0.0 */ + 0b000000100000000010, /* (W) (4|M0) */ + 0b000000000000000101, /* (32|M0) */ + 0b000000100000000011, /* (W) (8|M0) */ + 0b001100100000000000, /* (W) (1|M0) (.gt)f0.0 */ + 0b000010000000000100, /* (16|M0) (sat) */ + 0b000100000000000100, /* (16|M0) (.eq)f0.0 */ + 0b000000100000000001, /* (W) (2|M0) */ + 0b001100000000000100, /* (16|M0) (.gt)f0.0 */ + 0b000100100000000000, /* (W) (1|M0) (.eq)f0.0 */ + 0b010100100000000010, /* (W) (4|M0) (.lt)f0.0 */ + 0b010000100000000000, /* (W) (1|M0) (.ge)f0.0 */ + 0b010000100000000010, /* (W) (4|M0) (.ge)f0.0 */ + 0b010100100000000000, /* (W) (1|M0) (.lt)f0.0 */ + 0b001000000000000100, /* (16|M0) (.ne)f0.0 */ + 0b000000000100100100, /* (f2.0) (16|M0) */ + 0b010100100000000011, /* (W) (8|M0) (.lt)f0.0 */ + 0b000000000100011100, /* (f1.1) (16|M0) */ + 0b010000100000000011, /* (W) (8|M0) (.ge)f0.0 */ + 0b000000000100001100, /* (f0.1) (16|M0) */ + 0b000000000100010100, /* (f1.0) (16|M0) */ + 0b000000000100110100, /* (f3.0) (16|M0) */ + 0b000000000100111100, /* 
(f3.1) (16|M0) */ + 0b000000000100101100, /* (f2.1) (16|M0) */ + 0b000000000100000100, /* (f0.0) (16|M0) */ + 0b010100000000100100, /* (16|M0) (.lt)f2.0 */ +}; + +static const uint32_t xe2_datatype_table[32] = { + 0b11010110100101010100, /* grf<1>:f grf:f grf:f */ + 0b11010100100101010100, /* arf<1>:f grf:f grf:f */ + 0b00000110100101010100, /* grf<1>:f grf:f arf:ub */ + 0b00000110100001000100, /* grf<1>:ud grf:ud arf:ub */ + 0b01010110110101010100, /* grf<1>:f grf:f imm:f */ + 0b11010010100101010100, /* grf<1>:f arf:f grf:f */ + 0b10111110100011101110, /* grf<1>:q grf:q grf:q */ + 0b00000000100000000000, /* arf<1>:ub arf:ub arf:ub */ + 0b01010110100101010100, /* grf<1>:f grf:f arf:f */ + 0b00000010101001000100, /* grf<1>:ud imm:ud */ + 0b00101110110011001100, /* grf<1>:d grf:d imm:w */ + 0b11010000100101010100, /* arf<1>:f arf:f grf:f */ + 0b01010100100101010100, /* arf<1>:f grf:f arf:f */ + 0b01010100110101010100, /* arf<1>:f grf:f imm:f */ + 0b00000010101101010100, /* grf<1>:f imm:f */ + 0b00000110100011001100, /* grf<1>:d grf:d arf:ub */ + 0b00101110110011101110, /* grf<1>:q grf:q imm:w */ + 0b00000110100001100110, /* grf<1>:uq grf:uq arf:ub */ + 0b01010000100101010100, /* arf<1>:f arf:f arf:f */ + 0b10110110100011001100, /* grf<1>:d grf:d grf:d */ + 0b01010010100101010100, /* grf<1>:f arf:f arf:f */ + 0b00000111000001000100, /* grf<2>:ud grf:ud arf:ub */ + 0b00110110110011001110, /* grf<1>:q grf:d imm:d */ + 0b00101100110011001100, /* arf<1>:d grf:d imm:w */ + 0b11011110100101110110, /* grf<1>:df grf:df grf:df */ + 0b01010010110101010100, /* grf<1>:f arf:f imm:f */ + 0b10010110100001000100, /* grf<1>:ud grf:ud grf:ud */ + 0b00000010100001000100, /* grf<1>:ud arf:ud arf:ub */ + 0b00001110110001000100, /* grf<1>:ud grf:ud imm:uw */ + 0b00000010101010101100, /* grf<1>:d imm:w */ + 0b01010000110101010100, /* arf<1>:f arf:f imm:f */ + 0b00000100100001000100, /* arf<1>:ud grf:ud arf:ub */ +}; + +static const uint16_t xe2_subreg_table[16] = { + 0b000000000000, /* .0 
.0 */ + 0b000010000000, /* .0 .4 */ + 0b000000000100, /* .4 .0 */ + 0b010000000000, /* .0 .32 */ + 0b001000000000, /* .0 .16 */ + 0b000000001000, /* .8 .0 */ + 0b000100000000, /* .0 .8 */ + 0b010100000000, /* .0 .40 */ + 0b011000000000, /* .0 .48 */ + 0b000110000000, /* .0 .12 */ + 0b000000010000, /* .16 .0 */ + 0b011010000000, /* .0 .52 */ + 0b001100000000, /* .0 .24 */ + 0b011100000000, /* .0 .56 */ + 0b010110000000, /* .0 .44 */ + 0b010010000000, /* .0 .36 */ +}; + +static const uint16_t xe2_src0_index_table[8] = { + 0b00100000000, /* r<1;1,0> */ + 0b00000000000, /* r<0;1,0> */ + 0b01000000000, /* r<2;1,0> */ + 0b00100000010, /* -r<1;1,0> */ + 0b01100000000, /* r<4;1,0> */ + 0b00100000001, /* (abs)r<1;1,0> */ + 0b00000000010, /* -r<0;1,0> */ + 0b01001000000, /* r<2;4,0> */ +}; + +static const uint16_t xe2_src1_index_table[16] = { + 0b0000100000000000, /* r<1;1,0>.0 */ + 0b0000000000000000, /* r<0;1,0>.0 */ + 0b1000100000000000, /* -r<1;1,0>.0 */ + 0b0000000000010000, /* r<0;1,0>.8 */ + 0b0000000000001000, /* r<0;1,0>.4 */ + 0b0000000000011000, /* r<0;1,0>.12 */ + 0b0000000001010000, /* r<0;1,0>.40 */ + 0b0000000001000000, /* r<0;1,0>.32 */ + 0b0000000000100000, /* r<0;1,0>.16 */ + 0b0000000001111000, /* r<0;1,0>.60 */ + 0b0000000000111000, /* r<0;1,0>.28 */ + 0b0000000000101000, /* r<0;1,0>.20 */ + 0b0000000001011000, /* r<0;1,0>.44 */ + 0b0000000001001000, /* r<0;1,0>.36 */ + 0b0000000001110000, /* r<0;1,0>.56 */ + 0b0000000000110000, /* r<0;1,0>.24 */ +}; + +/* This is actually the control index table for Cherryview (26 bits), but the + * only difference from Broadwell (24 bits) is that it has two extra 0-bits at + * the start. + * + * The low 24 bits have the same mappings on both hardware. 
+ */ +static const uint32_t gfx8_3src_control_index_table[4] = { + 0b00100000000110000000000001, + 0b00000000000110000000000001, + 0b00000000001000000000000001, + 0b00000000001000000000100001, +}; + +/* This is actually the control index table for Cherryview (49 bits), but the + * only difference from Broadwell (46 bits) is that it has three extra 0-bits + * at the start. + * + * The low 44 bits have the same mappings on both hardware, and since the high + * three bits on Broadwell are zero, we can reuse Cherryview's table. + */ +static const uint64_t gfx8_3src_source_index_table[4] = { + 0b0000001110010011100100111001000001111000000000000, + 0b0000001110010011100100111001000001111000000000010, + 0b0000001110010011100100111001000001111000000001000, + 0b0000001110010011100100111001000001111000000100000, +}; + +static const uint64_t gfx12_3src_control_index_table[32] = { + 0b000001001010010101000000000000000100, /* (16|M0) grf<1>:f :f :f :f */ + 0b000001001010010101000000000000000011, /* (8|M0) grf<1>:f :f :f :f */ + 0b000001001000010101000000000000000011, /* (8|M0) arf<1>:f :f :f :f */ + 0b000001001010010101000010000000000011, /* (W) (8|M0) grf<1>:f :f :f :f */ + 0b000001001000010101000010000000000011, /* (W) (8|M0) arf<1>:f :f :f :f */ + 0b000001001000010101000000000000010011, /* (8|M8) arf<1>:f :f :f :f */ + 0b000001001010010101000000000000010011, /* (8|M8) grf<1>:f :f :f :f */ + 0b000001001000010101000010000000010011, /* (W) (8|M8) arf<1>:f :f :f :f */ + 0b000001001010010101000010000000010011, /* (W) (8|M8) grf<1>:f :f :f :f */ + 0b000001001010010101000010000000000100, /* (W) (16|M0) grf<1>:f :f :f :f */ + 0b000001001000010101000000000000000100, /* (16|M0) arf<1>:f :f :f :f */ + 0b000001001010010101010000000000000100, /* (16|M0) (sat)grf<1>:f :f :f :f */ + 0b000001001010010101000000000000100100, /* (16|M16) grf<1>:f :f :f :f */ + 0b000001001000010101000010000000000100, /* (W) (16|M0) arf<1>:f :f :f :f */ + 0b000001001010010101000010000000000000, /* (W) (1|M0) 
grf<1>:f :f :f :f */ + 0b000001001010010101010000000000000011, /* (8|M0) (sat)grf<1>:f :f :f :f */ + 0b000001001000010101000010000000110011, /* (W) (8|M24) arf<1>:f :f :f :f */ + 0b000001001000010101000010000000100011, /* (W) (8|M16) arf<1>:f :f :f :f */ + 0b000001001010010101000010000000110011, /* (W) (8|M24) grf<1>:f :f :f :f */ + 0b000001001010010101000010000000100011, /* (W) (8|M16) grf<1>:f :f :f :f */ + 0b000001001000010101000000000000100011, /* (8|M16) arf<1>:f :f :f :f */ + 0b000001001000010101000000000000110011, /* (8|M24) arf<1>:f :f :f :f */ + 0b000001001010010101000000000000100011, /* (8|M16) grf<1>:f :f :f :f */ + 0b000001001010010101000000000000110011, /* (8|M24) grf<1>:f :f :f :f */ + 0b000001001000010101010000000000000100, /* (16|M0) (sat)arf<1>:f :f :f :f */ + 0b000001001010010101010010000000000100, /* (W) (16|M0) (sat)grf<1>:f :f :f :f */ + 0b000001001010010101000010000000100100, /* (W) (16|M16) grf<1>:f :f :f :f */ + 0b000001001010010001000010000000000000, /* (W) (1|M0) grf<1>:ud :ud :ud :ud */ + 0b000001001000010101000000000000100100, /* (16|M16) arf<1>:f :f :f :f */ + 0b000001001010010101010000000000100100, /* (16|M16) (sat)grf<1>:f :f :f :f */ + 0b000001001010010101000010000000000010, /* (W) (4|M0) grf<1>:f :f :f :f */ + 0b000001001000010101010000000000000011, /* (8|M0) (sat)arf<1>:f :f :f :f */ +}; + +static const uint64_t xehp_3src_control_index_table[32] = { + 0b0000010010100010101000000000000000100, /* (16|M0) grf<1>:f :f :f :f */ + 0b0000010010100010101000000000000000011, /* (8|M0) grf<1>:f :f :f :f */ + 0b0000010010000010101000000000000000011, /* (8|M0) arf<1>:f :f :f :f */ + 0b0000010010100010101000010000000000011, /* (W) (8|M0) grf<1>:f :f :f :f */ + 0b0000010010000010101000010000000000011, /* (W) (8|M0) arf<1>:f :f :f :f */ + 0b0000010010000010101000000000000010011, /* (8|M8) arf<1>:f :f :f :f */ + 0b0000010010100010101000000000000010011, /* (8|M8) grf<1>:f :f :f :f */ + 0b0000010010000010101000010000000010011, /* (W) (8|M8) arf<1>:f 
:f :f :f */ + 0b0000010010100010101000010000000010011, /* (W) (8|M8) grf<1>:f :f :f :f */ + 0b0000010010100010101000010000000000100, /* (W) (16|M0) grf<1>:f :f :f :f */ + 0b0000010010000010101000000000000000100, /* (16|M0) arf<1>:f :f :f :f */ + 0b0000010010100010101010000000000000100, /* (16|M0) (sat)grf<1>:f :f :f :f */ + 0b0000010010100010101000000000000100100, /* (16|M16) grf<1>:f :f :f :f */ + 0b0000010010000010101000010000000000100, /* (W) (16|M0) arf<1>:f :f :f :f */ + 0b0000010010100010101000010000000000000, /* (W) (1|M0) grf<1>:f :f :f :f */ + 0b0000010010100010101010000000000000011, /* (8|M0) (sat)grf<1>:f :f :f :f */ + 0b0000010010000010101000010000000100011, /* (W) (8|M16) arf<1>:f :f :f :f */ + 0b0000010010000010101000010000000110011, /* (W) (8|M24) arf<1>:f :f :f :f */ + 0b0000010010100010101000010000000100011, /* (W) (8|M16) grf<1>:f :f :f :f */ + 0b0000010010100010101000010000000110011, /* (W) (8|M24) grf<1>:f :f :f :f */ + 0b0000010010000010101000000000000110011, /* (8|M24) arf<1>:f :f :f :f */ + 0b0000010010000010101000000000000100011, /* (8|M16) arf<1>:f :f :f :f */ + 0b0000000100111110011000000000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :ub :b */ + 0b0000000000111110011000100000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :ub :ub {Atomic} */ + 0b0000100100111110011000100000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :b :b {Atomic} */ + 0b0000100000111110011000100000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :b :ub {Atomic} */ + 0b0000100100111110011000000000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :b :b */ + 0b0000000000111110011000000000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :ub :ub */ + 0b0000000100111110011000100000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :ub :b {Atomic} */ + 0b0000100000111110011000000000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :b :ub */ + 0b0000101101111010101000100000000000011, /* dpas.8x* (8|M0) grf<1>:f :f :bf :bf {Atomic} */ + 0b0000101101111010101000000000000000011, /* dpas.8x* (8|M0) grf<1>:f :f :bf :bf */ +}; + 
+static const uint64_t xe2_3src_control_index_table[16] = { + 0b0000010010100010101000000000000100, /* (16|M0) grf<1>:f :f :f :f */ + 0b0000010010000010101000000000000100, /* (16|M0) arf<1>:f :f :f :f */ + 0b0000010010100010101000100000000100, /* (W)(16|M0) grf<1>:f :f :f :f */ + 0b0000010010000010101000100000000100, /* (W)(16|M0) arf<1>:f :f :f :f */ + 0b0000011011100011101100000000000100, /* (16|M0) grf<1>:df :df :df :df */ + 0b0000011011100011101100000010000100, /* (16|M16) grf<1>:df :df :df :df */ + 0b0000011011000011101100000000000100, /* (16|M0) arf<1>:df :df :df :df */ + 0b0000010010100010101000000000000101, /* (32|M0) grf<1>:f :f :f :f */ + 0b0000010010000010101000000000000101, /* (32|M0) arf<1>:f :f :f :f */ + 0b0000010010000010101010000000000100, /* (16|M0) (sat)arf<1>:f :f :f :f */ + 0b0000010010100010101010000000000100, /* (16|M0) (sat)grf<1>:f :f :f :f */ + 0b0000011011000011101100000010000100, /* (16|M16) arf<1>:df :df :df :df */ + 0b0000010010100010101000100000000000, /* (W)(1|M0) grf<1>:f :f :f :f */ + 0b0000010010100010001000000000000100, /* (16|M0) grf<1>:ud :ud :ud :ud */ + 0b0000110110100110011000000000000101, /* (32|M0) grf<1>:d :d :d :d */ + 0b0000011011000011101100000000000011, /* (8|M0) arf<1>:df :df :df :df */ +}; + +static const uint64_t xe2_3src_dpas_control_index_table[16] = { + 0b0000000000111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :ub :ub Atomic */ + 0b0000000100111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :ub :b Atomic */ + 0b0000100000111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :b :ub Atomic */ + 0b0000100100111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :b :b Atomic */ + 0b0000000000111110011000000000000100, /* dpas.8x* (16|M0) grf:d :d :ub :ub */ + 0b0000100100111110011000000000000100, /* dpas.8x* (16|M0) grf:d :d :b :b */ + 0b0000101101111010101001000000000100, /* dpas.8x* (16|M0) grf:f :f :bf :bf Atomic */ + 0b0000101101111101101001000000000100, /* dpas.8x* (16|M0) grf:f :bf :bf :bf Atomic 
*/ + 0b0000101101111010110101000000000100, /* dpas.8x* (16|M0) grf:bf :f :bf :bf Atomic */ + 0b0000101101111101110101000000000100, /* dpas.8x* (16|M0) grf:bf :bf :bf :bf Atomic */ + 0b0000101101111010101000000000000100, /* dpas.8x* (16|M0) grf:f :f :bf :bf */ + 0b0000001001111010101001000000000100, /* dpas.8x* (16|M0) grf:f :f :hf :hf Atomic */ + 0b0000001001111001101001000000000100, /* dpas.8x* (16|M0) grf:f :hf :hf :hf Atomic */ + 0b0000001001111010100101000000000100, /* dpas.8x* (16|M0) grf:hf :f :hf :hf Atomic */ + 0b0000001001111001100101000000000100, /* dpas.8x* (16|M0) grf:hf :hf :hf :hf Atomic */ + 0b0000001001111010101000000000000100, /* dpas.8x* (16|M0) grf:f :f :hf :hf */ +}; + +static const uint32_t gfx12_3src_source_index_table[32] = { + 0b100101100001100000000, /* grf<0;0> grf<8;1> grf<0> */ + 0b100101100001001000010, /* arf<4;1> grf<8;1> grf<0> */ + 0b101101100001101000011, /* grf<8;1> grf<8;1> grf<1> */ + 0b100101100001101000011, /* grf<8;1> grf<8;1> grf<0> */ + 0b101100000000101000011, /* grf<8;1> grf<0;0> grf<1> */ + 0b101101100001101001011, /* -grf<8;1> grf<8;1> grf<1> */ + 0b101001100001101000011, /* grf<8;1> arf<8;1> grf<1> */ + 0b100001100001100000000, /* grf<0;0> arf<8;1> grf<0> */ + 0b101101100001100000000, /* grf<0;0> grf<8;1> grf<1> */ + 0b101101100101101000011, /* grf<8;1> grf<8;1> -grf<1> */ + 0b101101110001101000011, /* grf<8;1> -grf<8;1> grf<1> */ + 0b101100000000100000000, /* grf<0;0> grf<0;0> grf<1> */ + 0b100001100001101000011, /* grf<8;1> arf<8;1> grf<0> */ + 0b100101110001100000000, /* grf<0;0> -grf<8;1> grf<0> */ + 0b100101110001101000011, /* grf<8;1> -grf<8;1> grf<0> */ + 0b100101100001101001011, /* -grf<8;1> grf<8;1> grf<0> */ + 0b100100000000101000011, /* grf<8;1> grf<0;0> grf<0> */ + 0b100101100001100001000, /* -grf<0;0> grf<8;1> grf<0> */ + 0b100100000000100000000, /* grf<0;0> grf<0;0> grf<0> */ + 0b101101110001100000000, /* grf<0;0> -grf<8;1> grf<1> */ + 0b100101100101100000000, /* grf<0;0> grf<8;1> -grf<0> */ + 
0b101001100001100000000, /* grf<0;0> arf<8;1> grf<1> */ + 0b100101100101101000011, /* grf<8;1> grf<8;1> -grf<0> */ + 0b101101100101101001011, /* -grf<8;1> grf<8;1> -grf<1> */ + 0b101001100001101001011, /* -grf<8;1> arf<8;1> grf<1> */ + 0b101101110001101001011, /* -grf<8;1> -grf<8;1> grf<1> */ + 0b101100010000101000011, /* grf<8;1> -grf<0;0> grf<1> */ + 0b101100000100101000011, /* grf<8;1> grf<0;0> -grf<1> */ + 0b101101100001100001000, /* -grf<0;0> grf<8;1> grf<1> */ + 0b101101100101100000000, /* grf<0;0> grf<8;1> -grf<1> */ + 0b100100000100101000011, /* grf<8;1> grf<0;0> -grf<0> */ + 0b101001100101101000011, /* grf<8;1> arf<8;1> -grf<1> */ +}; + +static const uint32_t xehp_3src_source_index_table[32] = { + 0b100100000001100000000, /* grf<0;0> grf<1;0> grf<0> */ + 0b100100000001000000001, /* arf<1;0> grf<1;0> grf<0> */ + 0b101100000001100000001, /* grf<1;0> grf<1;0> grf<1> */ + 0b100100000001100000001, /* grf<1;0> grf<1;0> grf<0> */ + 0b101100000000100000001, /* grf<1;0> grf<0;0> grf<1> */ + 0b101100000001100001001, /* -grf<1;0> grf<1;0> grf<1> */ + 0b101000000001100000001, /* grf<1;0> arf<1;0> grf<1> */ + 0b101100000001100000000, /* grf<0;0> grf<1;0> grf<1> */ + 0b100000000001100000000, /* grf<0;0> arf<1;0> grf<0> */ + 0b101100000101100000001, /* grf<1;0> grf<1;0> -grf<1> */ + 0b101100010001100000001, /* grf<1;0> -grf<1;0> grf<1> */ + 0b101100000000100000000, /* grf<0;0> grf<0;0> grf<1> */ + 0b100000000001100000001, /* grf<1;0> arf<1;0> grf<0> */ + 0b100100010001100000000, /* grf<0;0> -grf<1;0> grf<0> */ + 0b100100010001100000001, /* grf<1;0> -grf<1;0> grf<0> */ + 0b100100000001100001001, /* -grf<1;0> grf<1;0> grf<0> */ + 0b100100000000100000001, /* grf<1;0> grf<0;0> grf<0> */ + 0b100100000001100001000, /* -grf<0;0> grf<1;0> grf<0> */ + 0b100100000000100000000, /* grf<0;0> grf<0;0> grf<0> + * dpas.*x1 grf:d grf:[ub,b] grf:[ub,b] + * dpas.*x1 grf:f grf:bf grf:bf + */ + 0b101100010001100000000, /* grf<0;0> -grf<1;0> grf<1> */ + 0b100100000101100000000, /* grf<0;0> 
grf<1;0> -grf<0> */ + 0b101000000001100000000, /* grf<0;0> arf<1;0> grf<1> */ + 0b100100000101100000001, /* grf<1;0> grf<1;0> -grf<0> */ + 0b101100000101100001001, /* -grf<1;0> grf<1;0> -grf<1> */ + 0b100100010000100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[ub,b] */ + 0b100100000100100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[u2,s2] */ + 0b100100010100100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[u2,s2] */ + 0b100100001000100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[ub,b] */ + 0b100100001100100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[u2,s2] */ + 0b100100000010100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[u4,s4] */ + 0b100100001010100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[u4,s4] */ + 0b100100010010100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[u4,s4] */ +}; + +static const uint32_t xe2_3src_source_index_table[16] = { + 0b101100000001100000001, /* grf<1;0> grf<1;0> grf<1> */ + 0b101100000001000000001, /* arf<1;0> grf<1;0> grf<1> */ + 0b100100000001100000000, /* grf<0;0> grf<1;0> grf<0> */ + 0b100100000001000000001, /* arf<1;0> grf<1;0> grf<0> */ + 0b100100000001100000001, /* grf<1;0> grf<1;0> grf<0> */ + 0b100000000001100000000, /* grf<0;0> arf<1;0> grf<0> */ + 0b100000000001100000001, /* grf<1;0> arf<1;0> grf<0> */ + 0b101100000101100000001, /* grf<1;0> grf<1;0> -grf<1> */ + 0b101000000001100000001, /* grf<1;0> arf<1;0> grf<1> */ + 0b101000000001000000001, /* arf<1;0> arf<1;0> grf<1> */ + 0b100000000001000000001, /* arf<1;0> arf<1;0> grf<0> */ + 0b100100000000100000000, /* grf<0;0> grf<0;0> grf<0> */ + 0b100100000000100000001, /* grf<1;0> grf<0;0> grf<0> */ + 0b101100000101000000001, /* arf<1;0> grf<1;0> -grf<1> */ + 0b100100010001100000001, /* grf<1;0> -grf<1;0> grf<0> */ + 0b100100010001000000001, /* arf<1;0> -grf<1;0> grf<0> */ +}; + +static const uint32_t xe2_3src_dpas_source_index_table[16] = { + 0b100100000000100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[ub,b] + * dpas.*x1 grf:[f,bf] grf:bf grf:bf + * dpas.*x1 grf:[f,hf] grf:hf grf:hf + */ + 
0b100100000010100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[u4,s4] */ + 0b100100000100100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[u2,s2] */ + 0b100100001000100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[ub,b] */ + 0b100100001010100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[u4,s4] */ + 0b100100001100100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[u2,s2] */ + 0b100100010000100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[ub,b] */ + 0b100100010010100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[u4,s4] */ + 0b100100010100100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[u2,s2] */ + 0b100100000000100000010, /* dpas.*x2 grf:d grf:[ub,b] grf:[ub,b] */ + 0b100100000010100000010, /* dpas.*x2 grf:d grf:[ub,b] grf:[u4,s4] */ + 0b100100001000100000010, /* dpas.*x2 grf:d grf:[u4,s4] grf:[ub,b] */ + 0b100100001010100000010, /* dpas.*x2 grf:d grf:[u4,s4] grf:[u4,s4] */ + 0b100100010100100000010, /* dpas.*x2 grf:d grf:[u2,s2] grf:[u2,s2] */ + 0b100100000000100001110, /* dpas.*x8 grf:d grf:[ub,b] grf:[ub,b] */ + 0b100100001010100001110, /* dpas.*x8 grf:d grf:[u4,s4] grf:[u4,s4] */ +}; + +static const uint32_t gfx12_3src_subreg_table[32] = { + 0b00000000000000000000, /* .0 .0 .0 .0 */ + 0b00100000000000000000, /* .0 .0 .0 .4 */ + 0b00000000000110000000, /* .0 .12 .0 .0 */ + 0b10100000000000000000, /* .0 .0 .0 .20 */ + 0b10000000001110000000, /* .0 .28 .0 .16 */ + 0b01100000000000000000, /* .0 .0 .0 .12 */ + 0b01000000000000000000, /* .0 .0 .0 .8 */ + 0b00000010000000000000, /* .0 .0 .8 .0 */ + 0b00000001000000000000, /* .0 .0 .4 .0 */ + 0b11000000000000000000, /* .0 .0 .0 .24 */ + 0b10000000000000000000, /* .0 .0 .0 .16 */ + 0b11100000000000000000, /* .0 .0 .0 .28 */ + 0b00000110000000000000, /* .0 .0 .24 .0 */ + 0b00000000000010000000, /* .0 .4 .0 .0 */ + 0b00000100000000000000, /* .0 .0 .16 .0 */ + 0b00000011000000000000, /* .0 .0 .12 .0 */ + 0b00000101000000000000, /* .0 .0 .20 .0 */ + 0b00000111000000000000, /* .0 .0 .28 .0 */ + 0b00000000000100000000, /* .0 .8 .0 .0 */ + 
0b00000000001000000000, /* .0 .16 .0 .0 */ + 0b00000000001100000000, /* .0 .24 .0 .0 */ + 0b00000000001010000000, /* .0 .20 .0 .0 */ + 0b00000000001110000000, /* .0 .28 .0 .0 */ + 0b11000000001110000000, /* .0 .28 .0 .24 */ + 0b00100000000100000000, /* .0 .8 .0 .4 */ + 0b00100000000110000000, /* .0 .12 .0 .4 */ + 0b01000000000110000000, /* .0 .12 .0 .8 */ + 0b10000000001100000000, /* .0 .24 .0 .16 */ + 0b10000000001010000000, /* .0 .20 .0 .16 */ + 0b01100000000010000000, /* .0 .4 .0 .12 */ + 0b10100000001110000000, /* .0 .28 .0 .20 */ + 0b01000000000010000000, /* .0 .4 .0 .8 */ +}; + +static const uint32_t xe2_3src_subreg_table[32] = { + 0b00000000000000000000, /* .0 .0 .0 .0 */ + 0b00100000000000000000, /* .0 .0 .0 .8 */ + 0b10000000000000000000, /* .0 .0 .0 .32 */ + 0b00010000000000000000, /* .0 .0 .0 .4 */ + 0b11100000000000000000, /* .0 .0 .0 .56 */ + 0b01010000000000000000, /* .0 .0 .0 .20 */ + 0b10110000000000000000, /* .0 .0 .0 .44 */ + 0b01000000000011000000, /* .0 .12 .0 .16 */ + 0b01100000000000000000, /* .0 .0 .0 .24 */ + 0b10100000000000000000, /* .0 .0 .0 .40 */ + 0b11000000000000000000, /* .0 .0 .0 .48 */ + 0b01000000000000000000, /* .0 .0 .0 .16 */ + 0b01110000000110000000, /* .0 .24 .0 .28 */ + 0b10100000001001000000, /* .0 .36 .0 .40 */ + 0b11010000001100000000, /* .0 .48 .0 .52 */ + 0b01110000000000000000, /* .0 .0 .0 .28 */ + 0b11110000000000000000, /* .0 .0 .0 .60 */ + 0b10010000000000000000, /* .0 .0 .0 .36 */ + 0b00110000000000000000, /* .0 .0 .0 .12 */ + 0b00100000000010000000, /* .0 .8 .0 .8 */ + 0b00010000000001000000, /* .0 .4 .0 .4 */ + 0b00110000000011000000, /* .0 .12 .0 .12 */ + 0b11010000000000000000, /* .0 .0 .0 .52 */ + 0b00000000000001000000, /* .0 .4 .0 .0 */ + 0b00000101100000000000, /* .0 .0 .44 .0 */ + 0b00000100000000000000, /* .0 .0 .32 .0 */ + 0b00000000000010000000, /* .0 .8 .0 .0 */ + 0b00000000001100000000, /* .0 .48 .0 .0 */ + 0b00000000001101000000, /* .0 .52 .0 .0 */ + 0b00000110100000000000, /* .0 .0 .52 .0 */ + 
/* Tail of a compaction lookup table whose start precedes this chunk. */
   0b00000000001000000000, /* .0 .32 .0 .0 */
   0b00000000001111000000, /* .0 .60 .0 .0 */
};

/* Per-device bundle of the compaction lookup tables selected for the target
 * hardware generation, plus the ISA description they were chosen for.
 */
struct compaction_state {
   const struct brw_isa_info *isa;
   const uint32_t *control_index_table;
   const uint32_t *datatype_table;
   const uint16_t *subreg_table;
   const uint16_t *src0_index_table;
   const uint16_t *src1_index_table;
};

static void compaction_state_init(struct compaction_state *c,
                                  const struct brw_isa_info *isa);

/* Gather the instruction's control bits into the generation-specific
 * "uncompacted" key and look it up in the 32-entry control index table.
 * Returns false when the bit pattern has no table entry, i.e. the
 * instruction cannot be compacted.
 */
static bool
set_control_index(const struct compaction_state *c,
                  brw_compact_inst *dst, const brw_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint32_t uncompacted; /* 17b/G45; 19b/IVB+; 21b/TGL+ */

   if (devinfo->ver >= 20) {
      uncompacted = (brw_inst_bits(src, 95, 92) << 14) | /* 4b */
                    (brw_inst_bits(src, 34, 34) << 13) | /* 1b */
                    (brw_inst_bits(src, 32, 32) << 12) | /* 1b */
                    (brw_inst_bits(src, 31, 31) << 11) | /* 1b */
                    (brw_inst_bits(src, 28, 28) << 10) | /* 1b */
                    (brw_inst_bits(src, 27, 26) << 8) |  /* 2b */
                    (brw_inst_bits(src, 25, 24) << 6) |  /* 2b */
                    (brw_inst_bits(src, 23, 21) << 3) |  /* 3b */
                    (brw_inst_bits(src, 20, 18));        /* 3b */
   } else if (devinfo->ver >= 12) {
      uncompacted = (brw_inst_bits(src, 95, 92) << 17) | /* 4b */
                    (brw_inst_bits(src, 34, 34) << 16) | /* 1b */
                    (brw_inst_bits(src, 33, 33) << 15) | /* 1b */
                    (brw_inst_bits(src, 32, 32) << 14) | /* 1b */
                    (brw_inst_bits(src, 31, 31) << 13) | /* 1b */
                    (brw_inst_bits(src, 28, 28) << 12) | /* 1b */
                    (brw_inst_bits(src, 27, 24) << 8) |  /* 4b */
                    (brw_inst_bits(src, 23, 22) << 6) |  /* 2b */
                    (brw_inst_bits(src, 21, 19) << 3) |  /* 3b */
                    (brw_inst_bits(src, 18, 16));        /* 3b */
   } else if (devinfo->ver >= 8) {
      uncompacted = (brw_inst_bits(src, 33, 31) << 16) | /* 3b */
                    (brw_inst_bits(src, 23, 12) << 4) |  /* 12b */
                    (brw_inst_bits(src, 10, 9) << 2) |   /* 2b */
                    (brw_inst_bits(src, 34, 34) << 1) |  /* 1b */
                    (brw_inst_bits(src, 8, 8));          /* 1b */
   } else {
      uncompacted = (brw_inst_bits(src, 31, 31) << 16) | /* 1b */
                    (brw_inst_bits(src, 23, 8));         /* 16b */

      /* On gfx7, the flag register and subregister numbers are integrated into
       * the control index.
       */
      if (devinfo->ver == 7)
         uncompacted |= brw_inst_bits(src, 90, 89) << 17; /* 2b */
   }

   for (int i = 0; i < 32; i++) {
      if (c->control_index_table[i] == uncompacted) {
         brw_compact_inst_set_control_index(devinfo, dst, i);
         return true;
      }
   }

   return false;
}

/* Same scheme as set_control_index, but for the destination/source datatype
 * fields.  With an immediate source on Gfx12+ the Src1.RegFile bit is owned
 * by the immediate, so it is excluded from the key.
 */
static bool
set_datatype_index(const struct compaction_state *c, brw_compact_inst *dst,
                   const brw_inst *src, bool is_immediate)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint32_t uncompacted; /* 18b/G45+; 21b/BDW+; 20b/TGL+ */

   if (devinfo->ver >= 12) {
      uncompacted = (brw_inst_bits(src, 91, 88) << 15) | /* 4b */
                    (brw_inst_bits(src, 66, 66) << 14) | /* 1b */
                    (brw_inst_bits(src, 50, 50) << 13) | /* 1b */
                    (brw_inst_bits(src, 49, 48) << 11) | /* 2b */
                    (brw_inst_bits(src, 47, 47) << 10) | /* 1b */
                    (brw_inst_bits(src, 46, 46) << 9) |  /* 1b */
                    (brw_inst_bits(src, 43, 40) << 5) |  /* 4b */
                    (brw_inst_bits(src, 39, 36) << 1) |  /* 4b */
                    (brw_inst_bits(src, 35, 35));        /* 1b */

      /* Src1.RegFile overlaps with the immediate, so ignore it if an immediate
       * is present
       */
      if (!is_immediate) {
         uncompacted |= brw_inst_bits(src, 98, 98) << 19; /* 1b */
      }
   } else if (devinfo->ver >= 8) {
      uncompacted = (brw_inst_bits(src, 63, 61) << 18) | /* 3b */
                    (brw_inst_bits(src, 94, 89) << 12) | /* 6b */
                    (brw_inst_bits(src, 46, 35));        /* 12b */
   } else {
      uncompacted = (brw_inst_bits(src, 63, 61) << 15) | /* 3b */
                    (brw_inst_bits(src, 46, 32));        /* 15b */
   }

   for (int i = 0; i < 32; i++) {
      if (c->datatype_table[i] == uncompacted) {
         brw_compact_inst_set_datatype_index(devinfo, dst, i);
         return true;
      }
   }

   return false;
}

/* Compact the dst/src0/src1 subregister numbers via the subreg table.
 * On pre-Xe2 hardware the src1 subreg field overlaps the immediate, so it
 * is skipped when an immediate is present.
 */
static bool
set_subreg_index(const struct compaction_state *c, brw_compact_inst *dst,
                 const brw_inst *src, bool is_immediate)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   const unsigned table_len = devinfo->ver >= 20 ?
      ARRAY_SIZE(xe2_subreg_table) : ARRAY_SIZE(g45_subreg_table);
   uint16_t uncompacted; /* 15b/G45+; 12b/Xe2+ */

   if (devinfo->ver >= 20) {
      uncompacted = (brw_inst_bits(src, 33, 33) << 0) |  /* 1b */
                    (brw_inst_bits(src, 55, 51) << 1) |  /* 5b */
                    (brw_inst_bits(src, 71, 67) << 6) |  /* 5b */
                    (brw_inst_bits(src, 87, 87) << 11);  /* 1b */
   } else if (devinfo->ver >= 12) {
      uncompacted = (brw_inst_bits(src, 55, 51) << 0) |  /* 5b */
                    (brw_inst_bits(src, 71, 67) << 5);   /* 5b */

      if (!is_immediate)
         uncompacted |= brw_inst_bits(src, 103, 99) << 10; /* 5b */
   } else {
      uncompacted = (brw_inst_bits(src, 52, 48) << 0) |  /* 5b */
                    (brw_inst_bits(src, 68, 64) << 5);   /* 5b */

      if (!is_immediate)
         uncompacted |= brw_inst_bits(src, 100, 96) << 10; /* 5b */
   }

   for (int i = 0; i < table_len; i++) {
      if (c->subreg_table[i] == uncompacted) {
         brw_compact_inst_set_subreg_index(devinfo, dst, i);
         return true;
      }
   }

   return false;
}

/* Compact the src0 region/addressing fields via the src0 index table. */
static bool
set_src0_index(const struct compaction_state *c, brw_compact_inst *dst,
               const brw_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint16_t uncompacted; /* 12b/G45+; 11b/Xe2+ */
   int table_len;

   if (devinfo->ver >= 12) {
      table_len = (devinfo->ver >= 20 ? ARRAY_SIZE(xe2_src0_index_table) :
                   ARRAY_SIZE(gfx12_src0_index_table));
      /* Bit 87 is not part of the key on Xe2+ (it moved into the subreg
       * index there).
       */
      uncompacted = (devinfo->ver >= 20 ? 0 :
                     brw_inst_bits(src, 87, 87) << 11) | /* 1b */
                    (brw_inst_bits(src, 86, 84) << 8) |  /* 3b */
                    (brw_inst_bits(src, 83, 81) << 5) |  /* 3b */
                    (brw_inst_bits(src, 80, 80) << 4) |  /* 1b */
                    (brw_inst_bits(src, 65, 64) << 2) |  /* 2b */
                    (brw_inst_bits(src, 45, 44));        /* 2b */
   } else {
      table_len = ARRAY_SIZE(gfx8_src_index_table);
      uncompacted = brw_inst_bits(src, 88, 77); /* 12b */
   }

   for (int i = 0; i < table_len; i++) {
      if (c->src0_index_table[i] == uncompacted) {
         brw_compact_inst_set_src0_index(devinfo, dst, i);
         return true;
      }
   }

   return false;
}

/* Compact the src1 fields.  When src1 is an immediate, the src1 index field
 * instead carries part of the already-compacted immediate value (low 4 bits
 * on Gfx12+, high 5 bits of the 13-bit value before that) and no table
 * lookup is needed.
 */
static bool
set_src1_index(const struct compaction_state *c, brw_compact_inst *dst,
               const brw_inst *src, bool is_immediate, unsigned imm)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   if (is_immediate) {
      if (devinfo->ver >= 12) {
         /* src1 index takes the low 4 bits of the 12-bit compacted value */
         brw_compact_inst_set_src1_index(devinfo, dst, imm & 0xf);
      } else {
         /* src1 index takes the high 5 bits of the 13-bit compacted value */
         brw_compact_inst_set_src1_index(devinfo, dst, imm >> 8);
      }
      return true;
   } else {
      uint16_t uncompacted; /* 12b/G45+ 16b/Xe2+ */
      int table_len;

      if (devinfo->ver >= 20) {
         table_len = ARRAY_SIZE(xe2_src1_index_table);
         uncompacted = (brw_inst_bits(src, 121, 120) << 14) | /* 2b */
                       (brw_inst_bits(src, 118, 116) << 11) | /* 3b */
                       (brw_inst_bits(src, 115, 113) << 8) |  /* 3b */
                       (brw_inst_bits(src, 112, 112) << 7) |  /* 1b */
                       (brw_inst_bits(src, 103, 99) << 2) |   /* 5b */
                       (brw_inst_bits(src, 97, 96));          /* 2b */
      } else if (devinfo->ver >= 12) {
         /* NOTE(review): uses gfx12_src0_index_table's length for the src1
          * table — harmless only if both tables have the same entry count;
          * confirm against the table definitions earlier in this file.
          */
         table_len = ARRAY_SIZE(gfx12_src0_index_table);
         uncompacted = (brw_inst_bits(src, 121, 120) << 10) | /* 2b */
                       (brw_inst_bits(src, 119, 116) << 6) |  /* 4b */
                       (brw_inst_bits(src, 115, 113) << 3) |  /* 3b */
                       (brw_inst_bits(src, 112, 112) << 2) |  /* 1b */
                       (brw_inst_bits(src, 97, 96));          /* 2b */
      } else {
         table_len = ARRAY_SIZE(gfx8_src_index_table);
         uncompacted = brw_inst_bits(src, 120, 109); /* 12b */
      }

      for (int i = 0; i < table_len; i++) {
         if (c->src1_index_table[i] == uncompacted) {
            brw_compact_inst_set_src1_index(devinfo, dst, i);
            return true;
         }
      }
   }

   return false;
}

/* Three-source variant of set_control_index.  Xe2+ shares the key layout
 * between regular 3src and DPAS instructions and only the table differs.
 */
static bool
set_3src_control_index(const struct intel_device_info *devinfo,
                       brw_compact_inst *dst, const brw_inst *src,
                       bool is_dpas)
{
   assert(devinfo->ver >= 8);

   if (devinfo->ver >= 20) {
      assert(is_dpas || !brw_inst_bits(src, 49, 49));

      const uint64_t uncompacted =                       /* 34b/Xe2+ */
         (brw_inst_bits(src, 95, 92) << 30) |  /* 4b */
         (brw_inst_bits(src, 90, 88) << 27) |  /* 3b */
         (brw_inst_bits(src, 82, 80) << 24) |  /* 3b */
         (brw_inst_bits(src, 50, 50) << 23) |  /* 1b */
         (brw_inst_bits(src, 49, 48) << 21) |  /* 2b */
         (brw_inst_bits(src, 42, 40) << 18) |  /* 3b */
         (brw_inst_bits(src, 39, 39) << 17) |  /* 1b */
         (brw_inst_bits(src, 38, 36) << 14) |  /* 3b */
         (brw_inst_bits(src, 34, 34) << 13) |  /* 1b */
         (brw_inst_bits(src, 32, 32) << 12) |  /* 1b */
         (brw_inst_bits(src, 31, 31) << 11) |  /* 1b */
         (brw_inst_bits(src, 28, 28) << 10) |  /* 1b */
         (brw_inst_bits(src, 27, 26) << 8) |   /* 2b */
         (brw_inst_bits(src, 25, 24) << 6) |   /* 2b */
         (brw_inst_bits(src, 23, 21) << 3) |   /* 3b */
         (brw_inst_bits(src, 20, 18));         /* 3b */

      /* The bits used to index the tables for 3src and 3src-dpas
       * are the same, so just need to pick the right one.
       */
      const uint64_t *table = is_dpas ? xe2_3src_dpas_control_index_table :
                                        xe2_3src_control_index_table;
      const unsigned size = is_dpas ?
         ARRAY_SIZE(xe2_3src_dpas_control_index_table) :
         ARRAY_SIZE(xe2_3src_control_index_table);
      for (unsigned i = 0; i < size; i++) {
         if (table[i] == uncompacted) {
            brw_compact_inst_set_3src_control_index(devinfo, dst, i);
            return true;
         }
      }
   } else if (devinfo->verx10 >= 125) {
      uint64_t uncompacted =                             /* 37b/XeHP+ */
         (brw_inst_bits(src, 95, 92) << 33) |  /* 4b */
         (brw_inst_bits(src, 90, 88) << 30) |  /* 3b */
         (brw_inst_bits(src, 82, 80) << 27) |  /* 3b */
         (brw_inst_bits(src, 50, 50) << 26) |  /* 1b */
         (brw_inst_bits(src, 49, 48) << 24) |  /* 2b */
         (brw_inst_bits(src, 42, 40) << 21) |  /* 3b */
         (brw_inst_bits(src, 39, 39) << 20) |  /* 1b */
         (brw_inst_bits(src, 38, 36) << 17) |  /* 3b */
         (brw_inst_bits(src, 34, 34) << 16) |  /* 1b */
         (brw_inst_bits(src, 33, 33) << 15) |  /* 1b */
         (brw_inst_bits(src, 32, 32) << 14) |  /* 1b */
         (brw_inst_bits(src, 31, 31) << 13) |  /* 1b */
         (brw_inst_bits(src, 28, 28) << 12) |  /* 1b */
         (brw_inst_bits(src, 27, 24) << 8) |   /* 4b */
         (brw_inst_bits(src, 23, 23) << 7) |   /* 1b */
         (brw_inst_bits(src, 22, 22) << 6) |   /* 1b */
         (brw_inst_bits(src, 21, 19) << 3) |   /* 3b */
         (brw_inst_bits(src, 18, 16));         /* 3b */

      for (unsigned i = 0; i < ARRAY_SIZE(xehp_3src_control_index_table); i++) {
         if (xehp_3src_control_index_table[i] == uncompacted) {
            brw_compact_inst_set_3src_control_index(devinfo, dst, i);
            return true;
         }
      }
   } else if (devinfo->ver >= 12) {
      uint64_t uncompacted =                             /* 36b/TGL+ */
         (brw_inst_bits(src, 95, 92) << 32) |  /* 4b */
         (brw_inst_bits(src, 90, 88) << 29) |  /* 3b */
         (brw_inst_bits(src, 82, 80) << 26) |  /* 3b */
         (brw_inst_bits(src, 50, 50) << 25) |  /* 1b */
         (brw_inst_bits(src, 48, 48) << 24) |  /* 1b */
         (brw_inst_bits(src, 42, 40) << 21) |  /* 3b */
         (brw_inst_bits(src, 39, 39) << 20) |  /* 1b */
         (brw_inst_bits(src, 38, 36) << 17) |  /* 3b */
         (brw_inst_bits(src, 34, 34) << 16) |  /* 1b */
         (brw_inst_bits(src, 33, 33) << 15) |  /* 1b */
         (brw_inst_bits(src, 32, 32) << 14) |  /* 1b */
         (brw_inst_bits(src, 31, 31) << 13) |  /* 1b */
         (brw_inst_bits(src, 28, 28) << 12) |  /* 1b */
         (brw_inst_bits(src, 27, 24) << 8) |   /* 4b */
         (brw_inst_bits(src, 23, 23) << 7) |   /* 1b */
         (brw_inst_bits(src, 22, 22) << 6) |   /* 1b */
         (brw_inst_bits(src, 21, 19) << 3) |   /* 3b */
         (brw_inst_bits(src, 18, 16));         /* 3b */

      for (unsigned i = 0; i < ARRAY_SIZE(gfx12_3src_control_index_table); i++) {
         if (gfx12_3src_control_index_table[i] == uncompacted) {
            brw_compact_inst_set_3src_control_index(devinfo, dst, i);
            return true;
         }
      }
   } else {
      uint32_t uncompacted =                   /* 24b/BDW; 26b/CHV/SKL+ */
         (brw_inst_bits(src, 34, 32) << 21) |  /* 3b */
         (brw_inst_bits(src, 28, 8));          /* 21b */

      if (devinfo->ver >= 9 || devinfo->platform == INTEL_PLATFORM_CHV) {
         uncompacted |=
            brw_inst_bits(src, 36, 35) << 24;  /* 2b */
      }

      for (unsigned i = 0; i < ARRAY_SIZE(gfx8_3src_control_index_table); i++) {
         if (gfx8_3src_control_index_table[i] == uncompacted) {
            brw_compact_inst_set_3src_control_index(devinfo, dst, i);
            return true;
         }
      }
   }

   return false;
}

/* Three-source variant of set_src0/src1_index: one combined source-index
 * key covering all three source operands.
 */
static bool
set_3src_source_index(const struct intel_device_info *devinfo,
                      brw_compact_inst *dst, const brw_inst *src,
                      bool is_dpas)
{
   assert(devinfo->ver >= 8);

   if (devinfo->ver >= 12) {
      uint32_t uncompacted =                    /* 21b/TGL+ */
         (brw_inst_bits(src, 114, 114) << 20) | /* 1b */
         (brw_inst_bits(src, 113, 112) << 18) | /* 2b */
         (brw_inst_bits(src, 98, 98) << 17) |   /* 1b */
         (brw_inst_bits(src, 97, 96) << 15) |   /* 2b */
         (brw_inst_bits(src, 91, 91) << 14) |   /* 1b */
         (brw_inst_bits(src, 87, 86) << 12) |   /* 2b */
         (brw_inst_bits(src, 85, 84) << 10) |   /* 2b */
         (brw_inst_bits(src, 83, 83) << 9) |    /* 1b */
         (brw_inst_bits(src, 66, 66) << 8) |    /* 1b */
         (brw_inst_bits(src, 65, 64) << 6) |    /* 2b */
         (brw_inst_bits(src, 47, 47) << 5) |    /* 1b */
         (brw_inst_bits(src, 46, 46) << 4) |    /* 1b */
         (brw_inst_bits(src, 45, 44) << 2) |    /* 2b */
         (brw_inst_bits(src, 43, 43) << 1) |    /* 1b */
         (brw_inst_bits(src, 35, 35));          /* 1b */

      /* In Xe2, the bits used to index the tables for 3src and 3src-dpas
       * are the same, so just need to pick the right one.
       */
      const uint32_t *three_src_source_index_table =
         devinfo->ver >= 20 ? (is_dpas ? xe2_3src_dpas_source_index_table :
                                         xe2_3src_source_index_table) :
         devinfo->verx10 >= 125 ? xehp_3src_source_index_table :
                                  gfx12_3src_source_index_table;
      const uint32_t three_src_source_index_table_len =
         devinfo->ver >= 20 ? (is_dpas ? ARRAY_SIZE(xe2_3src_dpas_source_index_table) :
                                         ARRAY_SIZE(xe2_3src_source_index_table)) :
         devinfo->verx10 >= 125 ? ARRAY_SIZE(xehp_3src_source_index_table) :
                                  ARRAY_SIZE(gfx12_3src_source_index_table);

      for (unsigned i = 0; i < three_src_source_index_table_len; i++) {
         if (three_src_source_index_table[i] == uncompacted) {
            brw_compact_inst_set_3src_source_index(devinfo, dst, i);
            return true;
         }
      }
   } else {
      uint64_t uncompacted =                    /* 46b/BDW; 49b/CHV/SKL+ */
         (brw_inst_bits(src, 83, 83) << 43) |   /* 1b */
         (brw_inst_bits(src, 114, 107) << 35) | /* 8b */
         (brw_inst_bits(src, 93, 86) << 27) |   /* 8b */
         (brw_inst_bits(src, 72, 65) << 19) |   /* 8b */
         (brw_inst_bits(src, 55, 37));          /* 19b */

      if (devinfo->ver >= 9 || devinfo->platform == INTEL_PLATFORM_CHV) {
         uncompacted |=
            (brw_inst_bits(src, 126, 125) << 47) | /* 2b */
            (brw_inst_bits(src, 105, 104) << 45) | /* 2b */
            (brw_inst_bits(src, 84, 84) << 44);    /* 1b */
      } else {
         uncompacted |=
            (brw_inst_bits(src, 125, 125) << 45) | /* 1b */
            (brw_inst_bits(src, 104, 104) << 44);  /* 1b */
      }

      for (unsigned i = 0; i < ARRAY_SIZE(gfx8_3src_source_index_table); i++) {
         if (gfx8_3src_source_index_table[i] == uncompacted) {
            brw_compact_inst_set_3src_source_index(devinfo, dst, i);
            return true;
         }
      }
   }

   return false;
}

/* Three-source subregister compaction; only exists on Gfx12+. */
static bool
set_3src_subreg_index(const struct intel_device_info *devinfo,
                      brw_compact_inst *dst, const brw_inst *src)
{
   assert(devinfo->ver >= 12);

   uint32_t uncompacted =                     /* 20b/TGL+ */
      (brw_inst_bits(src, 119, 115) << 15) |  /* 5b */
      (brw_inst_bits(src, 103, 99) << 10) |   /* 5b */
      (brw_inst_bits(src, 71, 67) << 5) |     /* 5b */
      (brw_inst_bits(src, 55, 51));           /* 5b */

   const uint32_t *table = devinfo->ver >= 20 ? xe2_3src_subreg_table :
                                                gfx12_3src_subreg_table;
   const uint32_t len =
      devinfo->ver >= 20 ? ARRAY_SIZE(xe2_3src_subreg_table) :
                           ARRAY_SIZE(gfx12_3src_subreg_table);

   for (unsigned i = 0; i < len; i++) {
      if (table[i] == uncompacted) {
         brw_compact_inst_set_3src_subreg_index(devinfo, dst, i);
         return true;
      }
   }

   return false;
}

/* Returns true when the instruction uses bits that have no home in the
 * compacted encoding, so it must stay uncompacted.
 */
static bool
has_unmapped_bits(const struct brw_isa_info *isa, const brw_inst *src)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   /* EOT can only be mapped on a send if the src1 is an immediate */
   if ((brw_inst_opcode(isa, src) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(isa, src) == BRW_OPCODE_SEND) &&
       brw_inst_eot(devinfo, src))
      return true;

   /* Check for instruction bits that don't map to any of the fields of the
    * compacted instruction.  The instruction cannot be compacted if any of
    * them are set.  They overlap with:
    *  - NibCtrl (bit 47 on Gfx7, bit 11 on Gfx8)
    *  - Dst.AddrImm[9] (bit 47 on Gfx8)
    *  - Src0.AddrImm[9] (bit 95 on Gfx8)
    *  - Imm64[27:31] (bits 91-95 on Gfx7, bit 95 on Gfx8)
    *  - UIP[31] (bit 95 on Gfx8)
    */
   if (devinfo->ver >= 12) {
      assert(!brw_inst_bits(src, 7, 7));
      return false;
   } else if (devinfo->ver >= 8) {
      assert(!brw_inst_bits(src, 7, 7));
      return brw_inst_bits(src, 95, 95) ||
             brw_inst_bits(src, 47, 47) ||
             brw_inst_bits(src, 11, 11);
   } else {
      assert(!brw_inst_bits(src, 7, 7) &&
             !(devinfo->ver < 7 && brw_inst_bits(src, 90, 90)));
      return brw_inst_bits(src, 95, 91) ||
             brw_inst_bits(src, 47, 47);
   }
}

/* Three-source counterpart of has_unmapped_bits. */
static bool
has_3src_unmapped_bits(const struct intel_device_info *devinfo,
                       const brw_inst *src, bool is_dpas)
{
   /* Check for three-source instruction bits that don't map to any of the
    * fields of the compacted instruction. All of them seem to be reserved
    * bits currently.
    */
   if (devinfo->ver >= 20) {
      assert(is_dpas || !brw_inst_bits(src, 49, 49));
      assert(!brw_inst_bits(src, 33, 33));
      assert(!brw_inst_bits(src, 7, 7));
   } else if (devinfo->ver >= 12) {
      assert(is_dpas || !brw_inst_bits(src, 49, 49));
      assert(!brw_inst_bits(src, 7, 7));
   } else if (devinfo->ver >= 9 || devinfo->platform == INTEL_PLATFORM_CHV) {
      assert(!brw_inst_bits(src, 127, 127) &&
             !brw_inst_bits(src, 7, 7));
   } else {
      assert(devinfo->ver >= 8);
      assert(!brw_inst_bits(src, 127, 126) &&
             !brw_inst_bits(src, 105, 105) &&
             !brw_inst_bits(src, 84, 84) &&
             !brw_inst_bits(src, 7, 7));

      /* Src1Type and Src2Type, used for mixed-precision floating point */
      if (brw_inst_bits(src, 36, 35))
         return true;
   }

   return false;
}

/* Attempt to compact a three-source (or DPAS) instruction into dst.
 * dst is only written on success.
 */
static bool
brw_try_compact_3src_instruction(const struct brw_isa_info *isa,
                                 brw_compact_inst *dst, const brw_inst *src)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   assert(devinfo->ver >= 8);

   bool is_dpas = brw_inst_opcode(isa, src) == BRW_OPCODE_DPAS;
   if (has_3src_unmapped_bits(devinfo, src, is_dpas))
      return false;

#define compact(field) \
   brw_compact_inst_set_3src_##field(devinfo, dst, brw_inst_3src_##field(devinfo, src))
#define compact_a16(field) \
   brw_compact_inst_set_3src_##field(devinfo, dst, brw_inst_3src_a16_##field(devinfo, src))

   compact(hw_opcode);

   if (!set_3src_control_index(devinfo, dst, src, is_dpas))
      return false;

   if (!set_3src_source_index(devinfo, dst, src, is_dpas))
      return false;

   if (devinfo->ver >= 12) {
      if (!set_3src_subreg_index(devinfo, dst, src))
         return false;

      compact(swsb);
      compact(debug_control);
      compact(dst_reg_nr);
      compact(src0_reg_nr);
      compact(src1_reg_nr);
      compact(src2_reg_nr);
   } else {
      compact(dst_reg_nr);
      compact_a16(src0_rep_ctrl);
      compact(debug_control);
      compact(saturate);
      compact_a16(src1_rep_ctrl);
      compact_a16(src2_rep_ctrl);
      compact(src0_reg_nr);
      compact(src1_reg_nr);
      compact(src2_reg_nr);
      compact_a16(src0_subreg_nr);
      compact_a16(src1_subreg_nr);
      compact_a16(src2_subreg_nr);
   }
   brw_compact_inst_set_3src_cmpt_control(devinfo, dst, true);

#undef compact
#undef compact_a16

   return true;
}

/* On SNB through ICL, compacted instructions have 12-bits for immediate
 * sources, and a 13th bit that's replicated through the high 20 bits.
 *
 * Effectively this means we get 12-bit integers, 0.0f, and some limited uses
 * of packed vectors as compactable immediates.
 *
 * On TGL+, the high 12-bits of floating-point values (:f and :hf) are encoded
 * rather than the low 12-bits. For signed integer the 12th bit is replicated,
 * while for unsigned integers it is not.
 *
 * Returns the compacted immediate, or -1 if immediate cannot be compacted
 */
static int
compact_immediate(const struct intel_device_info *devinfo,
                  enum brw_reg_type type, unsigned imm)
{
   if (devinfo->ver >= 12) {
      /* 16-bit immediates need to be replicated through the 32-bit immediate
       * field
       */
      switch (type) {
      case BRW_REGISTER_TYPE_W:
      case BRW_REGISTER_TYPE_UW:
      case BRW_REGISTER_TYPE_HF:
         if ((imm >> 16) != (imm & 0xffff))
            return -1;
         break;
      default:
         break;
      }

      switch (type) {
      case BRW_REGISTER_TYPE_F:
         /* We get the high 12-bits as-is; rest must be zero */
         if ((imm & 0xfffff) == 0)
            return (imm >> 20) & 0xfff;
         break;
      case BRW_REGISTER_TYPE_HF:
         /* We get the high 12-bits as-is; rest must be zero */
         if ((imm & 0xf) == 0)
            return (imm >> 4) & 0xfff;
         break;
      case BRW_REGISTER_TYPE_UD:
      case BRW_REGISTER_TYPE_VF:
      case BRW_REGISTER_TYPE_UV:
      case BRW_REGISTER_TYPE_V:
         /* We get the low 12-bits as-is; rest must be zero */
         if ((imm & 0xfffff000) == 0)
            return imm & 0xfff;
         break;
      case BRW_REGISTER_TYPE_UW:
         /* We get the low 12-bits as-is; rest must be zero */
         if ((imm & 0xf000) == 0)
            return imm & 0xfff;
         break;
      case BRW_REGISTER_TYPE_D:
         /* We get the low 11-bits as-is; 12th is replicated */
         if (((int)imm >> 11) == 0 || ((int)imm >> 11) == -1)
            return imm & 0xfff;
         break;
      case BRW_REGISTER_TYPE_W:
         /* We get the low 11-bits as-is; 12th is replicated */
         if (((short)imm >> 11) == 0 || ((short)imm >> 11) == -1)
            return imm & 0xfff;
         break;
      case BRW_REGISTER_TYPE_NF:
      case BRW_REGISTER_TYPE_DF:
      case BRW_REGISTER_TYPE_Q:
      case BRW_REGISTER_TYPE_UQ:
      case BRW_REGISTER_TYPE_B:
      case BRW_REGISTER_TYPE_UB:
         return -1;
      }
   } else {
      /* We get the low 12 bits as-is; 13th is replicated */
      if (((int)imm >> 12) == 0 || ((int)imm >> 12 == -1)) {
         return imm & 0x1fff;
      }
   }

   return -1;
}

/* Inverse of compact_immediate: reconstruct the 32-bit immediate from its
 * compacted form for the given type.
 */
static int
uncompact_immediate(const struct intel_device_info *devinfo,
                    enum brw_reg_type type, unsigned compact_imm)
{
   if (devinfo->ver >= 12) {
      switch (type) {
      case BRW_REGISTER_TYPE_F:
         return compact_imm << 20;
      case BRW_REGISTER_TYPE_HF:
         return (compact_imm << 20) | (compact_imm << 4);
      case BRW_REGISTER_TYPE_UD:
      case BRW_REGISTER_TYPE_VF:
      case BRW_REGISTER_TYPE_UV:
      case BRW_REGISTER_TYPE_V:
         return compact_imm;
      case BRW_REGISTER_TYPE_UW:
         /* Replicate */
         return compact_imm << 16 | compact_imm;
      case BRW_REGISTER_TYPE_D:
         /* Extend the 12th bit into the high 20 bits */
         return (int)(compact_imm << 20) >> 20;
      case BRW_REGISTER_TYPE_W:
         /* Extend the 12th bit into the high 4 bits and replicate */
         return ((int)(compact_imm << 20) >> 4) |
                ((unsigned short)((short)(compact_imm << 4) >> 4));
      case BRW_REGISTER_TYPE_NF:
      case BRW_REGISTER_TYPE_DF:
      case BRW_REGISTER_TYPE_Q:
      case BRW_REGISTER_TYPE_UQ:
      case BRW_REGISTER_TYPE_B:
      case BRW_REGISTER_TYPE_UB:
         unreachable("not reached");
      }
   } else {
      /* Replicate the 13th bit into the high 19 bits */
      return (int)(compact_imm << 19) >> 19;
   }

   unreachable("not reached");
}

/* Returns true (and the immediate's type via *type) when src0 or src1 is an
 * immediate with a valid register type.
 */
static bool
has_immediate(const struct intel_device_info *devinfo, const brw_inst *inst,
              enum brw_reg_type *type)
{
   if (brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) {
      *type = brw_inst_src0_type(devinfo, inst);
      return *type != INVALID_REG_TYPE;
   } else if (brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) {
      *type = brw_inst_src1_type(devinfo, inst);
      return *type != INVALID_REG_TYPE;
   }

   return false;
}

/**
 * Applies some small changes to instruction types to increase chances of
 * compaction.
 */
static brw_inst
precompact(const struct brw_isa_info *isa, brw_inst inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   /* In XeHP the compaction tables removed the entries for source regions
    * <8;8,1> giving preference to <1;1,0> as the way to indicate
    * sequential elements, so convert to those before compacting.
    */
   if (devinfo->verx10 >= 125) {
      if (brw_inst_src0_reg_file(devinfo, &inst) == BRW_GENERAL_REGISTER_FILE &&
          brw_inst_src0_vstride(devinfo, &inst) > BRW_VERTICAL_STRIDE_1 &&
          brw_inst_src0_vstride(devinfo, &inst) == (brw_inst_src0_width(devinfo, &inst) + 1) &&
          brw_inst_src0_hstride(devinfo, &inst) == BRW_HORIZONTAL_STRIDE_1) {
         brw_inst_set_src0_vstride(devinfo, &inst, BRW_VERTICAL_STRIDE_1);
         brw_inst_set_src0_width(devinfo, &inst, BRW_WIDTH_1);
         brw_inst_set_src0_hstride(devinfo, &inst, BRW_HORIZONTAL_STRIDE_0);
      }

      if (brw_inst_src1_reg_file(devinfo, &inst) == BRW_GENERAL_REGISTER_FILE &&
          brw_inst_src1_vstride(devinfo, &inst) > BRW_VERTICAL_STRIDE_1 &&
          brw_inst_src1_vstride(devinfo, &inst) == (brw_inst_src1_width(devinfo, &inst) + 1) &&
          brw_inst_src1_hstride(devinfo, &inst) == BRW_HORIZONTAL_STRIDE_1) {
         brw_inst_set_src1_vstride(devinfo, &inst, BRW_VERTICAL_STRIDE_1);
         brw_inst_set_src1_width(devinfo, &inst, BRW_WIDTH_1);
         brw_inst_set_src1_hstride(devinfo, &inst, BRW_HORIZONTAL_STRIDE_0);
      }
   }

   if (brw_inst_src0_reg_file(devinfo, &inst) != BRW_IMMEDIATE_VALUE)
      return inst;

   /* The Bspec's section titled "Non-present Operands" claims that if src0
    * is an immediate that src1's type must be the same as that of src0.
    *
    * The SNB+ DataTypeIndex instruction compaction tables contain mappings
    * that do not follow this rule. E.g., from the IVB/HSW table:
    *
    *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
    *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
    *
    * And from the SNB table:
    *
    *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
    *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
    *
    * Neither of these cause warnings from the simulator when used,
    * compacted or otherwise. In fact, all compaction mappings that have an
    * immediate in src0 use a:ud for src1.
    *
    * The GM45 instruction compaction tables do not contain mapped meanings
    * so it's not clear whether it has the restriction. We'll assume it was
    * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
    *
    * Don't do any of this for 64-bit immediates, since the src1 fields
    * overlap with the immediate and setting them would overwrite the
    * immediate we set.
    */
   if (devinfo->ver >= 6 &&
       !(devinfo->platform == INTEL_PLATFORM_HSW &&
         brw_inst_opcode(isa, &inst) == BRW_OPCODE_DIM) &&
       !(devinfo->ver >= 8 &&
         (brw_inst_src0_type(devinfo, &inst) == BRW_REGISTER_TYPE_DF ||
          brw_inst_src0_type(devinfo, &inst) == BRW_REGISTER_TYPE_UQ ||
          brw_inst_src0_type(devinfo, &inst) == BRW_REGISTER_TYPE_Q))) {
      brw_inst_set_src1_reg_hw_type(devinfo, &inst, 0);
   }

   /* Compacted instructions only have 12-bits (plus 1 for the other 20)
    * for immediate values. Presumably the hardware engineers realized
    * that the only useful floating-point value that could be represented
    * in this format is 0.0, which can also be represented as a VF-typed
    * immediate, so they gave us the previously mentioned mapping on IVB+.
    *
    * Strangely, we do have a mapping for imm:f in src1, so we don't need
    * to do this there.
    *
    * If we see a 0.0:F, change the type to VF so that it can be compacted.
    *
    * Compaction of floating-point immediates is improved on Gfx12, thus
    * removing the need for this.
    */
   if (devinfo->ver < 12 &&
       brw_inst_imm_ud(devinfo, &inst) == 0x0 &&
       brw_inst_src0_type(devinfo, &inst) == BRW_REGISTER_TYPE_F &&
       brw_inst_dst_type(devinfo, &inst) == BRW_REGISTER_TYPE_F &&
       brw_inst_dst_hstride(devinfo, &inst) == BRW_HORIZONTAL_STRIDE_1) {
      enum brw_reg_file file = brw_inst_src0_reg_file(devinfo, &inst);
      brw_inst_set_src0_file_type(devinfo, &inst, file, BRW_REGISTER_TYPE_VF);
   }

   /* There are no mappings for dst:d | i:d, so if the immediate is suitable
    * set the types to :UD so the instruction can be compacted.
    *
    * FINISHME: Use dst:f | imm:f on Gfx12
    */
   if (devinfo->ver < 12 &&
       compact_immediate(devinfo, BRW_REGISTER_TYPE_D,
                         brw_inst_imm_ud(devinfo, &inst)) != -1 &&
       brw_inst_cond_modifier(devinfo, &inst) == BRW_CONDITIONAL_NONE &&
       brw_inst_src0_type(devinfo, &inst) == BRW_REGISTER_TYPE_D &&
       brw_inst_dst_type(devinfo, &inst) == BRW_REGISTER_TYPE_D) {
      enum brw_reg_file src_file = brw_inst_src0_reg_file(devinfo, &inst);
      enum brw_reg_file dst_file = brw_inst_dst_reg_file(devinfo, &inst);

      brw_inst_set_src0_file_type(devinfo, &inst, src_file, BRW_REGISTER_TYPE_UD);
      brw_inst_set_dst_file_type(devinfo, &inst, dst_file, BRW_REGISTER_TYPE_UD);
   }

   return inst;
}

/**
 * Tries to compact instruction src into dst.
 *
 * It doesn't modify dst unless src is compactable, which is relied on by
 * brw_compact_instructions().
 */
static bool
try_compact_instruction(const struct compaction_state *c,
                        brw_compact_inst *dst, const brw_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   brw_compact_inst temp;

   assert(brw_inst_cmpt_control(devinfo, src) == 0);

   if (is_3src(c->isa, brw_inst_opcode(c->isa, src))) {
      if (devinfo->ver >= 8) {
         memset(&temp, 0, sizeof(temp));
         if (brw_try_compact_3src_instruction(c->isa, &temp, src)) {
            *dst = temp;
            return true;
         } else {
            return false;
         }
      } else {
         return false;
      }
   }

   enum brw_reg_type type;
   bool is_immediate = has_immediate(devinfo, src, &type);

   unsigned compacted_imm = 0;

   if (is_immediate) {
      /* Instructions with immediates cannot be compacted on Gen < 6 */
      if (devinfo->ver < 6)
         return false;

      compacted_imm = compact_immediate(devinfo, type,
                                        brw_inst_imm_ud(devinfo, src));
      /* NOTE(review): compacted_imm is unsigned, so this relies on the
       * usual arithmetic conversion of -1 to UINT_MAX — well-defined,
       * since compact_immediate never legitimately returns that value.
       */
      if (compacted_imm == -1)
         return false;
   }

   if (has_unmapped_bits(c->isa, src))
      return false;

   memset(&temp, 0, sizeof(temp));

#define compact(field) \
   brw_compact_inst_set_##field(devinfo, &temp, brw_inst_##field(devinfo, src))
#define compact_reg(field) \
   brw_compact_inst_set_##field##_reg_nr(devinfo, &temp, \
                                         brw_inst_##field##_da_reg_nr(devinfo, src))

   compact(hw_opcode);
   compact(debug_control);

   if (!set_control_index(c, &temp, src))
      return false;
   if (!set_datatype_index(c, &temp, src, is_immediate))
      return false;
   if (!set_subreg_index(c, &temp, src, is_immediate))
      return false;
   if (!set_src0_index(c, &temp, src))
      return false;
   if (!set_src1_index(c, &temp, src, is_immediate, compacted_imm))
      return false;

   if (devinfo->ver >= 12) {
      compact(swsb);
      compact_reg(dst);
      compact_reg(src0);

      if (is_immediate) {
         /* src1 reg takes the high 8 bits (of the 12-bit compacted value) */
         brw_compact_inst_set_src1_reg_nr(devinfo, &temp, compacted_imm >> 4);
      } else {
         compact_reg(src1);
      }
   } else {
      if (devinfo->ver >= 6) {
         compact(acc_wr_control);
      } else {
         compact(mask_control_ex);
      }

      if (devinfo->ver <= 6)
         compact(flag_subreg_nr);

      compact(cond_modifier);

      compact_reg(dst);
      compact_reg(src0);

      if (is_immediate) {
         /* src1 reg takes the low 8 bits (of the 13-bit compacted value) */
         brw_compact_inst_set_src1_reg_nr(devinfo, &temp, compacted_imm & 0xff);
      } else {
         compact_reg(src1);
      }
   }
   brw_compact_inst_set_cmpt_control(devinfo, &temp, true);

#undef compact
#undef compact_reg

   *dst = temp;

   return true;
}

/* Public wrapper: builds a fresh compaction_state and attempts compaction. */
bool
brw_try_compact_instruction(const struct brw_isa_info *isa,
                            brw_compact_inst *dst, const brw_inst *src)
{
   struct compaction_state c;
   compaction_state_init(&c, isa);
   return try_compact_instruction(&c, dst, src);
}

/* Inverse of set_control_index: scatter a control-table entry back into the
 * full-width instruction's bit positions.
 */
static void
set_uncompacted_control(const struct compaction_state *c, brw_inst *dst,
                        brw_compact_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint32_t uncompacted =
      c->control_index_table[brw_compact_inst_control_index(devinfo, src)];

   if (devinfo->ver >= 20) {
      brw_inst_set_bits(dst, 95, 92, (uncompacted >> 14) & 0xf);
      brw_inst_set_bits(dst, 34, 34, (uncompacted >> 13) & 0x1);
      brw_inst_set_bits(dst, 32, 32, (uncompacted >> 12) & 0x1);
      brw_inst_set_bits(dst, 31, 31, (uncompacted >> 11) & 0x1);
      brw_inst_set_bits(dst, 28, 28, (uncompacted >> 10) & 0x1);
      brw_inst_set_bits(dst, 27, 26, (uncompacted >> 8) & 0x3);
      brw_inst_set_bits(dst, 25, 24, (uncompacted >> 6) & 0x3);
      brw_inst_set_bits(dst, 23, 21, (uncompacted >> 3) & 0x7);
      brw_inst_set_bits(dst, 20, 18, (uncompacted >> 0) & 0x7);
   } else if (devinfo->ver >= 12) {
      brw_inst_set_bits(dst, 95, 92, (uncompacted >> 17));
      brw_inst_set_bits(dst, 34, 34, (uncompacted >> 16) & 0x1);
      brw_inst_set_bits(dst, 33, 33, (uncompacted >> 15) & 0x1);
      brw_inst_set_bits(dst, 32, 32, (uncompacted >> 14) & 0x1);
      brw_inst_set_bits(dst, 31, 31, (uncompacted >> 13) & 0x1);
      brw_inst_set_bits(dst, 28, 28, (uncompacted >> 12) & 0x1);
      brw_inst_set_bits(dst, 27, 24, (uncompacted >> 8) & 0xf);
      brw_inst_set_bits(dst, 23, 22, (uncompacted >> 6) & 0x3);
      brw_inst_set_bits(dst, 21, 19, (uncompacted >> 3) & 0x7);
      brw_inst_set_bits(dst, 18, 16, (uncompacted >> 0) & 0x7);
   } else if (devinfo->ver >= 8) {
      brw_inst_set_bits(dst, 33, 31, (uncompacted >> 16));
      brw_inst_set_bits(dst, 23, 12, (uncompacted >> 4) & 0xfff);
      brw_inst_set_bits(dst, 10, 9, (uncompacted >> 2) & 0x3);
      brw_inst_set_bits(dst, 34, 34, (uncompacted >> 1) & 0x1);
      brw_inst_set_bits(dst, 8, 8, (uncompacted >> 0) & 0x1);
   } else {
      brw_inst_set_bits(dst, 31, 31, (uncompacted >> 16) & 0x1);
      brw_inst_set_bits(dst, 23, 8, (uncompacted & 0xffff));

      if (devinfo->ver == 7)
         brw_inst_set_bits(dst, 90, 89, uncompacted >> 17);
   }
}

/* Inverse of set_datatype_index. */
static void
set_uncompacted_datatype(const struct compaction_state *c, brw_inst *dst,
                         brw_compact_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint32_t uncompacted =
      c->datatype_table[brw_compact_inst_datatype_index(devinfo, src)];

   if (devinfo->ver >= 12) {
      brw_inst_set_bits(dst, 98, 98, (uncompacted >> 19));
      brw_inst_set_bits(dst, 91, 88, (uncompacted >> 15) & 0xf);
      brw_inst_set_bits(dst, 66, 66, (uncompacted >> 14) & 0x1);
      brw_inst_set_bits(dst, 50, 50, (uncompacted >> 13) & 0x1);
      brw_inst_set_bits(dst, 49, 48, (uncompacted >> 11) & 0x3);
      brw_inst_set_bits(dst, 47, 47, (uncompacted >> 10) & 0x1);
      brw_inst_set_bits(dst, 46, 46, (uncompacted >> 9) & 0x1);
      brw_inst_set_bits(dst, 43, 40, (uncompacted >> 5) & 0xf);
      brw_inst_set_bits(dst, 39, 36, (uncompacted >> 1) & 0xf);
      brw_inst_set_bits(dst, 35, 35, (uncompacted >> 0) & 0x1);
   } else if (devinfo->ver >= 8) {
      brw_inst_set_bits(dst, 63, 61, (uncompacted >> 18));
      brw_inst_set_bits(dst, 94, 89, (uncompacted >> 12) & 0x3f);
      brw_inst_set_bits(dst, 46, 35, (uncompacted >> 0) & 0xfff);
   } else {
      brw_inst_set_bits(dst, 63, 61, (uncompacted >> 15));
      brw_inst_set_bits(dst, 46, 32, (uncompacted & 0x7fff));
   }
}

/* Inverse of set_subreg_index. */
static void
set_uncompacted_subreg(const struct compaction_state *c, brw_inst *dst,
                       brw_compact_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint16_t uncompacted =
      c->subreg_table[brw_compact_inst_subreg_index(devinfo, src)];

   if (devinfo->ver >= 20) {
      brw_inst_set_bits(dst, 33, 33, (uncompacted >> 0) & 0x1);
      brw_inst_set_bits(dst, 55, 51, (uncompacted >> 1) & 0x1f);
      brw_inst_set_bits(dst, 71, 67, (uncompacted >> 6) & 0x1f);
      brw_inst_set_bits(dst, 87, 87, (uncompacted >> 11) & 0x1);
   } else if (devinfo->ver >= 12) {
      brw_inst_set_bits(dst, 103, 99, (uncompacted >> 10));
      brw_inst_set_bits(dst, 71, 67, (uncompacted >> 5) & 0x1f);
      brw_inst_set_bits(dst, 55, 51, (uncompacted >> 0) & 0x1f);
   } else {
      brw_inst_set_bits(dst, 100, 96, (uncompacted >> 10));
      brw_inst_set_bits(dst, 68, 64, (uncompacted >> 5) & 0x1f);
      brw_inst_set_bits(dst, 52, 48, (uncompacted >> 0) & 0x1f);
   }
}

/* Inverse of set_src0_index. */
static void
set_uncompacted_src0(const struct compaction_state *c, brw_inst *dst,
                     brw_compact_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint32_t compacted = brw_compact_inst_src0_index(devinfo, src);
   uint16_t uncompacted = c->src0_index_table[compacted];

   if (devinfo->ver >= 12) {
      if (devinfo->ver < 20)
         brw_inst_set_bits(dst, 87, 87, (uncompacted >> 11) & 0x1);
      brw_inst_set_bits(dst, 86, 84, (uncompacted >> 8) & 0x7);
      brw_inst_set_bits(dst, 83, 81, (uncompacted >> 5) & 0x7);
      brw_inst_set_bits(dst, 80, 80, (uncompacted >> 4) & 0x1);
      brw_inst_set_bits(dst, 65, 64, (uncompacted >> 2) & 0x3);
      brw_inst_set_bits(dst, 45, 44, (uncompacted >> 0) & 0x3);
   } else {
      brw_inst_set_bits(dst, 88, 77, uncompacted);
   }
}

/* Inverse of set_src1_index (register, non-immediate case). */
static void
set_uncompacted_src1(const struct compaction_state *c, brw_inst *dst,
                     brw_compact_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint16_t uncompacted =
      c->src1_index_table[brw_compact_inst_src1_index(devinfo, src)];

   if (devinfo->ver >= 20) {
      brw_inst_set_bits(dst, 121, 120, (uncompacted >> 14) & 0x3);
      brw_inst_set_bits(dst, 118, 116, (uncompacted >> 11) & 0x7);
      brw_inst_set_bits(dst, 115, 113, (uncompacted >> 8) & 0x7);
      brw_inst_set_bits(dst, 112, 112, (uncompacted >> 7) & 0x1);
      brw_inst_set_bits(dst, 103, 99, (uncompacted >> 2) & 0x1f);
      brw_inst_set_bits(dst, 97, 96, (uncompacted >> 0) & 0x3);
   } else if (devinfo->ver >= 12) {
      brw_inst_set_bits(dst, 121, 120, (uncompacted >> 10));
      brw_inst_set_bits(dst, 119, 116, (uncompacted >> 6) & 0xf);
      brw_inst_set_bits(dst, 115, 113, (uncompacted >> 3) & 0x7);
      brw_inst_set_bits(dst, 112, 112, (uncompacted >> 2) & 0x1);
      brw_inst_set_bits(dst, 97, 96, (uncompacted >> 0) & 0x3);
   } else {
      brw_inst_set_bits(dst, 120, 109, uncompacted);
   }
}

/* Inverse of set_3src_control_index.  Continues past this chunk. */
static void
set_uncompacted_3src_control_index(const struct compaction_state *c,
                                   brw_inst *dst, brw_compact_inst *src,
                                   bool is_dpas)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   assert(devinfo->ver >= 8);

   if (devinfo->ver >= 20) {
      uint64_t compacted = brw_compact_inst_3src_control_index(devinfo, src);
      uint64_t uncompacted = is_dpas ?
xe2_3src_dpas_control_index_table[compacted] : + xe2_3src_control_index_table[compacted]; + + brw_inst_set_bits(dst, 95, 92, (uncompacted >> 30) & 0xf); + brw_inst_set_bits(dst, 90, 88, (uncompacted >> 27) & 0x7); + brw_inst_set_bits(dst, 82, 80, (uncompacted >> 24) & 0x7); + brw_inst_set_bits(dst, 50, 50, (uncompacted >> 23) & 0x1); + brw_inst_set_bits(dst, 49, 48, (uncompacted >> 21) & 0x3); + brw_inst_set_bits(dst, 42, 40, (uncompacted >> 18) & 0x7); + brw_inst_set_bits(dst, 39, 39, (uncompacted >> 17) & 0x1); + brw_inst_set_bits(dst, 38, 36, (uncompacted >> 14) & 0x7); + brw_inst_set_bits(dst, 34, 34, (uncompacted >> 13) & 0x1); + brw_inst_set_bits(dst, 32, 32, (uncompacted >> 12) & 0x1); + brw_inst_set_bits(dst, 31, 31, (uncompacted >> 11) & 0x1); + brw_inst_set_bits(dst, 28, 28, (uncompacted >> 10) & 0x1); + brw_inst_set_bits(dst, 27, 26, (uncompacted >> 8) & 0x3); + brw_inst_set_bits(dst, 25, 24, (uncompacted >> 6) & 0x3); + brw_inst_set_bits(dst, 23, 21, (uncompacted >> 3) & 0x7); + brw_inst_set_bits(dst, 20, 18, (uncompacted >> 0) & 0x7); + + } else if (devinfo->verx10 >= 125) { + uint64_t compacted = brw_compact_inst_3src_control_index(devinfo, src); + uint64_t uncompacted = xehp_3src_control_index_table[compacted]; + + brw_inst_set_bits(dst, 95, 92, (uncompacted >> 33)); + brw_inst_set_bits(dst, 90, 88, (uncompacted >> 30) & 0x7); + brw_inst_set_bits(dst, 82, 80, (uncompacted >> 27) & 0x7); + brw_inst_set_bits(dst, 50, 50, (uncompacted >> 26) & 0x1); + brw_inst_set_bits(dst, 49, 48, (uncompacted >> 24) & 0x3); + brw_inst_set_bits(dst, 42, 40, (uncompacted >> 21) & 0x7); + brw_inst_set_bits(dst, 39, 39, (uncompacted >> 20) & 0x1); + brw_inst_set_bits(dst, 38, 36, (uncompacted >> 17) & 0x7); + brw_inst_set_bits(dst, 34, 34, (uncompacted >> 16) & 0x1); + brw_inst_set_bits(dst, 33, 33, (uncompacted >> 15) & 0x1); + brw_inst_set_bits(dst, 32, 32, (uncompacted >> 14) & 0x1); + brw_inst_set_bits(dst, 31, 31, (uncompacted >> 13) & 0x1); + brw_inst_set_bits(dst, 
28, 28, (uncompacted >> 12) & 0x1); + brw_inst_set_bits(dst, 27, 24, (uncompacted >> 8) & 0xf); + brw_inst_set_bits(dst, 23, 23, (uncompacted >> 7) & 0x1); + brw_inst_set_bits(dst, 22, 22, (uncompacted >> 6) & 0x1); + brw_inst_set_bits(dst, 21, 19, (uncompacted >> 3) & 0x7); + brw_inst_set_bits(dst, 18, 16, (uncompacted >> 0) & 0x7); + + } else if (devinfo->ver >= 12) { + uint64_t compacted = brw_compact_inst_3src_control_index(devinfo, src); + uint64_t uncompacted = gfx12_3src_control_index_table[compacted]; + + brw_inst_set_bits(dst, 95, 92, (uncompacted >> 32)); + brw_inst_set_bits(dst, 90, 88, (uncompacted >> 29) & 0x7); + brw_inst_set_bits(dst, 82, 80, (uncompacted >> 26) & 0x7); + brw_inst_set_bits(dst, 50, 50, (uncompacted >> 25) & 0x1); + brw_inst_set_bits(dst, 48, 48, (uncompacted >> 24) & 0x1); + brw_inst_set_bits(dst, 42, 40, (uncompacted >> 21) & 0x7); + brw_inst_set_bits(dst, 39, 39, (uncompacted >> 20) & 0x1); + brw_inst_set_bits(dst, 38, 36, (uncompacted >> 17) & 0x7); + brw_inst_set_bits(dst, 34, 34, (uncompacted >> 16) & 0x1); + brw_inst_set_bits(dst, 33, 33, (uncompacted >> 15) & 0x1); + brw_inst_set_bits(dst, 32, 32, (uncompacted >> 14) & 0x1); + brw_inst_set_bits(dst, 31, 31, (uncompacted >> 13) & 0x1); + brw_inst_set_bits(dst, 28, 28, (uncompacted >> 12) & 0x1); + brw_inst_set_bits(dst, 27, 24, (uncompacted >> 8) & 0xf); + brw_inst_set_bits(dst, 23, 23, (uncompacted >> 7) & 0x1); + brw_inst_set_bits(dst, 22, 22, (uncompacted >> 6) & 0x1); + brw_inst_set_bits(dst, 21, 19, (uncompacted >> 3) & 0x7); + brw_inst_set_bits(dst, 18, 16, (uncompacted >> 0) & 0x7); + } else { + uint32_t compacted = brw_compact_inst_3src_control_index(devinfo, src); + uint32_t uncompacted = gfx8_3src_control_index_table[compacted]; + + brw_inst_set_bits(dst, 34, 32, (uncompacted >> 21) & 0x7); + brw_inst_set_bits(dst, 28, 8, (uncompacted >> 0) & 0x1fffff); + + if (devinfo->ver >= 9 || devinfo->platform == INTEL_PLATFORM_CHV) + brw_inst_set_bits(dst, 36, 35, (uncompacted 
>> 24) & 0x3); + } +} + +static void +set_uncompacted_3src_source_index(const struct intel_device_info *devinfo, + brw_inst *dst, brw_compact_inst *src, + bool is_dpas) +{ + assert(devinfo->ver >= 8); + + uint32_t compacted = brw_compact_inst_3src_source_index(devinfo, src); + + if (devinfo->ver >= 12) { + const uint32_t *three_src_source_index_table = + devinfo->ver >= 20 ? (is_dpas ? xe2_3src_dpas_source_index_table : + xe2_3src_source_index_table) : + devinfo->verx10 >= 125 ? xehp_3src_source_index_table : + gfx12_3src_source_index_table; + uint32_t uncompacted = three_src_source_index_table[compacted]; + + brw_inst_set_bits(dst, 114, 114, (uncompacted >> 20)); + brw_inst_set_bits(dst, 113, 112, (uncompacted >> 18) & 0x3); + brw_inst_set_bits(dst, 98, 98, (uncompacted >> 17) & 0x1); + brw_inst_set_bits(dst, 97, 96, (uncompacted >> 15) & 0x3); + brw_inst_set_bits(dst, 91, 91, (uncompacted >> 14) & 0x1); + brw_inst_set_bits(dst, 87, 86, (uncompacted >> 12) & 0x3); + brw_inst_set_bits(dst, 85, 84, (uncompacted >> 10) & 0x3); + brw_inst_set_bits(dst, 83, 83, (uncompacted >> 9) & 0x1); + brw_inst_set_bits(dst, 66, 66, (uncompacted >> 8) & 0x1); + brw_inst_set_bits(dst, 65, 64, (uncompacted >> 6) & 0x3); + brw_inst_set_bits(dst, 47, 47, (uncompacted >> 5) & 0x1); + brw_inst_set_bits(dst, 46, 46, (uncompacted >> 4) & 0x1); + brw_inst_set_bits(dst, 45, 44, (uncompacted >> 2) & 0x3); + brw_inst_set_bits(dst, 43, 43, (uncompacted >> 1) & 0x1); + brw_inst_set_bits(dst, 35, 35, (uncompacted >> 0) & 0x1); + } else { + uint64_t uncompacted = gfx8_3src_source_index_table[compacted]; + + brw_inst_set_bits(dst, 83, 83, (uncompacted >> 43) & 0x1); + brw_inst_set_bits(dst, 114, 107, (uncompacted >> 35) & 0xff); + brw_inst_set_bits(dst, 93, 86, (uncompacted >> 27) & 0xff); + brw_inst_set_bits(dst, 72, 65, (uncompacted >> 19) & 0xff); + brw_inst_set_bits(dst, 55, 37, (uncompacted >> 0) & 0x7ffff); + + if (devinfo->ver >= 9 || devinfo->platform == INTEL_PLATFORM_CHV) { + 
brw_inst_set_bits(dst, 126, 125, (uncompacted >> 47) & 0x3); + brw_inst_set_bits(dst, 105, 104, (uncompacted >> 45) & 0x3); + brw_inst_set_bits(dst, 84, 84, (uncompacted >> 44) & 0x1); + } else { + brw_inst_set_bits(dst, 125, 125, (uncompacted >> 45) & 0x1); + brw_inst_set_bits(dst, 104, 104, (uncompacted >> 44) & 0x1); + } + } +} + +static void +set_uncompacted_3src_subreg_index(const struct intel_device_info *devinfo, + brw_inst *dst, brw_compact_inst *src) +{ + assert(devinfo->ver >= 12); + + uint32_t compacted = brw_compact_inst_3src_subreg_index(devinfo, src); + uint32_t uncompacted = (devinfo->ver >= 20 ? xe2_3src_subreg_table[compacted]: + gfx12_3src_subreg_table[compacted]); + + brw_inst_set_bits(dst, 119, 115, (uncompacted >> 15)); + brw_inst_set_bits(dst, 103, 99, (uncompacted >> 10) & 0x1f); + brw_inst_set_bits(dst, 71, 67, (uncompacted >> 5) & 0x1f); + brw_inst_set_bits(dst, 55, 51, (uncompacted >> 0) & 0x1f); +} + +static void +brw_uncompact_3src_instruction(const struct compaction_state *c, + brw_inst *dst, brw_compact_inst *src, bool is_dpas) +{ + const struct intel_device_info *devinfo = c->isa->devinfo; + assert(devinfo->ver >= 8); + +#define uncompact(field) \ + brw_inst_set_3src_##field(devinfo, dst, brw_compact_inst_3src_##field(devinfo, src)) +#define uncompact_a16(field) \ + brw_inst_set_3src_a16_##field(devinfo, dst, brw_compact_inst_3src_##field(devinfo, src)) + + uncompact(hw_opcode); + + if (devinfo->ver >= 12) { + set_uncompacted_3src_control_index(c, dst, src, is_dpas); + set_uncompacted_3src_source_index(devinfo, dst, src, is_dpas); + set_uncompacted_3src_subreg_index(devinfo, dst, src); + + uncompact(debug_control); + uncompact(swsb); + uncompact(dst_reg_nr); + uncompact(src0_reg_nr); + uncompact(src1_reg_nr); + uncompact(src2_reg_nr); + } else { + set_uncompacted_3src_control_index(c, dst, src, is_dpas); + set_uncompacted_3src_source_index(devinfo, dst, src, is_dpas); + + uncompact(dst_reg_nr); + uncompact_a16(src0_rep_ctrl); + 
uncompact(debug_control); + uncompact(saturate); + uncompact_a16(src1_rep_ctrl); + uncompact_a16(src2_rep_ctrl); + uncompact(src0_reg_nr); + uncompact(src1_reg_nr); + uncompact(src2_reg_nr); + uncompact_a16(src0_subreg_nr); + uncompact_a16(src1_subreg_nr); + uncompact_a16(src2_subreg_nr); + } + brw_inst_set_3src_cmpt_control(devinfo, dst, false); + +#undef uncompact +#undef uncompact_a16 +} + +static void +uncompact_instruction(const struct compaction_state *c, brw_inst *dst, + brw_compact_inst *src) +{ + const struct intel_device_info *devinfo = c->isa->devinfo; + memset(dst, 0, sizeof(*dst)); + + if (devinfo->ver >= 8) { + const enum opcode opcode = + brw_opcode_decode(c->isa, brw_compact_inst_3src_hw_opcode(devinfo, src)); + if (is_3src(c->isa, opcode)) { + const bool is_dpas = opcode == BRW_OPCODE_DPAS; + brw_uncompact_3src_instruction(c, dst, src, is_dpas); + return; + } + } + +#define uncompact(field) \ + brw_inst_set_##field(devinfo, dst, brw_compact_inst_##field(devinfo, src)) +#define uncompact_reg(field) \ + brw_inst_set_##field##_da_reg_nr(devinfo, dst, \ + brw_compact_inst_##field##_reg_nr(devinfo, src)) + + uncompact(hw_opcode); + uncompact(debug_control); + + set_uncompacted_control(c, dst, src); + set_uncompacted_datatype(c, dst, src); + set_uncompacted_subreg(c, dst, src); + set_uncompacted_src0(c, dst, src); + + enum brw_reg_type type; + if (has_immediate(devinfo, dst, &type)) { + unsigned imm = uncompact_immediate(devinfo, type, + brw_compact_inst_imm(devinfo, src)); + brw_inst_set_imm_ud(devinfo, dst, imm); + } else { + set_uncompacted_src1(c, dst, src); + uncompact_reg(src1); + } + + if (devinfo->ver >= 12) { + uncompact(swsb); + uncompact_reg(dst); + uncompact_reg(src0); + } else { + if (devinfo->ver >= 6) { + uncompact(acc_wr_control); + } else { + uncompact(mask_control_ex); + } + + uncompact(cond_modifier); + + if (devinfo->ver <= 6) + uncompact(flag_subreg_nr); + + uncompact_reg(dst); + uncompact_reg(src0); + } + 
brw_inst_set_cmpt_control(devinfo, dst, false); + +#undef uncompact +#undef uncompact_reg +} + +void +brw_uncompact_instruction(const struct brw_isa_info *isa, + brw_inst *dst, brw_compact_inst *src) +{ + struct compaction_state c; + compaction_state_init(&c, isa); + uncompact_instruction(&c, dst, src); +} + +void +brw_debug_compact_uncompact(const struct brw_isa_info *isa, + brw_inst *orig, + brw_inst *uncompacted) +{ + fprintf(stderr, "Instruction compact/uncompact changed (gen%d):\n", + isa->devinfo->ver); + + fprintf(stderr, " before: "); + brw_disassemble_inst(stderr, isa, orig, true, 0, NULL); + + fprintf(stderr, " after: "); + brw_disassemble_inst(stderr, isa, uncompacted, false, 0, NULL); + + uint32_t *before_bits = (uint32_t *)orig; + uint32_t *after_bits = (uint32_t *)uncompacted; + fprintf(stderr, " changed bits:\n"); + for (int i = 0; i < 128; i++) { + uint32_t before = before_bits[i / 32] & (1 << (i & 31)); + uint32_t after = after_bits[i / 32] & (1 << (i & 31)); + + if (before != after) { + fprintf(stderr, " bit %d, %s to %s\n", i, + before ? "set" : "unset", + after ? "set" : "unset"); + } + } +} + +static int +compacted_between(int old_ip, int old_target_ip, int *compacted_counts) +{ + int this_compacted_count = compacted_counts[old_ip]; + int target_compacted_count = compacted_counts[old_target_ip]; + return target_compacted_count - this_compacted_count; +} + +static void +update_uip_jip(const struct brw_isa_info *isa, brw_inst *insn, + int this_old_ip, int *compacted_counts) +{ + const struct intel_device_info *devinfo = isa->devinfo; + + /* JIP and UIP are in units of: + * - bytes on Gfx8+; and + * - compacted instructions on Gfx6+. + */ + int shift = devinfo->ver >= 8 ? 
3 : 0; + + int32_t jip_compacted = brw_inst_jip(devinfo, insn) >> shift; + jip_compacted -= compacted_between(this_old_ip, + this_old_ip + (jip_compacted / 2), + compacted_counts); + brw_inst_set_jip(devinfo, insn, jip_compacted << shift); + + if (brw_inst_opcode(isa, insn) == BRW_OPCODE_ENDIF || + brw_inst_opcode(isa, insn) == BRW_OPCODE_WHILE || + (brw_inst_opcode(isa, insn) == BRW_OPCODE_ELSE && devinfo->ver <= 7)) + return; + + int32_t uip_compacted = brw_inst_uip(devinfo, insn) >> shift; + uip_compacted -= compacted_between(this_old_ip, + this_old_ip + (uip_compacted / 2), + compacted_counts); + brw_inst_set_uip(devinfo, insn, uip_compacted << shift); +} + +static void +update_gfx4_jump_count(const struct intel_device_info *devinfo, brw_inst *insn, + int this_old_ip, int *compacted_counts) +{ + assert(devinfo->ver == 5 || devinfo->platform == INTEL_PLATFORM_G4X); + + /* Jump Count is in units of: + * - uncompacted instructions on G45; and + * - compacted instructions on Gfx5. + */ + int shift = devinfo->platform == INTEL_PLATFORM_G4X ? 
1 : 0; + + int jump_count_compacted = brw_inst_gfx4_jump_count(devinfo, insn) << shift; + + int target_old_ip = this_old_ip + (jump_count_compacted / 2); + + int this_compacted_count = compacted_counts[this_old_ip]; + int target_compacted_count = compacted_counts[target_old_ip]; + + jump_count_compacted -= (target_compacted_count - this_compacted_count); + brw_inst_set_gfx4_jump_count(devinfo, insn, jump_count_compacted >> shift); +} + +static void +compaction_state_init(struct compaction_state *c, + const struct brw_isa_info *isa) +{ + const struct intel_device_info *devinfo = isa->devinfo; + + assert(g45_control_index_table[ARRAY_SIZE(g45_control_index_table) - 1] != 0); + assert(g45_datatype_table[ARRAY_SIZE(g45_datatype_table) - 1] != 0); + assert(g45_subreg_table[ARRAY_SIZE(g45_subreg_table) - 1] != 0); + assert(g45_src_index_table[ARRAY_SIZE(g45_src_index_table) - 1] != 0); + assert(gfx6_control_index_table[ARRAY_SIZE(gfx6_control_index_table) - 1] != 0); + assert(gfx6_datatype_table[ARRAY_SIZE(gfx6_datatype_table) - 1] != 0); + assert(gfx6_subreg_table[ARRAY_SIZE(gfx6_subreg_table) - 1] != 0); + assert(gfx6_src_index_table[ARRAY_SIZE(gfx6_src_index_table) - 1] != 0); + assert(gfx7_control_index_table[ARRAY_SIZE(gfx7_control_index_table) - 1] != 0); + assert(gfx7_datatype_table[ARRAY_SIZE(gfx7_datatype_table) - 1] != 0); + assert(gfx7_subreg_table[ARRAY_SIZE(gfx7_subreg_table) - 1] != 0); + assert(gfx7_src_index_table[ARRAY_SIZE(gfx7_src_index_table) - 1] != 0); + assert(gfx8_control_index_table[ARRAY_SIZE(gfx8_control_index_table) - 1] != 0); + assert(gfx8_datatype_table[ARRAY_SIZE(gfx8_datatype_table) - 1] != 0); + assert(gfx8_subreg_table[ARRAY_SIZE(gfx8_subreg_table) - 1] != 0); + assert(gfx8_src_index_table[ARRAY_SIZE(gfx8_src_index_table) - 1] != 0); + assert(gfx11_datatype_table[ARRAY_SIZE(gfx11_datatype_table) - 1] != 0); + assert(gfx12_control_index_table[ARRAY_SIZE(gfx12_control_index_table) - 1] != 0); + 
assert(gfx12_datatype_table[ARRAY_SIZE(gfx12_datatype_table) - 1] != 0); + assert(gfx12_subreg_table[ARRAY_SIZE(gfx12_subreg_table) - 1] != 0); + assert(gfx12_src0_index_table[ARRAY_SIZE(gfx12_src0_index_table) - 1] != 0); + assert(gfx12_src1_index_table[ARRAY_SIZE(gfx12_src1_index_table) - 1] != 0); + assert(xehp_src0_index_table[ARRAY_SIZE(xehp_src0_index_table) - 1] != 0); + assert(xehp_src1_index_table[ARRAY_SIZE(xehp_src1_index_table) - 1] != 0); + assert(xe2_control_index_table[ARRAY_SIZE(xe2_control_index_table) - 1] != 0); + assert(xe2_datatype_table[ARRAY_SIZE(xe2_datatype_table) - 1] != 0); + assert(xe2_subreg_table[ARRAY_SIZE(xe2_subreg_table) - 1] != 0); + assert(xe2_src0_index_table[ARRAY_SIZE(xe2_src0_index_table) - 1] != 0); + assert(xe2_src1_index_table[ARRAY_SIZE(xe2_src1_index_table) - 1] != 0); + + c->isa = isa; + switch (devinfo->ver) { + case 20: + c->control_index_table = xe2_control_index_table; + c->datatype_table = xe2_datatype_table; + c->subreg_table = xe2_subreg_table; + c->src0_index_table = xe2_src0_index_table; + c->src1_index_table = xe2_src1_index_table; + break; + case 12: + c->control_index_table = gfx12_control_index_table;; + c->datatype_table = gfx12_datatype_table; + c->subreg_table = gfx12_subreg_table; + if (devinfo->verx10 >= 125) { + c->src0_index_table = xehp_src0_index_table; + c->src1_index_table = xehp_src1_index_table; + } else { + c->src0_index_table = gfx12_src0_index_table; + c->src1_index_table = gfx12_src1_index_table; + } + break; + case 11: + c->control_index_table = gfx8_control_index_table; + c->datatype_table = gfx11_datatype_table; + c->subreg_table = gfx8_subreg_table; + c->src0_index_table = gfx8_src_index_table; + c->src1_index_table = gfx8_src_index_table; + break; + case 9: + case 8: + c->control_index_table = gfx8_control_index_table; + c->datatype_table = gfx8_datatype_table; + c->subreg_table = gfx8_subreg_table; + c->src0_index_table = gfx8_src_index_table; + c->src1_index_table = 
gfx8_src_index_table; + break; + case 7: + c->control_index_table = gfx7_control_index_table; + c->datatype_table = gfx7_datatype_table; + c->subreg_table = gfx7_subreg_table; + c->src0_index_table = gfx7_src_index_table; + c->src1_index_table = gfx7_src_index_table; + break; + case 6: + c->control_index_table = gfx6_control_index_table; + c->datatype_table = gfx6_datatype_table; + c->subreg_table = gfx6_subreg_table; + c->src0_index_table = gfx6_src_index_table; + c->src1_index_table = gfx6_src_index_table; + break; + case 5: + case 4: + c->control_index_table = g45_control_index_table; + c->datatype_table = g45_datatype_table; + c->subreg_table = g45_subreg_table; + c->src0_index_table = g45_src_index_table; + c->src1_index_table = g45_src_index_table; + break; + default: + unreachable("unknown generation"); + } +} + +void +brw_compact_instructions(struct brw_codegen *p, int start_offset, + struct disasm_info *disasm) +{ + if (INTEL_DEBUG(DEBUG_NO_COMPACTION)) + return; + + const struct intel_device_info *devinfo = p->devinfo; + if (devinfo->ver == 4 && devinfo->platform != INTEL_PLATFORM_G4X) + return; + + void *store = p->store + start_offset / 16; + /* For an instruction at byte offset 16*i before compaction, this is the + * number of compacted instructions minus the number of padding NOP/NENOPs + * that preceded it. + */ + unsigned num_compacted_counts = + (p->next_insn_offset - start_offset) / sizeof(brw_inst); + int *compacted_counts = + calloc(1, sizeof(*compacted_counts) * num_compacted_counts); + + /* For an instruction at byte offset 8*i after compaction, this was its IP + * (in 16-byte units) before compaction. 
+ */ + unsigned num_old_ip = + (p->next_insn_offset - start_offset) / sizeof(brw_compact_inst) + 1; + int *old_ip = calloc(1, sizeof(*old_ip) * num_old_ip); + + struct compaction_state c; + compaction_state_init(&c, p->isa); + + int offset = 0; + int compacted_count = 0; + for (int src_offset = 0; src_offset < p->next_insn_offset - start_offset; + src_offset += sizeof(brw_inst)) { + brw_inst *src = store + src_offset; + void *dst = store + offset; + + old_ip[offset / sizeof(brw_compact_inst)] = src_offset / sizeof(brw_inst); + compacted_counts[src_offset / sizeof(brw_inst)] = compacted_count; + + brw_inst inst = precompact(p->isa, *src); + brw_inst saved = inst; + + if (try_compact_instruction(&c, dst, &inst)) { + compacted_count++; + + if (INTEL_DEBUG(DEBUG_VS | DEBUG_GS | DEBUG_TCS | DEBUG_TASK | + DEBUG_WM | DEBUG_CS | DEBUG_TES | DEBUG_MESH | + DEBUG_RT)) { + brw_inst uncompacted; + uncompact_instruction(&c, &uncompacted, dst); + if (memcmp(&saved, &uncompacted, sizeof(uncompacted))) { + brw_debug_compact_uncompact(p->isa, &saved, &uncompacted); + } + } + + offset += sizeof(brw_compact_inst); + } else { + /* All uncompacted instructions need to be aligned on G45. */ + if ((offset & sizeof(brw_compact_inst)) != 0 && + devinfo->platform == INTEL_PLATFORM_G4X) { + brw_compact_inst *align = store + offset; + memset(align, 0, sizeof(*align)); + brw_compact_inst_set_hw_opcode( + devinfo, align, brw_opcode_encode(p->isa, BRW_OPCODE_NENOP)); + brw_compact_inst_set_cmpt_control(devinfo, align, true); + offset += sizeof(brw_compact_inst); + compacted_count--; + compacted_counts[src_offset / sizeof(brw_inst)] = compacted_count; + old_ip[offset / sizeof(brw_compact_inst)] = src_offset / sizeof(brw_inst); + + dst = store + offset; + } + + /* If we didn't compact this instruction, we need to move it down into + * place. 
+ */ + if (offset != src_offset) { + memmove(dst, src, sizeof(brw_inst)); + } + offset += sizeof(brw_inst); + } + } + + /* Add an entry for the ending offset of the program. This greatly + * simplifies the linked list walk at the end of the function. + */ + old_ip[offset / sizeof(brw_compact_inst)] = + (p->next_insn_offset - start_offset) / sizeof(brw_inst); + + /* Fix up control flow offsets. */ + p->next_insn_offset = start_offset + offset; + for (offset = 0; offset < p->next_insn_offset - start_offset; + offset = next_offset(devinfo, store, offset)) { + brw_inst *insn = store + offset; + int this_old_ip = old_ip[offset / sizeof(brw_compact_inst)]; + int this_compacted_count = compacted_counts[this_old_ip]; + + switch (brw_inst_opcode(p->isa, insn)) { + case BRW_OPCODE_BREAK: + case BRW_OPCODE_CONTINUE: + case BRW_OPCODE_HALT: + if (devinfo->ver >= 6) { + update_uip_jip(p->isa, insn, this_old_ip, compacted_counts); + } else { + update_gfx4_jump_count(devinfo, insn, this_old_ip, + compacted_counts); + } + break; + + case BRW_OPCODE_IF: + case BRW_OPCODE_IFF: + case BRW_OPCODE_ELSE: + case BRW_OPCODE_ENDIF: + case BRW_OPCODE_WHILE: + if (devinfo->ver >= 7) { + if (brw_inst_cmpt_control(devinfo, insn)) { + brw_inst uncompacted; + uncompact_instruction(&c, &uncompacted, + (brw_compact_inst *)insn); + + update_uip_jip(p->isa, &uncompacted, this_old_ip, + compacted_counts); + + bool ret = try_compact_instruction(&c, (brw_compact_inst *)insn, + &uncompacted); + assert(ret); (void)ret; + } else { + update_uip_jip(p->isa, insn, this_old_ip, compacted_counts); + } + } else if (devinfo->ver == 6) { + assert(!brw_inst_cmpt_control(devinfo, insn)); + + /* Jump Count is in units of compacted instructions on Gfx6. 
*/ + int jump_count_compacted = brw_inst_gfx6_jump_count(devinfo, insn); + + int target_old_ip = this_old_ip + (jump_count_compacted / 2); + int target_compacted_count = compacted_counts[target_old_ip]; + jump_count_compacted -= (target_compacted_count - this_compacted_count); + brw_inst_set_gfx6_jump_count(devinfo, insn, jump_count_compacted); + } else { + update_gfx4_jump_count(devinfo, insn, this_old_ip, + compacted_counts); + } + break; + + case BRW_OPCODE_ADD: + /* Add instructions modifying the IP register use an immediate src1, + * and Gens that use this cannot compact instructions with immediate + * operands. + */ + if (brw_inst_cmpt_control(devinfo, insn)) + break; + + if (brw_inst_dst_reg_file(devinfo, insn) == BRW_ARCHITECTURE_REGISTER_FILE && + brw_inst_dst_da_reg_nr(devinfo, insn) == BRW_ARF_IP) { + assert(brw_inst_src1_reg_file(devinfo, insn) == BRW_IMMEDIATE_VALUE); + + int shift = 3; + int jump_compacted = brw_inst_imm_d(devinfo, insn) >> shift; + + int target_old_ip = this_old_ip + (jump_compacted / 2); + int target_compacted_count = compacted_counts[target_old_ip]; + jump_compacted -= (target_compacted_count - this_compacted_count); + brw_inst_set_imm_ud(devinfo, insn, jump_compacted << shift); + } + break; + + default: + break; + } + } + + /* p->nr_insn is counting the number of uncompacted instructions still, so + * divide. We do want to be sure there's a valid instruction in any + * alignment padding, so that the next compression pass (for the FS 8/16 + * compile passes) parses correctly. 
+ */ + if (p->next_insn_offset & sizeof(brw_compact_inst)) { + brw_compact_inst *align = store + offset; + memset(align, 0, sizeof(*align)); + brw_compact_inst_set_hw_opcode( + devinfo, align, brw_opcode_encode(p->isa, BRW_OPCODE_NOP)); + brw_compact_inst_set_cmpt_control(devinfo, align, true); + p->next_insn_offset += sizeof(brw_compact_inst); + } + p->nr_insn = p->next_insn_offset / sizeof(brw_inst); + + for (int i = 0; i < p->num_relocs; i++) { + if (p->relocs[i].offset < (uint32_t)start_offset) + continue; + + assert(p->relocs[i].offset % 16 == 0); + unsigned idx = (p->relocs[i].offset - start_offset) / 16; + p->relocs[i].offset -= compacted_counts[idx] * 8; + } + + /* Update the instruction offsets for each group. */ + if (disasm) { + int offset = 0; + + foreach_list_typed(struct inst_group, group, link, &disasm->group_list) { + while (start_offset + old_ip[offset / sizeof(brw_compact_inst)] * + sizeof(brw_inst) != group->offset) { + assert(start_offset + old_ip[offset / sizeof(brw_compact_inst)] * + sizeof(brw_inst) < group->offset); + offset = next_offset(devinfo, store, offset); + } + + group->offset = start_offset + offset; + + offset = next_offset(devinfo, store, offset); + } + } + + free(compacted_counts); + free(old_ip); +} diff --git a/src/intel/compiler/elk/brw_eu_defines.h b/src/intel/compiler/elk/brw_eu_defines.h new file mode 100644 index 00000000000..0302334014d --- /dev/null +++ b/src/intel/compiler/elk/brw_eu_defines.h @@ -0,0 +1,2218 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#ifndef BRW_EU_DEFINES_H +#define BRW_EU_DEFINES_H + +#include +#include +#include "util/macros.h" +#include "dev/intel_device_info.h" + +/* The following hunk, up-to "Execution Unit" is used by both the + * intel/compiler and i965 codebase. 
*/ + +#define INTEL_MASK(high, low) (((1u<<((high)-(low)+1))-1)<<(low)) +/* Using the GNU statement expression extension */ +#define SET_FIELD(value, field) \ + ({ \ + uint32_t fieldval = (uint32_t)(value) << field ## _SHIFT; \ + assert((fieldval & ~ field ## _MASK) == 0); \ + fieldval & field ## _MASK; \ + }) + +#define SET_BITS(value, high, low) \ + ({ \ + const uint32_t fieldval = (uint32_t)(value) << (low); \ + assert((fieldval & ~INTEL_MASK(high, low)) == 0); \ + fieldval & INTEL_MASK(high, low); \ + }) + +#define GET_BITS(data, high, low) ((data & INTEL_MASK((high), (low))) >> (low)) +#define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT) + +/* Bitfields for the URB_WRITE message, DW2 of message header: */ +#define URB_WRITE_PRIM_END 0x1 +#define URB_WRITE_PRIM_START 0x2 +#define URB_WRITE_PRIM_TYPE_SHIFT 2 + +#define BRW_SPRITE_POINT_ENABLE 16 + +# define GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT 0 +# define GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID 1 + +/* Execution Unit (EU) defines + */ + +#define BRW_ALIGN_1 0 +#define BRW_ALIGN_16 1 + +#define BRW_ADDRESS_DIRECT 0 +#define BRW_ADDRESS_REGISTER_INDIRECT_REGISTER 1 + +#define BRW_CHANNEL_X 0 +#define BRW_CHANNEL_Y 1 +#define BRW_CHANNEL_Z 2 +#define BRW_CHANNEL_W 3 + +enum brw_compression { + BRW_COMPRESSION_NONE = 0, + BRW_COMPRESSION_2NDHALF = 1, + BRW_COMPRESSION_COMPRESSED = 2, +}; + +#define GFX6_COMPRESSION_1Q 0 +#define GFX6_COMPRESSION_2Q 1 +#define GFX6_COMPRESSION_3Q 2 +#define GFX6_COMPRESSION_4Q 3 +#define GFX6_COMPRESSION_1H 0 +#define GFX6_COMPRESSION_2H 2 + +enum ENUM_PACKED brw_conditional_mod { + BRW_CONDITIONAL_NONE = 0, + BRW_CONDITIONAL_Z = 1, + BRW_CONDITIONAL_NZ = 2, + BRW_CONDITIONAL_EQ = 1, /* Z */ + BRW_CONDITIONAL_NEQ = 2, /* NZ */ + BRW_CONDITIONAL_G = 3, + BRW_CONDITIONAL_GE = 4, + BRW_CONDITIONAL_L = 5, + BRW_CONDITIONAL_LE = 6, + BRW_CONDITIONAL_R = 7, /* Gen <= 5 */ + BRW_CONDITIONAL_O = 8, + BRW_CONDITIONAL_U = 9, +}; + +#define BRW_DEBUG_NONE 0 +#define 
BRW_DEBUG_BREAKPOINT 1 + +#define BRW_DEPENDENCY_NORMAL 0 +#define BRW_DEPENDENCY_NOTCLEARED 1 +#define BRW_DEPENDENCY_NOTCHECKED 2 +#define BRW_DEPENDENCY_DISABLE 3 + +enum ENUM_PACKED brw_execution_size { + BRW_EXECUTE_1 = 0, + BRW_EXECUTE_2 = 1, + BRW_EXECUTE_4 = 2, + BRW_EXECUTE_8 = 3, + BRW_EXECUTE_16 = 4, + BRW_EXECUTE_32 = 5, +}; + +enum ENUM_PACKED brw_horizontal_stride { + BRW_HORIZONTAL_STRIDE_0 = 0, + BRW_HORIZONTAL_STRIDE_1 = 1, + BRW_HORIZONTAL_STRIDE_2 = 2, + BRW_HORIZONTAL_STRIDE_4 = 3, +}; + +enum ENUM_PACKED gfx10_align1_3src_src_horizontal_stride { + BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0 = 0, + BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1 = 1, + BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2 = 2, + BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4 = 3, +}; + +enum ENUM_PACKED gfx10_align1_3src_dst_horizontal_stride { + BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1 = 0, + BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_2 = 1, +}; + +#define BRW_INSTRUCTION_NORMAL 0 +#define BRW_INSTRUCTION_SATURATE 1 + +#define BRW_MASK_ENABLE 0 +#define BRW_MASK_DISABLE 1 + +/** @{ + * + * Gfx6 has replaced "mask enable/disable" with WECtrl, which is + * effectively the same but much simpler to think about. Now, there + * are two contributors ANDed together to whether channels are + * executed: The predication on the instruction, and the channel write + * enable. + */ +/** + * This is the default value. It means that a channel's write enable is set + * if the per-channel IP is pointing at this instruction. + */ +#define BRW_WE_NORMAL 0 +/** + * This is used like BRW_MASK_DISABLE, and causes all channels to have + * their write enable set. Note that predication still contributes to + * whether the channel actually gets written. + */ +#define BRW_WE_ALL 1 +/** @} */ + +enum opcode { + /* These are the actual hardware instructions. 
*/ + BRW_OPCODE_ILLEGAL, + BRW_OPCODE_SYNC, + BRW_OPCODE_MOV, + BRW_OPCODE_SEL, + BRW_OPCODE_MOVI, /**< G45+ */ + BRW_OPCODE_NOT, + BRW_OPCODE_AND, + BRW_OPCODE_OR, + BRW_OPCODE_XOR, + BRW_OPCODE_SHR, + BRW_OPCODE_SHL, + BRW_OPCODE_DIM, /**< Gfx7.5 only */ + BRW_OPCODE_SMOV, /**< Gfx8+ */ + BRW_OPCODE_ASR, + BRW_OPCODE_ROR, /**< Gfx11+ */ + BRW_OPCODE_ROL, /**< Gfx11+ */ + BRW_OPCODE_CMP, + BRW_OPCODE_CMPN, + BRW_OPCODE_CSEL, /**< Gfx8+ */ + BRW_OPCODE_F32TO16, /**< Gfx7 only */ + BRW_OPCODE_F16TO32, /**< Gfx7 only */ + BRW_OPCODE_BFREV, /**< Gfx7+ */ + BRW_OPCODE_BFE, /**< Gfx7+ */ + BRW_OPCODE_BFI1, /**< Gfx7+ */ + BRW_OPCODE_BFI2, /**< Gfx7+ */ + BRW_OPCODE_JMPI, + BRW_OPCODE_BRD, /**< Gfx7+ */ + BRW_OPCODE_IF, + BRW_OPCODE_IFF, /**< Pre-Gfx6 */ + BRW_OPCODE_BRC, /**< Gfx7+ */ + BRW_OPCODE_ELSE, + BRW_OPCODE_ENDIF, + BRW_OPCODE_DO, /**< Pre-Gfx6 */ + BRW_OPCODE_CASE, /**< Gfx6 only */ + BRW_OPCODE_WHILE, + BRW_OPCODE_BREAK, + BRW_OPCODE_CONTINUE, + BRW_OPCODE_HALT, + BRW_OPCODE_CALLA, /**< Gfx7.5+ */ + BRW_OPCODE_MSAVE, /**< Pre-Gfx6 */ + BRW_OPCODE_CALL, /**< Gfx6+ */ + BRW_OPCODE_MREST, /**< Pre-Gfx6 */ + BRW_OPCODE_RET, /**< Gfx6+ */ + BRW_OPCODE_PUSH, /**< Pre-Gfx6 */ + BRW_OPCODE_FORK, /**< Gfx6 only */ + BRW_OPCODE_GOTO, /**< Gfx8+ */ + BRW_OPCODE_POP, /**< Pre-Gfx6 */ + BRW_OPCODE_WAIT, + BRW_OPCODE_SEND, + BRW_OPCODE_SENDC, + BRW_OPCODE_SENDS, /**< Gfx9+ */ + BRW_OPCODE_SENDSC, /**< Gfx9+ */ + BRW_OPCODE_MATH, /**< Gfx6+ */ + BRW_OPCODE_ADD, + BRW_OPCODE_MUL, + BRW_OPCODE_AVG, + BRW_OPCODE_FRC, + BRW_OPCODE_RNDU, + BRW_OPCODE_RNDD, + BRW_OPCODE_RNDE, + BRW_OPCODE_RNDZ, + BRW_OPCODE_MAC, + BRW_OPCODE_MACH, + BRW_OPCODE_LZD, + BRW_OPCODE_FBH, /**< Gfx7+ */ + BRW_OPCODE_FBL, /**< Gfx7+ */ + BRW_OPCODE_CBIT, /**< Gfx7+ */ + BRW_OPCODE_ADDC, /**< Gfx7+ */ + BRW_OPCODE_SUBB, /**< Gfx7+ */ + BRW_OPCODE_SAD2, + BRW_OPCODE_SADA2, + BRW_OPCODE_ADD3, /* Gen12+ only */ + BRW_OPCODE_DP4, + BRW_OPCODE_DPH, + BRW_OPCODE_DP3, + BRW_OPCODE_DP2, + BRW_OPCODE_DP4A, /**< 
Gfx12+ */ + BRW_OPCODE_LINE, + BRW_OPCODE_DPAS, /**< Gfx12.5+ */ + BRW_OPCODE_PLN, /**< G45+ */ + BRW_OPCODE_MAD, /**< Gfx6+ */ + BRW_OPCODE_LRP, /**< Gfx6+ */ + BRW_OPCODE_MADM, /**< Gfx8+ */ + BRW_OPCODE_NENOP, /**< G45 only */ + BRW_OPCODE_NOP, + + NUM_BRW_OPCODES, + + /* These are compiler backend opcodes that get translated into other + * instructions. + */ + FS_OPCODE_FB_WRITE = NUM_BRW_OPCODES, + + /** + * Same as FS_OPCODE_FB_WRITE but expects its arguments separately as + * individual sources instead of as a single payload blob. The + * position/ordering of the arguments are defined by the enum + * fb_write_logical_srcs. + */ + FS_OPCODE_FB_WRITE_LOGICAL, + + FS_OPCODE_REP_FB_WRITE, + + FS_OPCODE_FB_READ, + FS_OPCODE_FB_READ_LOGICAL, + + SHADER_OPCODE_RCP, + SHADER_OPCODE_RSQ, + SHADER_OPCODE_SQRT, + SHADER_OPCODE_EXP2, + SHADER_OPCODE_LOG2, + SHADER_OPCODE_POW, + SHADER_OPCODE_INT_QUOTIENT, + SHADER_OPCODE_INT_REMAINDER, + SHADER_OPCODE_SIN, + SHADER_OPCODE_COS, + + /** + * A generic "send" opcode. The first two sources are the message + * descriptor and extended message descriptor respectively. The third + * and optional fourth sources are the message payload + */ + SHADER_OPCODE_SEND, + + /** + * An "undefined" write which does nothing but indicates to liveness that + * we don't care about any values in the register which predate this + * instruction. Used to prevent partial writes from causing issues with + * live ranges. + */ + SHADER_OPCODE_UNDEF, + + /** + * Texture sampling opcodes. + * + * LOGICAL opcodes are eventually translated to the matching non-LOGICAL + * opcode but instead of taking a single payload blob they expect their + * arguments separately as individual sources. The position/ordering of the + * arguments are defined by the enum tex_logical_srcs. 
+ */ + SHADER_OPCODE_TEX, + SHADER_OPCODE_TEX_LOGICAL, + SHADER_OPCODE_TXD, + SHADER_OPCODE_TXD_LOGICAL, + SHADER_OPCODE_TXF, + SHADER_OPCODE_TXF_LOGICAL, + SHADER_OPCODE_TXF_LZ, + SHADER_OPCODE_TXL, + SHADER_OPCODE_TXL_LOGICAL, + SHADER_OPCODE_TXL_LZ, + SHADER_OPCODE_TXS, + SHADER_OPCODE_TXS_LOGICAL, + FS_OPCODE_TXB, + FS_OPCODE_TXB_LOGICAL, + SHADER_OPCODE_TXF_CMS, + SHADER_OPCODE_TXF_CMS_LOGICAL, + SHADER_OPCODE_TXF_CMS_W, + SHADER_OPCODE_TXF_CMS_W_LOGICAL, + SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL, + SHADER_OPCODE_TXF_UMS, + SHADER_OPCODE_TXF_UMS_LOGICAL, + SHADER_OPCODE_TXF_MCS, + SHADER_OPCODE_TXF_MCS_LOGICAL, + SHADER_OPCODE_LOD, + SHADER_OPCODE_LOD_LOGICAL, + SHADER_OPCODE_TG4, + SHADER_OPCODE_TG4_LOGICAL, + SHADER_OPCODE_TG4_OFFSET, + SHADER_OPCODE_TG4_OFFSET_LOGICAL, + SHADER_OPCODE_SAMPLEINFO, + SHADER_OPCODE_SAMPLEINFO_LOGICAL, + + SHADER_OPCODE_IMAGE_SIZE_LOGICAL, + + /** + * Combines multiple sources of size 1 into a larger virtual GRF. + * For example, parameters for a send-from-GRF message. Or, updating + * channels of a size 4 VGRF used to store vec4s such as texturing results. + * + * This will be lowered into MOVs from each source to consecutive offsets + * of the destination VGRF. + * + * src[0] may be BAD_FILE. If so, the lowering pass skips emitting the MOV, + * but still reserves the first channel of the destination VGRF. This can be + * used to reserve space for, say, a message header set up by the generators. + */ + SHADER_OPCODE_LOAD_PAYLOAD, + + /** + * Packs a number of sources into a single value. Unlike LOAD_PAYLOAD, this + * acts intra-channel, obtaining the final value for each channel by + * combining the sources values for the same channel, the first source + * occupying the lowest bits and the last source occupying the highest + * bits. + */ + FS_OPCODE_PACK, + + /** + * Typed and untyped surface access opcodes. 
+ * + * LOGICAL opcodes are eventually translated to the matching non-LOGICAL + * opcode but instead of taking a single payload blob they expect their + * arguments separately as individual sources: + * + * Source 0: [required] Surface coordinates. + * Source 1: [optional] Operation source. + * Source 2: [required] Surface index. + * Source 3: [required] Number of coordinate components (as UD immediate). + * Source 4: [required] Opcode-specific control immediate, same as source 2 + * of the matching non-LOGICAL opcode. + */ + VEC4_OPCODE_UNTYPED_ATOMIC, + SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, + VEC4_OPCODE_UNTYPED_SURFACE_READ, + SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, + VEC4_OPCODE_UNTYPED_SURFACE_WRITE, + SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, + + SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL, + + /** + * Untyped A64 surface access opcodes. + * + * Source 0: 64-bit address + * Source 1: Operational source + * Source 2: [required] Opcode-specific control immediate, same as source 2 + * of the matching non-LOGICAL opcode. + */ + SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, + SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, + SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, + SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, + SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL, + SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, + SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, + + SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, + SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, + SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, + + SHADER_OPCODE_RND_MODE, + SHADER_OPCODE_FLOAT_CONTROL_MODE, + + /** + * Byte scattered write/read opcodes. + * + * LOGICAL opcodes are eventually translated to the matching non-LOGICAL + * opcode, but instead of taking a single payload blog they expect their + * arguments separately as individual sources, like untyped write/read. 
+ */ + SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, + SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, + SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL, + SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL, + + /** + * Memory fence messages. + * + * Source 0: Must be register g0, used as header. + * Source 1: Immediate bool to indicate whether control is returned to the + * thread only after the fence has been honored. + * Source 2: Immediate byte indicating which memory to fence. Zero means + * global memory; GFX7_BTI_SLM means SLM (for Gfx11+ only). + * + * Vec4 backend only uses Source 0. + */ + SHADER_OPCODE_MEMORY_FENCE, + + /** + * Scheduling-only fence. + * + * Sources can be used to force a stall until the registers in those are + * available. This might generate MOVs or SYNC_NOPs (Gfx12+). + */ + FS_OPCODE_SCHEDULING_FENCE, + + SHADER_OPCODE_GFX4_SCRATCH_READ, + SHADER_OPCODE_GFX4_SCRATCH_WRITE, + SHADER_OPCODE_GFX7_SCRATCH_READ, + + SHADER_OPCODE_SCRATCH_HEADER, + + /** + * Gfx8+ SIMD8 URB messages. + */ + SHADER_OPCODE_URB_READ_LOGICAL, + SHADER_OPCODE_URB_WRITE_LOGICAL, + + /** + * Return the index of the first enabled live channel and assign it to + * to the first component of the destination. Frequently used as input + * for the BROADCAST pseudo-opcode. + */ + SHADER_OPCODE_FIND_LIVE_CHANNEL, + + /** + * Return the index of the last enabled live channel and assign it to + * the first component of the destination. + */ + SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, + + /** + * Return the current execution mask in the specified flag subregister. + * Can be CSE'ed more easily than a plain MOV from the ce0 ARF register. + */ + FS_OPCODE_LOAD_LIVE_CHANNELS, + + /** + * Pick the channel from its first source register given by the index + * specified as second source. Useful for variable indexing of surfaces. 
+ * + * Note that because the result of this instruction is by definition + * uniform and it can always be splatted to multiple channels using a + * scalar regioning mode, only the first channel of the destination region + * is guaranteed to be updated, which implies that BROADCAST instructions + * should usually be marked force_writemask_all. + */ + SHADER_OPCODE_BROADCAST, + + /* Pick the channel from its first source register given by the index + * specified as second source. + * + * This is similar to the BROADCAST instruction except that it takes a + * dynamic index and potentially puts a different value in each output + * channel. + */ + SHADER_OPCODE_SHUFFLE, + + /* Select between src0 and src1 based on channel enables. + * + * This instruction copies src0 into the enabled channels of the + * destination and copies src1 into the disabled channels. + */ + SHADER_OPCODE_SEL_EXEC, + + /* This turns into an align16 mov from src0 to dst with a swizzle + * provided as an immediate in src1. + */ + SHADER_OPCODE_QUAD_SWIZZLE, + + /* Take every Nth element in src0 and broadcast it to the group of N + * channels in which it lives in the destination. The offset within the + * cluster is given by src1 and the cluster size is given by src2. + */ + SHADER_OPCODE_CLUSTER_BROADCAST, + + SHADER_OPCODE_GET_BUFFER_SIZE, + + SHADER_OPCODE_INTERLOCK, + + /** Target for a HALT + * + * All HALT instructions in a shader must target the same jump point and + * that point is denoted by a HALT_TARGET instruction. 
+ */ + SHADER_OPCODE_HALT_TARGET, + + VEC4_OPCODE_MOV_BYTES, + VEC4_OPCODE_PACK_BYTES, + VEC4_OPCODE_UNPACK_UNIFORM, + VEC4_OPCODE_DOUBLE_TO_F32, + VEC4_OPCODE_DOUBLE_TO_D32, + VEC4_OPCODE_DOUBLE_TO_U32, + VEC4_OPCODE_TO_DOUBLE, + VEC4_OPCODE_PICK_LOW_32BIT, + VEC4_OPCODE_PICK_HIGH_32BIT, + VEC4_OPCODE_SET_LOW_32BIT, + VEC4_OPCODE_SET_HIGH_32BIT, + VEC4_OPCODE_MOV_FOR_SCRATCH, + VEC4_OPCODE_ZERO_OOB_PUSH_REGS, + + FS_OPCODE_DDX_COARSE, + FS_OPCODE_DDX_FINE, + /** + * Compute dFdy(), dFdyCoarse(), or dFdyFine(). + */ + FS_OPCODE_DDY_COARSE, + FS_OPCODE_DDY_FINE, + FS_OPCODE_LINTERP, + FS_OPCODE_PIXEL_X, + FS_OPCODE_PIXEL_Y, + FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, + FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4, + FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL, + FS_OPCODE_SET_SAMPLE_ID, + FS_OPCODE_PACK_HALF_2x16_SPLIT, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, + FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, + + VEC4_VS_OPCODE_URB_WRITE, + VS_OPCODE_PULL_CONSTANT_LOAD, + VS_OPCODE_PULL_CONSTANT_LOAD_GFX7, + + VS_OPCODE_UNPACK_FLAGS_SIMD4X2, + + /** + * Write geometry shader output data to the URB. + * + * Unlike VEC4_VS_OPCODE_URB_WRITE, this opcode doesn't do an implied move from + * R0 to the first MRF. This allows the geometry shader to override the + * "Slot {0,1} Offset" fields in the message header. + */ + VEC4_GS_OPCODE_URB_WRITE, + + /** + * Write geometry shader output data to the URB and request a new URB + * handle (gfx6). + * + * This opcode doesn't do an implied move from R0 to the first MRF. + */ + VEC4_GS_OPCODE_URB_WRITE_ALLOCATE, + + /** + * Terminate the geometry shader thread by doing an empty URB write. + * + * This opcode doesn't do an implied move from R0 to the first MRF. This + * allows the geometry shader to override the "GS Number of Output Vertices + * for Slot {0,1}" fields in the message header. + */ + GS_OPCODE_THREAD_END, + + /** + * Set the "Slot {0,1} Offset" fields of a URB_WRITE message header. 
+ * + * - dst is the MRF containing the message header. + * + * - src0.x indicates which portion of the URB should be written to (e.g. a + * vertex number) + * + * - src1 is an immediate multiplier which will be applied to src0 + * (e.g. the size of a single vertex in the URB). + * + * Note: the hardware will apply this offset *in addition to* the offset in + * vec4_instruction::offset. + */ + GS_OPCODE_SET_WRITE_OFFSET, + + /** + * Set the "GS Number of Output Vertices for Slot {0,1}" fields of a + * URB_WRITE message header. + * + * - dst is the MRF containing the message header. + * + * - src0.x is the vertex count. The upper 16 bits will be ignored. + */ + GS_OPCODE_SET_VERTEX_COUNT, + + /** + * Set DWORD 2 of dst to the value in src. + */ + GS_OPCODE_SET_DWORD_2, + + /** + * Prepare the dst register for storage in the "Channel Mask" fields of a + * URB_WRITE message header. + * + * DWORD 4 of dst is shifted left by 4 bits, so that later, + * GS_OPCODE_SET_CHANNEL_MASKS can OR DWORDs 0 and 4 together to form the + * final channel mask. + * + * Note: since GS_OPCODE_SET_CHANNEL_MASKS ORs DWORDs 0 and 4 together to + * form the final channel mask, DWORDs 0 and 4 of the dst register must not + * have any extraneous bits set prior to execution of this opcode (that is, + * they should be in the range 0x0 to 0xf). + */ + GS_OPCODE_PREPARE_CHANNEL_MASKS, + + /** + * Set the "Channel Mask" fields of a URB_WRITE message header. + * + * - dst is the MRF containing the message header. + * + * - src.x is the channel mask, as prepared by + * GS_OPCODE_PREPARE_CHANNEL_MASKS. DWORDs 0 and 4 are OR'ed together to + * form the final channel mask. + */ + GS_OPCODE_SET_CHANNEL_MASKS, + + /** + * Get the "Instance ID" fields from the payload. + * + * - dst is the GRF for gl_InvocationID. + */ + GS_OPCODE_GET_INSTANCE_ID, + + /** + * Send a FF_SYNC message to allocate initial URB handles (gfx6). + * + * - dst will be used as the writeback register for the FF_SYNC operation. 
+ * + * - src0 is the number of primitives written. + * + * - src1 is the value to hold in M0.0: number of SO vertices to write + * and number of SO primitives needed. Its value will be overwritten + * with the SVBI values if transform feedback is enabled. + * + * Note: This opcode uses an implicit MRF register for the ff_sync message + * header, so the caller is expected to set inst->base_mrf and initialize + * that MRF register to r0. This opcode will also write to this MRF register + * to include the allocated URB handle so it can then be reused directly as + * the header in the URB write operation we are allocating the handle for. + */ + GS_OPCODE_FF_SYNC, + + /** + * Move r0.1 (which holds PrimitiveID information in gfx6) to a separate + * register. + * + * - dst is the GRF where PrimitiveID information will be moved. + */ + GS_OPCODE_SET_PRIMITIVE_ID, + + /** + * Write transform feedback data to the SVB by sending a SVB WRITE message. + * Used in gfx6. + * + * - dst is the MRF register containing the message header. + * + * - src0 is the register where the vertex data is going to be copied from. + * + * - src1 is the destination register when write commit occurs. + */ + GS_OPCODE_SVB_WRITE, + + /** + * Set destination index in the SVB write message payload (M0.5). Used + * in gfx6 for transform feedback. + * + * - dst is the header to save the destination indices for SVB WRITE. + * - src is the register that holds the destination indices value. + */ + GS_OPCODE_SVB_SET_DST_INDEX, + + /** + * Prepare Mx.0 subregister for being used in the FF_SYNC message header. + * Used in gfx6 for transform feedback. + * + * - dst will hold the register with the final Mx.0 value. + * + * - src0 has the number of vertices emitted in SO (NumSOVertsToWrite) + * + * - src1 has the number of needed primitives for SO (NumSOPrimsNeeded) + * + * - src2 is the value to hold in M0: number of SO vertices to write + * and number of SO primitives needed. 
+ */ + GS_OPCODE_FF_SYNC_SET_PRIMITIVES, + + /** + * Terminate the compute shader. + */ + CS_OPCODE_CS_TERMINATE, + + /** + * GLSL barrier() + */ + SHADER_OPCODE_BARRIER, + + /** + * Calculate the high 32-bits of a 32x32 multiply. + */ + SHADER_OPCODE_MULH, + + /** Signed subtraction with saturation. */ + SHADER_OPCODE_ISUB_SAT, + + /** Unsigned subtraction with saturation. */ + SHADER_OPCODE_USUB_SAT, + + /** + * A MOV that uses VxH indirect addressing. + * + * Source 0: A register to start from (HW_REG). + * Source 1: An indirect offset (in bytes, UD GRF). + * Source 2: The length of the region that could be accessed (in bytes, + * UD immediate). + */ + SHADER_OPCODE_MOV_INDIRECT, + + /** Fills out a relocatable immediate */ + SHADER_OPCODE_MOV_RELOC_IMM, + + VEC4_OPCODE_URB_READ, + TCS_OPCODE_GET_INSTANCE_ID, + VEC4_TCS_OPCODE_URB_WRITE, + VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS, + VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, + TCS_OPCODE_GET_PRIMITIVE_ID, + TCS_OPCODE_CREATE_BARRIER_HEADER, + TCS_OPCODE_SRC0_010_IS_ZERO, + TCS_OPCODE_RELEASE_INPUT, + TCS_OPCODE_THREAD_END, + + TES_OPCODE_GET_PRIMITIVE_ID, + TES_OPCODE_CREATE_INPUT_READ_HEADER, + TES_OPCODE_ADD_INDIRECT_URB_OFFSET, + + SHADER_OPCODE_BTD_SPAWN_LOGICAL, + SHADER_OPCODE_BTD_RETIRE_LOGICAL, + + SHADER_OPCODE_READ_SR_REG, + + RT_OPCODE_TRACE_RAY_LOGICAL, +}; + +enum brw_urb_write_flags { + BRW_URB_WRITE_NO_FLAGS = 0, + + /** + * Causes a new URB entry to be allocated, and its address stored in the + * destination register (gen < 7). + */ + BRW_URB_WRITE_ALLOCATE = 0x1, + + /** + * Causes the current URB entry to be deallocated (gen < 7). + */ + BRW_URB_WRITE_UNUSED = 0x2, + + /** + * Causes the thread to terminate. + */ + BRW_URB_WRITE_EOT = 0x4, + + /** + * Indicates that the given URB entry is complete, and may be sent further + * down the 3D pipeline (gen < 7). 
+ */ + BRW_URB_WRITE_COMPLETE = 0x8, + + /** + * Indicates that an additional offset (which may be different for the two + * vec4 slots) is stored in the message header (gen == 7). + */ + BRW_URB_WRITE_PER_SLOT_OFFSET = 0x10, + + /** + * Indicates that the channel masks in the URB_WRITE message header should + * not be overridden to 0xff (gen == 7). + */ + BRW_URB_WRITE_USE_CHANNEL_MASKS = 0x20, + + /** + * Indicates that the data should be sent to the URB using the + * URB_WRITE_OWORD message rather than URB_WRITE_HWORD (gen == 7). This + * causes offsets to be interpreted as multiples of an OWORD instead of an + * HWORD, and only allows one OWORD to be written. + */ + BRW_URB_WRITE_OWORD = 0x40, + + /** + * Convenient combination of flags: end the thread while simultaneously + * marking the given URB entry as complete. + */ + BRW_URB_WRITE_EOT_COMPLETE = BRW_URB_WRITE_EOT | BRW_URB_WRITE_COMPLETE, + + /** + * Convenient combination of flags: mark the given URB entry as complete + * and simultaneously allocate a new one. 
+ */ + BRW_URB_WRITE_ALLOCATE_COMPLETE = + BRW_URB_WRITE_ALLOCATE | BRW_URB_WRITE_COMPLETE, +}; + +enum fb_write_logical_srcs { + FB_WRITE_LOGICAL_SRC_COLOR0, /* REQUIRED */ + FB_WRITE_LOGICAL_SRC_COLOR1, /* for dual source blend messages */ + FB_WRITE_LOGICAL_SRC_SRC0_ALPHA, + FB_WRITE_LOGICAL_SRC_SRC_DEPTH, /* gl_FragDepth */ + FB_WRITE_LOGICAL_SRC_DST_DEPTH, /* GFX4-5: passthrough from thread */ + FB_WRITE_LOGICAL_SRC_SRC_STENCIL, /* gl_FragStencilRefARB */ + FB_WRITE_LOGICAL_SRC_OMASK, /* Sample Mask (gl_SampleMask) */ + FB_WRITE_LOGICAL_SRC_COMPONENTS, /* REQUIRED */ + FB_WRITE_LOGICAL_NUM_SRCS +}; + +enum tex_logical_srcs { + /** Texture coordinates */ + TEX_LOGICAL_SRC_COORDINATE, + /** Shadow comparator */ + TEX_LOGICAL_SRC_SHADOW_C, + /** dPdx if the operation takes explicit derivatives, otherwise LOD value */ + TEX_LOGICAL_SRC_LOD, + /** dPdy if the operation takes explicit derivatives */ + TEX_LOGICAL_SRC_LOD2, + /** Min LOD */ + TEX_LOGICAL_SRC_MIN_LOD, + /** Sample index */ + TEX_LOGICAL_SRC_SAMPLE_INDEX, + /** MCS data */ + TEX_LOGICAL_SRC_MCS, + /** REQUIRED: Texture surface index */ + TEX_LOGICAL_SRC_SURFACE, + /** Texture sampler index */ + TEX_LOGICAL_SRC_SAMPLER, + /** Texture surface bindless handle */ + TEX_LOGICAL_SRC_SURFACE_HANDLE, + /** Texture sampler bindless handle */ + TEX_LOGICAL_SRC_SAMPLER_HANDLE, + /** Texel offset for gathers */ + TEX_LOGICAL_SRC_TG4_OFFSET, + /** REQUIRED: Number of coordinate components (as UD immediate) */ + TEX_LOGICAL_SRC_COORD_COMPONENTS, + /** REQUIRED: Number of derivative components (as UD immediate) */ + TEX_LOGICAL_SRC_GRAD_COMPONENTS, + /** REQUIRED: request residency (as UD immediate) */ + TEX_LOGICAL_SRC_RESIDENCY, + + TEX_LOGICAL_NUM_SRCS, +}; + +enum pull_uniform_constant_srcs { + /** Surface binding table index */ + PULL_UNIFORM_CONSTANT_SRC_SURFACE, + /** Surface bindless handle */ + PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE, + /** Surface offset */ + PULL_UNIFORM_CONSTANT_SRC_OFFSET, + /** Pull 
size */ + PULL_UNIFORM_CONSTANT_SRC_SIZE, + + PULL_UNIFORM_CONSTANT_SRCS, +}; + +enum pull_varying_constant_srcs { + /** Surface binding table index */ + PULL_VARYING_CONSTANT_SRC_SURFACE, + /** Surface bindless handle */ + PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE, + /** Surface offset */ + PULL_VARYING_CONSTANT_SRC_OFFSET, + /** Pull alignment */ + PULL_VARYING_CONSTANT_SRC_ALIGNMENT, + + PULL_VARYING_CONSTANT_SRCS, +}; + +enum get_buffer_size_srcs { + /** Surface binding table index */ + GET_BUFFER_SIZE_SRC_SURFACE, + /** Surface bindless handle */ + GET_BUFFER_SIZE_SRC_SURFACE_HANDLE, + /** LOD */ + GET_BUFFER_SIZE_SRC_LOD, + + GET_BUFFER_SIZE_SRCS +}; + +enum surface_logical_srcs { + /** Surface binding table index */ + SURFACE_LOGICAL_SRC_SURFACE, + /** Surface bindless handle */ + SURFACE_LOGICAL_SRC_SURFACE_HANDLE, + /** Surface address; could be multi-dimensional for typed opcodes */ + SURFACE_LOGICAL_SRC_ADDRESS, + /** Data to be written or used in an atomic op */ + SURFACE_LOGICAL_SRC_DATA, + /** Surface number of dimensions. Affects the size of ADDRESS */ + SURFACE_LOGICAL_SRC_IMM_DIMS, + /** Per-opcode immediate argument. For atomics, this is the atomic opcode */ + SURFACE_LOGICAL_SRC_IMM_ARG, + /** + * Some instructions with side-effects should not be predicated on + * sample mask, e.g. lowered stores to scratch. + */ + SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK, + + SURFACE_LOGICAL_NUM_SRCS +}; + +enum a64_logical_srcs { + /** Address the A64 message operates on */ + A64_LOGICAL_ADDRESS, + /** Source for the operation (unused of LOAD ops) */ + A64_LOGICAL_SRC, + /** Per-opcode immediate argument. Number of dwords, bit size, or atomic op. */ + A64_LOGICAL_ARG, + /** + * Some instructions do want to run on helper lanes (like ray queries). 
+ */ + A64_LOGICAL_ENABLE_HELPERS, + + A64_LOGICAL_NUM_SRCS +}; + +enum rt_logical_srcs { + /** Address of the globals */ + RT_LOGICAL_SRC_GLOBALS, + /** Level at which the tracing should start */ + RT_LOGICAL_SRC_BVH_LEVEL, + /** Type of tracing operation */ + RT_LOGICAL_SRC_TRACE_RAY_CONTROL, + /** Synchronous tracing (ray query) */ + RT_LOGICAL_SRC_SYNCHRONOUS, + + RT_LOGICAL_NUM_SRCS +}; + +enum urb_logical_srcs { + URB_LOGICAL_SRC_HANDLE, + URB_LOGICAL_SRC_PER_SLOT_OFFSETS, + URB_LOGICAL_SRC_CHANNEL_MASK, + /** Data to be written. BAD_FILE for reads. */ + URB_LOGICAL_SRC_DATA, + URB_LOGICAL_SRC_COMPONENTS, + URB_LOGICAL_NUM_SRCS +}; + +enum interpolator_logical_srcs { + /** Interpolation offset */ + INTERP_SRC_OFFSET, + /** Message data */ + INTERP_SRC_MSG_DESC, + /** Flag register for dynamic mode */ + INTERP_SRC_DYNAMIC_MODE, + + INTERP_NUM_SRCS +}; + + +#ifdef __cplusplus +/** + * Allow brw_urb_write_flags enums to be ORed together. + */ +inline brw_urb_write_flags +operator|(brw_urb_write_flags x, brw_urb_write_flags y) +{ + return static_cast(static_cast(x) | + static_cast(y)); +} +#endif + +enum ENUM_PACKED brw_predicate { + BRW_PREDICATE_NONE = 0, + BRW_PREDICATE_NORMAL = 1, + BRW_PREDICATE_ALIGN1_ANYV = 2, + BRW_PREDICATE_ALIGN1_ALLV = 3, + BRW_PREDICATE_ALIGN1_ANY2H = 4, + BRW_PREDICATE_ALIGN1_ALL2H = 5, + BRW_PREDICATE_ALIGN1_ANY4H = 6, + BRW_PREDICATE_ALIGN1_ALL4H = 7, + BRW_PREDICATE_ALIGN1_ANY8H = 8, + BRW_PREDICATE_ALIGN1_ALL8H = 9, + BRW_PREDICATE_ALIGN1_ANY16H = 10, + BRW_PREDICATE_ALIGN1_ALL16H = 11, + BRW_PREDICATE_ALIGN1_ANY32H = 12, + BRW_PREDICATE_ALIGN1_ALL32H = 13, + BRW_PREDICATE_ALIGN16_REPLICATE_X = 2, + BRW_PREDICATE_ALIGN16_REPLICATE_Y = 3, + BRW_PREDICATE_ALIGN16_REPLICATE_Z = 4, + BRW_PREDICATE_ALIGN16_REPLICATE_W = 5, + BRW_PREDICATE_ALIGN16_ANY4H = 6, + BRW_PREDICATE_ALIGN16_ALL4H = 7, + XE2_PREDICATE_ANY = 2, + XE2_PREDICATE_ALL = 3 +}; + +enum ENUM_PACKED brw_reg_file { + BRW_ARCHITECTURE_REGISTER_FILE = 0, + 
BRW_GENERAL_REGISTER_FILE = 1, + BRW_MESSAGE_REGISTER_FILE = 2, + BRW_IMMEDIATE_VALUE = 3, + + ARF = BRW_ARCHITECTURE_REGISTER_FILE, + FIXED_GRF = BRW_GENERAL_REGISTER_FILE, + MRF = BRW_MESSAGE_REGISTER_FILE, + IMM = BRW_IMMEDIATE_VALUE, + + /* These are not hardware values */ + VGRF, + ATTR, + UNIFORM, /* prog_data->params[reg] */ + BAD_FILE, +}; + +enum ENUM_PACKED gfx10_align1_3src_reg_file { + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE = 0, + BRW_ALIGN1_3SRC_IMMEDIATE_VALUE = 1, /* src0, src2 */ + BRW_ALIGN1_3SRC_ACCUMULATOR = 1, /* dest, src1 */ +}; + +/* CNL adds Align1 support for 3-src instructions. Bit 35 of the instruction + * word is "Execution Datatype" which controls whether the instruction operates + * on float or integer types. The register arguments have fields that offer + * more fine control their respective types. + */ +enum ENUM_PACKED gfx10_align1_3src_exec_type { + BRW_ALIGN1_3SRC_EXEC_TYPE_INT = 0, + BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT = 1, +}; + +#define BRW_ARF_NULL 0x00 +#define BRW_ARF_ADDRESS 0x10 +#define BRW_ARF_ACCUMULATOR 0x20 +#define BRW_ARF_FLAG 0x30 +#define BRW_ARF_MASK 0x40 +#define BRW_ARF_MASK_STACK 0x50 +#define BRW_ARF_MASK_STACK_DEPTH 0x60 +#define BRW_ARF_STATE 0x70 +#define BRW_ARF_CONTROL 0x80 +#define BRW_ARF_NOTIFICATION_COUNT 0x90 +#define BRW_ARF_IP 0xA0 +#define BRW_ARF_TDR 0xB0 +#define BRW_ARF_TIMESTAMP 0xC0 + +#define BRW_MRF_COMPR4 (1 << 7) + +#define BRW_AMASK 0 +#define BRW_IMASK 1 +#define BRW_LMASK 2 +#define BRW_CMASK 3 + + + +#define BRW_THREAD_NORMAL 0 +#define BRW_THREAD_ATOMIC 1 +#define BRW_THREAD_SWITCH 2 + +enum ENUM_PACKED brw_vertical_stride { + BRW_VERTICAL_STRIDE_0 = 0, + BRW_VERTICAL_STRIDE_1 = 1, + BRW_VERTICAL_STRIDE_2 = 2, + BRW_VERTICAL_STRIDE_4 = 3, + BRW_VERTICAL_STRIDE_8 = 4, + BRW_VERTICAL_STRIDE_16 = 5, + BRW_VERTICAL_STRIDE_32 = 6, + BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL = 0xF, +}; + +enum ENUM_PACKED gfx10_align1_3src_vertical_stride { + BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0 = 0, + 
BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1 = 1, + BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2 = 1, + BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4 = 2, + BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8 = 3, +}; + +enum ENUM_PACKED brw_width { + BRW_WIDTH_1 = 0, + BRW_WIDTH_2 = 1, + BRW_WIDTH_4 = 2, + BRW_WIDTH_8 = 3, + BRW_WIDTH_16 = 4, +}; + +/** + * Gfx12+ SWSB SBID synchronization mode. + * + * This is represented as a bitmask including any required SBID token + * synchronization modes, used to synchronize out-of-order instructions. Only + * the strongest mode of the mask will be provided to the hardware in the SWSB + * field of an actual hardware instruction, but virtual instructions may be + * able to take into account multiple of them. + */ +enum tgl_sbid_mode { + TGL_SBID_NULL = 0, + TGL_SBID_SRC = 1, + TGL_SBID_DST = 2, + TGL_SBID_SET = 4 +}; + + +enum gfx12_sub_byte_precision { + BRW_SUB_BYTE_PRECISION_NONE = 0, + + /** 4 bits. Signedness determined by base type */ + BRW_SUB_BYTE_PRECISION_4BIT = 1, + + /** 2 bits. Signedness determined by base type */ + BRW_SUB_BYTE_PRECISION_2BIT = 2, +}; + +enum gfx12_systolic_depth { + BRW_SYSTOLIC_DEPTH_16 = 0, + BRW_SYSTOLIC_DEPTH_2 = 1, + BRW_SYSTOLIC_DEPTH_4 = 2, + BRW_SYSTOLIC_DEPTH_8 = 3, +}; + +#ifdef __cplusplus +/** + * Allow bitwise arithmetic of tgl_sbid_mode enums. + */ +inline tgl_sbid_mode +operator|(tgl_sbid_mode x, tgl_sbid_mode y) +{ + return tgl_sbid_mode(unsigned(x) | unsigned(y)); +} + +inline tgl_sbid_mode +operator&(tgl_sbid_mode x, tgl_sbid_mode y) +{ + return tgl_sbid_mode(unsigned(x) & unsigned(y)); +} + +inline tgl_sbid_mode & +operator|=(tgl_sbid_mode &x, tgl_sbid_mode y) +{ + return x = x | y; +} + +#endif + +/** + * TGL+ SWSB RegDist synchronization pipeline. + * + * On TGL all instructions that use the RegDist synchronization mechanism are + * considered to be executed as a single in-order pipeline, therefore only the + * TGL_PIPE_FLOAT pipeline is applicable. 
On XeHP+ platforms there are two + * additional asynchronous ALU pipelines (which still execute instructions + * in-order and use the RegDist synchronization mechanism). TGL_PIPE_NONE + * doesn't provide any RegDist pipeline synchronization information and allows + * the hardware to infer the pipeline based on the source types of the + * instruction. TGL_PIPE_ALL can be used when synchronization with all ALU + * pipelines is intended. + */ +enum tgl_pipe { + TGL_PIPE_NONE = 0, + TGL_PIPE_FLOAT, + TGL_PIPE_INT, + TGL_PIPE_LONG, + TGL_PIPE_MATH, + TGL_PIPE_ALL +}; + +/** + * Logical representation of the SWSB scheduling information of a hardware + * instruction. The binary representation is slightly more compact. + */ +struct tgl_swsb { + unsigned regdist : 3; + enum tgl_pipe pipe : 3; + unsigned sbid : 5; + enum tgl_sbid_mode mode : 3; +}; + +/** + * Construct a scheduling annotation with a single RegDist dependency. This + * synchronizes with the completion of the d-th previous in-order instruction. + * The index is one-based, zero causes a no-op tgl_swsb to be constructed. + */ +static inline struct tgl_swsb +tgl_swsb_regdist(unsigned d) +{ + const struct tgl_swsb swsb = { d, d ? TGL_PIPE_ALL : TGL_PIPE_NONE }; + assert(swsb.regdist == d); + return swsb; +} + +/** + * Construct a scheduling annotation that synchronizes with the specified SBID + * token. + */ +static inline struct tgl_swsb +tgl_swsb_sbid(enum tgl_sbid_mode mode, unsigned sbid) +{ + const struct tgl_swsb swsb = { 0, TGL_PIPE_NONE, sbid, mode }; + assert(swsb.sbid == sbid); + return swsb; +} + +/** + * Construct a no-op scheduling annotation. + */ +static inline struct tgl_swsb +tgl_swsb_null(void) +{ + return tgl_swsb_regdist(0); +} + +/** + * Return a scheduling annotation that allocates the same SBID synchronization + * token as \p swsb. In addition it will synchronize against a previous + * in-order instruction if \p regdist is non-zero. 
+ */ +static inline struct tgl_swsb +tgl_swsb_dst_dep(struct tgl_swsb swsb, unsigned regdist) +{ + swsb.regdist = regdist; + swsb.mode = swsb.mode & TGL_SBID_SET; + swsb.pipe = (regdist ? TGL_PIPE_ALL : TGL_PIPE_NONE); + return swsb; +} + +/** + * Return a scheduling annotation that synchronizes against the same SBID and + * RegDist dependencies as \p swsb, but doesn't allocate any SBID token. + */ +static inline struct tgl_swsb +tgl_swsb_src_dep(struct tgl_swsb swsb) +{ + swsb.mode = swsb.mode & (TGL_SBID_SRC | TGL_SBID_DST); + return swsb; +} + +/** + * Convert the provided tgl_swsb to the hardware's binary representation of an + * SWSB annotation. + */ +static inline uint32_t +tgl_swsb_encode(const struct intel_device_info *devinfo, struct tgl_swsb swsb) +{ + if (!swsb.mode) { + const unsigned pipe = devinfo->verx10 < 125 ? 0 : + swsb.pipe == TGL_PIPE_FLOAT ? 0x10 : + swsb.pipe == TGL_PIPE_INT ? 0x18 : + swsb.pipe == TGL_PIPE_LONG ? 0x20 : + swsb.pipe == TGL_PIPE_MATH ? 0x28 : + swsb.pipe == TGL_PIPE_ALL ? 0x8 : 0; + return pipe | swsb.regdist; + + } else if (swsb.regdist) { + if (devinfo->ver >= 20) { + if ((swsb.mode & TGL_SBID_SET)) { + assert(swsb.pipe == TGL_PIPE_ALL || + swsb.pipe == TGL_PIPE_INT || swsb.pipe == TGL_PIPE_FLOAT); + return (swsb.pipe == TGL_PIPE_INT ? 0x300 : + swsb.pipe == TGL_PIPE_FLOAT ? 0x200 : 0x100) | + swsb.regdist << 5 | swsb.sbid; + } else { + assert(!(swsb.mode & ~(TGL_SBID_DST | TGL_SBID_SRC))); + return (swsb.pipe == TGL_PIPE_ALL ? 0x300 : + swsb.mode == TGL_SBID_SRC ? 0x200 : 0x100) | + swsb.regdist << 5 | swsb.sbid; + } + } else { + assert(!(swsb.sbid & ~0xfu)); + return 0x80 | swsb.regdist << 4 | swsb.sbid; + } + + } else { + if (devinfo->ver >= 20) { + return swsb.sbid | (swsb.mode & TGL_SBID_SET ? 0xc0 : + swsb.mode & TGL_SBID_DST ? 0x80 : 0xa0); + } else { + assert(!(swsb.sbid & ~0xfu)); + return swsb.sbid | (swsb.mode & TGL_SBID_SET ? 0x40 : + swsb.mode & TGL_SBID_DST ? 
0x20 : 0x30);
      }
   }
}

/**
 * Convert the provided binary representation of an SWSB annotation to a
 * tgl_swsb.  Inverse of tgl_swsb_encode(); \p is_unordered disambiguates
 * encodings that are shared between token allocation and destination waits.
 */
static inline struct tgl_swsb
tgl_swsb_decode(const struct intel_device_info *devinfo,
                const bool is_unordered, const uint32_t x)
{
   if (devinfo->ver >= 20) {
      /* Xe2+ layout. */
      if (x & 0x300) {
         /* Combined RegDist + SBID encoding. */
         if (is_unordered) {
            const struct tgl_swsb swsb = {
               (x & 0xe0u) >> 5,
               ((x & 0x300) == 0x300 ? TGL_PIPE_INT :
                (x & 0x300) == 0x200 ? TGL_PIPE_FLOAT :
                TGL_PIPE_ALL),
               x & 0x1fu,
               TGL_SBID_SET
            };
            return swsb;
         } else {
            const struct tgl_swsb swsb = {
               (x & 0xe0u) >> 5,
               ((x & 0x300) == 0x300 ? TGL_PIPE_ALL : TGL_PIPE_NONE),
               x & 0x1fu,
               ((x & 0x300) == 0x200 ? TGL_SBID_SRC : TGL_SBID_DST)
            };
            return swsb;
         }

      } else if ((x & 0xe0) == 0x80) {
         return tgl_swsb_sbid(TGL_SBID_DST, x & 0x1f);
      } else if ((x & 0xe0) == 0xa0) {
         return tgl_swsb_sbid(TGL_SBID_SRC, x & 0x1fu);
      } else if ((x & 0xe0) == 0xc0) {
         return tgl_swsb_sbid(TGL_SBID_SET, x & 0x1fu);
      } else {
         /* RegDist-only encoding with a pipe selector. */
         const struct tgl_swsb swsb = { x & 0x7u,
                                        ((x & 0x38) == 0x10 ? TGL_PIPE_FLOAT :
                                         (x & 0x38) == 0x18 ? TGL_PIPE_INT :
                                         (x & 0x38) == 0x20 ? TGL_PIPE_LONG :
                                         (x & 0x38) == 0x28 ? TGL_PIPE_MATH :
                                         (x & 0x38) == 0x8 ? TGL_PIPE_ALL :
                                         TGL_PIPE_NONE) };
         return swsb;
      }

   } else {
      /* Gfx12.0/12.5 layout. */
      if (x & 0x80) {
         /* Combined RegDist + SBID encoding. */
         const struct tgl_swsb swsb = { (x & 0x70u) >> 4, TGL_PIPE_NONE,
                                        x & 0xfu,
                                        is_unordered ?
                                        TGL_SBID_SET : TGL_SBID_DST };
         return swsb;
      } else if ((x & 0x70) == 0x20) {
         return tgl_swsb_sbid(TGL_SBID_DST, x & 0xfu);
      } else if ((x & 0x70) == 0x30) {
         return tgl_swsb_sbid(TGL_SBID_SRC, x & 0xfu);
      } else if ((x & 0x70) == 0x40) {
         return tgl_swsb_sbid(TGL_SBID_SET, x & 0xfu);
      } else {
         /* RegDist-only encoding; the pipe selector bits only exist on
          * Gfx12.5 (see the assert below).
          */
         const struct tgl_swsb swsb = { x & 0x7u,
                                        ((x & 0x78) == 0x10 ? TGL_PIPE_FLOAT :
                                         (x & 0x78) == 0x18 ? TGL_PIPE_INT :
                                         (x & 0x78) == 0x50 ? TGL_PIPE_LONG :
                                         (x & 0x78) == 0x8 ?
TGL_PIPE_ALL :
                                         TGL_PIPE_NONE) };
         assert(devinfo->verx10 >= 125 || swsb.pipe == TGL_PIPE_NONE);
         return swsb;
      }
   }
}

/* Function codes of the SYNC instruction. */
enum tgl_sync_function {
   TGL_SYNC_NOP = 0x0,
   TGL_SYNC_ALLRD = 0x2,
   TGL_SYNC_ALLWR = 0x3,
   TGL_SYNC_FENCE = 0xd,
   TGL_SYNC_BAR = 0xe,
   TGL_SYNC_HOST = 0xf
};

/**
 * Message target: Shared Function ID for where to SEND a message.
 *
 * These are enumerated in the ISA reference under "send - Send Message".
 * In particular, see the following tables:
 * - G45 PRM, Volume 4, Table 14-15 "Message Descriptor Definition"
 * - Sandybridge PRM, Volume 4 Part 2, Table 8-16 "Extended Message Descriptor"
 * - Ivybridge PRM, Volume 1 Part 1, section 3.2.7 "GPE Function IDs"
 *
 * Note that several numeric values are reused across generations with
 * different meanings (e.g. 4 is DATAPORT_READ on Gfx4 but the sampler-cache
 * dataport on Gfx6), so the per-generation prefixes below matter.
 */
enum brw_message_target {
   BRW_SFID_NULL = 0,
   BRW_SFID_MATH = 1, /* Only valid on Gfx4-5 */
   BRW_SFID_SAMPLER = 2,
   BRW_SFID_MESSAGE_GATEWAY = 3,
   BRW_SFID_DATAPORT_READ = 4,
   BRW_SFID_DATAPORT_WRITE = 5,
   BRW_SFID_URB = 6,
   BRW_SFID_THREAD_SPAWNER = 7,
   BRW_SFID_VME = 8,

   GFX6_SFID_DATAPORT_SAMPLER_CACHE = 4,
   GFX6_SFID_DATAPORT_RENDER_CACHE = 5,
   GFX6_SFID_DATAPORT_CONSTANT_CACHE = 9,

   GFX7_SFID_DATAPORT_DATA_CACHE = 10,
   GFX7_SFID_PIXEL_INTERPOLATOR = 11,
   HSW_SFID_DATAPORT_DATA_CACHE_1 = 12,
   HSW_SFID_CRE = 13,

   GFX12_SFID_TGM = 13, /* Typed Global Memory */
   GFX12_SFID_SLM = 14, /* Shared Local Memory */
   GFX12_SFID_UGM = 15, /* Untyped Global Memory */

   GEN_RT_SFID_BINDLESS_THREAD_DISPATCH = 7,
   GEN_RT_SFID_RAY_TRACE_ACCELERATOR = 8,
};

#define GFX7_MESSAGE_TARGET_DP_DATA_CACHE 10

/* Sampler message return formats. */
#define BRW_SAMPLER_RETURN_FORMAT_FLOAT32 0
#define BRW_SAMPLER_RETURN_FORMAT_UINT32 2
#define BRW_SAMPLER_RETURN_FORMAT_SINT32 3

#define GFX8_SAMPLER_RETURN_FORMAT_32BITS 0
#define GFX8_SAMPLER_RETURN_FORMAT_16BITS 1

/* Gfx4 sampler message types (several share the same code and are
 * disambiguated by the SIMD mode).
 */
#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE 0
#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE 0
#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS 0
#define BRW_SAMPLER_MESSAGE_SIMD8_KILLPIX 1
#define
BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD 1 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD 1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS 2 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS 2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE 0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE 2 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE 0 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE 1 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE 1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO 2 +#define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO 2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_LD 3 +#define BRW_SAMPLER_MESSAGE_SIMD8_LD 3 +#define BRW_SAMPLER_MESSAGE_SIMD16_LD 3 + +#define GFX5_SAMPLER_MESSAGE_SAMPLE 0 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS 1 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_LOD 2 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE 3 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS 4 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE 5 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE 6 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_LD 7 +#define GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4 8 +#define GFX5_SAMPLER_MESSAGE_LOD 9 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO 10 +#define GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO 11 +#define GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C 16 +#define GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO 17 +#define GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C 18 +#define XE2_SAMPLER_MESSAGE_SAMPLE_MLOD 18 +#define XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD 19 +#define HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE 20 +#define GFX9_SAMPLER_MESSAGE_SAMPLE_LZ 24 +#define GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ 25 +#define GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ 26 +#define GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W 28 +#define GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS 29 +#define GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS 30 +#define GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS 31 + +/* for GFX5 only */ +#define BRW_SAMPLER_SIMD_MODE_SIMD4X2 0 +#define BRW_SAMPLER_SIMD_MODE_SIMD8 1 +#define 
BRW_SAMPLER_SIMD_MODE_SIMD16 2 +#define BRW_SAMPLER_SIMD_MODE_SIMD32_64 3 + +#define GFX10_SAMPLER_SIMD_MODE_SIMD8H 5 +#define GFX10_SAMPLER_SIMD_MODE_SIMD16H 6 + +#define XE2_SAMPLER_SIMD_MODE_SIMD16 1 +#define XE2_SAMPLER_SIMD_MODE_SIMD32 2 +#define XE2_SAMPLER_SIMD_MODE_SIMD16H 5 +#define XE2_SAMPLER_SIMD_MODE_SIMD32H 6 + +/* GFX9 changes SIMD mode 0 to mean SIMD8D, but lets us get the SIMD4x2 + * behavior by setting bit 22 of dword 2 in the message header. */ +#define GFX9_SAMPLER_SIMD_MODE_SIMD8D 0 +#define GFX9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2 (1 << 22) + +#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW 0 +#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH 1 +#define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS 2 +#define BRW_DATAPORT_OWORD_BLOCK_4_OWORDS 3 +#define BRW_DATAPORT_OWORD_BLOCK_8_OWORDS 4 +#define GFX12_DATAPORT_OWORD_BLOCK_16_OWORDS 5 +#define BRW_DATAPORT_OWORD_BLOCK_OWORDS(n) \ + ((n) == 1 ? BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW : \ + (n) == 2 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS : \ + (n) == 4 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS : \ + (n) == 8 ? BRW_DATAPORT_OWORD_BLOCK_8_OWORDS : \ + (n) == 16 ? GFX12_DATAPORT_OWORD_BLOCK_16_OWORDS : \ + (abort(), ~0)) +#define BRW_DATAPORT_OWORD_BLOCK_DWORDS(n) \ + ((n) == 4 ? BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW : \ + (n) == 8 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS : \ + (n) == 16 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS : \ + (n) == 32 ? BRW_DATAPORT_OWORD_BLOCK_8_OWORDS : \ + (abort(), ~0)) + +#define BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD 0 +#define BRW_DATAPORT_OWORD_DUAL_BLOCK_4OWORDS 2 + +#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS 2 +#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS 3 + +/* This one stays the same across generations. 
*/ +#define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ 0 +/* GFX4 */ +#define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 1 +#define BRW_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 2 +#define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 3 +/* G45, GFX5 */ +#define G45_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ 1 +#define G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 2 +#define G45_DATAPORT_READ_MESSAGE_AVC_LOOP_FILTER_READ 3 +#define G45_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 4 +#define G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 6 +/* GFX6 */ +#define GFX6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ 1 +#define GFX6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 2 +#define GFX6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 4 +#define GFX6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ 5 +#define GFX6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 6 + +#define BRW_DATAPORT_READ_TARGET_DATA_CACHE 0 +#define BRW_DATAPORT_READ_TARGET_RENDER_CACHE 1 +#define BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE 2 + +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE 0 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED 1 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01 2 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23 3 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01 4 + +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE 0 +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 1 +#define BRW_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE 2 +#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE 3 +#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE 4 +#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE 5 +#define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE 7 + +/* GFX6 */ +#define GFX6_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE 7 +#define GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE 8 +#define GFX6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 9 +#define 
GFX6_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE 10 +#define GFX6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE 11 +#define GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE 12 +#define GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE 13 +#define GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE 14 + +/* GFX7 */ +#define GFX7_DATAPORT_RC_MEDIA_BLOCK_READ 4 +#define GFX7_DATAPORT_RC_TYPED_SURFACE_READ 5 +#define GFX7_DATAPORT_RC_TYPED_ATOMIC_OP 6 +#define GFX7_DATAPORT_RC_MEMORY_FENCE 7 +#define GFX7_DATAPORT_RC_MEDIA_BLOCK_WRITE 10 +#define GFX7_DATAPORT_RC_RENDER_TARGET_WRITE 12 +#define GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE 13 +#define GFX7_DATAPORT_DC_OWORD_BLOCK_READ 0 +#define GFX7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ 1 +#define GFX7_DATAPORT_DC_OWORD_DUAL_BLOCK_READ 2 +#define GFX7_DATAPORT_DC_DWORD_SCATTERED_READ 3 +#define GFX7_DATAPORT_DC_BYTE_SCATTERED_READ 4 +#define GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ 5 +#define GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP 6 +#define GFX7_DATAPORT_DC_MEMORY_FENCE 7 +#define GFX7_DATAPORT_DC_OWORD_BLOCK_WRITE 8 +#define GFX7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE 10 +#define GFX7_DATAPORT_DC_DWORD_SCATTERED_WRITE 11 +#define GFX7_DATAPORT_DC_BYTE_SCATTERED_WRITE 12 +#define GFX7_DATAPORT_DC_UNTYPED_SURFACE_WRITE 13 + +#define GFX7_DATAPORT_SCRATCH_READ ((1 << 18) | \ + (0 << 17)) +#define GFX7_DATAPORT_SCRATCH_WRITE ((1 << 18) | \ + (1 << 17)) +#define GFX7_DATAPORT_SCRATCH_NUM_REGS_SHIFT 12 + +#define GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET 0 +#define GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE 1 +#define GFX7_PIXEL_INTERPOLATOR_LOC_CENTROID 2 +#define GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET 3 + +/* HSW */ +#define HSW_DATAPORT_DC_PORT0_OWORD_BLOCK_READ 0 +#define HSW_DATAPORT_DC_PORT0_UNALIGNED_OWORD_BLOCK_READ 1 +#define HSW_DATAPORT_DC_PORT0_OWORD_DUAL_BLOCK_READ 2 +#define HSW_DATAPORT_DC_PORT0_DWORD_SCATTERED_READ 3 +#define HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ 4 +#define HSW_DATAPORT_DC_PORT0_MEMORY_FENCE 7 +#define 
HSW_DATAPORT_DC_PORT0_OWORD_BLOCK_WRITE 8 +#define HSW_DATAPORT_DC_PORT0_OWORD_DUAL_BLOCK_WRITE 10 +#define HSW_DATAPORT_DC_PORT0_DWORD_SCATTERED_WRITE 11 +#define HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE 12 + +#define HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ 1 +#define HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP 2 +#define HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2 3 +#define HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_READ 4 +#define HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ 5 +#define HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP 6 +#define HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2 7 +#define HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE 9 +#define HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_WRITE 10 +#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP 11 +#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2 12 +#define HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE 13 +#define GFX9_DATAPORT_DC_PORT1_A64_SCATTERED_READ 0x10 +#define GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ 0x11 +#define GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP 0x12 +#define GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_INT_OP 0x13 +#define GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ 0x14 +#define GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE 0x15 +#define GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE 0x19 +#define GFX8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE 0x1a +#define GFX9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP 0x1b +#define GFX9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP 0x1d +#define GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_FLOAT_OP 0x1e + +/* GFX9 */ +#define GFX9_DATAPORT_RC_RENDER_TARGET_WRITE 12 +#define GFX9_DATAPORT_RC_RENDER_TARGET_READ 13 + +/* A64 scattered message subtype */ +#define GFX8_A64_SCATTERED_SUBTYPE_BYTE 0 +#define GFX8_A64_SCATTERED_SUBTYPE_DWORD 1 +#define GFX8_A64_SCATTERED_SUBTYPE_QWORD 2 +#define GFX8_A64_SCATTERED_SUBTYPE_HWORD 3 + +/* Dataport special binding table indices: */ +#define BRW_BTI_STATELESS 255 +#define GFX7_BTI_SLM 254 + +#define 
HSW_BTI_STATELESS_LOCALLY_COHERENT 255 +#define HSW_BTI_STATELESS_NON_COHERENT 253 +#define HSW_BTI_STATELESS_GLOBALLY_COHERENT 252 +#define HSW_BTI_STATELESS_LLC_COHERENT 251 +#define HSW_BTI_STATELESS_L3_UNCACHED 250 + +/* The hardware docs are a bit contradictory here. On Haswell, where they + * first added cache ability control, there were 5 different cache modes (see + * HSW_BTI_STATELESS_* above). On Broadwell, they reduced to two: + * + * - IA-Coherent (BTI=255): Coherent within Gen and coherent within the + * entire IA cache memory hierarchy. + * + * - Non-Coherent (BTI=253): Coherent within Gen, same cache type. + * + * Information about stateless cache coherency can be found in the "A32 + * Stateless" section of the "3D Media GPGPU" volume of the PRM for each + * hardware generation. + * + * Unfortunately, the docs for MDC_STATELESS appear to have been copied and + * pasted from Haswell and give the Haswell definitions for the BTI values of + * 255 and 253 including a warning about accessing 253 surfaces from multiple + * threads. This seems to be a copy+paste error and the definitions from the + * "A32 Stateless" section should be trusted instead. + * + * Note that because the DRM sets bit 4 of HDC_CHICKEN0 on BDW, CHV and at + * least some pre-production steppings of SKL due to WaForceEnableNonCoherent, + * HDC memory access may have been overridden by the kernel to be non-coherent + * (matching the behavior of the same BTI on pre-Gfx8 hardware) and BTI 255 + * may actually be an alias for BTI 253. + */ +#define GFX8_BTI_STATELESS_IA_COHERENT 255 +#define GFX8_BTI_STATELESS_NON_COHERENT 253 +#define GFX9_BTI_BINDLESS 252 + +/* This ID doesn't map anything HW related value. It exists to inform the + * lowering code to not use the bindless heap. + */ +#define GFX125_NON_BINDLESS (1u << 16) + +/* Dataport atomic operations for Untyped Atomic Integer Operation message + * (and others). 
+ */ +#define BRW_AOP_AND 1 +#define BRW_AOP_OR 2 +#define BRW_AOP_XOR 3 +#define BRW_AOP_MOV 4 +#define BRW_AOP_INC 5 +#define BRW_AOP_DEC 6 +#define BRW_AOP_ADD 7 +#define BRW_AOP_SUB 8 +#define BRW_AOP_REVSUB 9 +#define BRW_AOP_IMAX 10 +#define BRW_AOP_IMIN 11 +#define BRW_AOP_UMAX 12 +#define BRW_AOP_UMIN 13 +#define BRW_AOP_CMPWR 14 +#define BRW_AOP_PREDEC 15 + +/* Dataport atomic operations for Untyped Atomic Float Operation message. */ +#define BRW_AOP_FMAX 1 +#define BRW_AOP_FMIN 2 +#define BRW_AOP_FCMPWR 3 +#define BRW_AOP_FADD 4 + +#define BRW_MATH_FUNCTION_INV 1 +#define BRW_MATH_FUNCTION_LOG 2 +#define BRW_MATH_FUNCTION_EXP 3 +#define BRW_MATH_FUNCTION_SQRT 4 +#define BRW_MATH_FUNCTION_RSQ 5 +#define BRW_MATH_FUNCTION_SIN 6 +#define BRW_MATH_FUNCTION_COS 7 +#define BRW_MATH_FUNCTION_SINCOS 8 /* gfx4, gfx5 */ +#define BRW_MATH_FUNCTION_FDIV 9 /* gfx6+ */ +#define BRW_MATH_FUNCTION_POW 10 +#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER 11 +#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT 12 +#define BRW_MATH_FUNCTION_INT_DIV_REMAINDER 13 +#define GFX8_MATH_FUNCTION_INVM 14 +#define GFX8_MATH_FUNCTION_RSQRTM 15 + +#define BRW_MATH_INTEGER_UNSIGNED 0 +#define BRW_MATH_INTEGER_SIGNED 1 + +#define BRW_MATH_PRECISION_FULL 0 +#define BRW_MATH_PRECISION_PARTIAL 1 + +#define BRW_MATH_SATURATE_NONE 0 +#define BRW_MATH_SATURATE_SATURATE 1 + +#define BRW_MATH_DATA_VECTOR 0 +#define BRW_MATH_DATA_SCALAR 1 + +#define BRW_URB_OPCODE_WRITE_HWORD 0 +#define BRW_URB_OPCODE_WRITE_OWORD 1 +#define BRW_URB_OPCODE_READ_HWORD 2 +#define BRW_URB_OPCODE_READ_OWORD 3 +#define GFX7_URB_OPCODE_ATOMIC_MOV 4 +#define GFX7_URB_OPCODE_ATOMIC_INC 5 +#define GFX8_URB_OPCODE_ATOMIC_ADD 6 +#define GFX8_URB_OPCODE_SIMD8_WRITE 7 +#define GFX8_URB_OPCODE_SIMD8_READ 8 +#define GFX125_URB_OPCODE_FENCE 9 + +#define BRW_URB_SWIZZLE_NONE 0 +#define BRW_URB_SWIZZLE_INTERLEAVE 1 +#define BRW_URB_SWIZZLE_TRANSPOSE 2 + +#define BRW_SCRATCH_SPACE_SIZE_1K 0 +#define BRW_SCRATCH_SPACE_SIZE_2K 1 
+#define BRW_SCRATCH_SPACE_SIZE_4K 2 +#define BRW_SCRATCH_SPACE_SIZE_8K 3 +#define BRW_SCRATCH_SPACE_SIZE_16K 4 +#define BRW_SCRATCH_SPACE_SIZE_32K 5 +#define BRW_SCRATCH_SPACE_SIZE_64K 6 +#define BRW_SCRATCH_SPACE_SIZE_128K 7 +#define BRW_SCRATCH_SPACE_SIZE_256K 8 +#define BRW_SCRATCH_SPACE_SIZE_512K 9 +#define BRW_SCRATCH_SPACE_SIZE_1M 10 +#define BRW_SCRATCH_SPACE_SIZE_2M 11 + +#define BRW_MESSAGE_GATEWAY_SFID_OPEN_GATEWAY 0 +#define BRW_MESSAGE_GATEWAY_SFID_CLOSE_GATEWAY 1 +#define BRW_MESSAGE_GATEWAY_SFID_FORWARD_MSG 2 +#define BRW_MESSAGE_GATEWAY_SFID_GET_TIMESTAMP 3 +#define BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG 4 +#define BRW_MESSAGE_GATEWAY_SFID_UPDATE_GATEWAY_STATE 5 +#define BRW_MESSAGE_GATEWAY_SFID_MMIO_READ_WRITE 6 + + +/* Gfx7 "GS URB Entry Allocation Size" is a U9-1 field, so the maximum gs_size + * is 2^9, or 512. It's counted in multiples of 64 bytes. + * + * Identical for VS, DS, and HS. + */ +#define GFX7_MAX_GS_URB_ENTRY_SIZE_BYTES (512*64) +#define GFX7_MAX_DS_URB_ENTRY_SIZE_BYTES (512*64) +#define GFX7_MAX_HS_URB_ENTRY_SIZE_BYTES (512*64) +#define GFX7_MAX_VS_URB_ENTRY_SIZE_BYTES (512*64) + +#define BRW_GS_EDGE_INDICATOR_0 (1 << 8) +#define BRW_GS_EDGE_INDICATOR_1 (1 << 9) + +/* Gfx6 "GS URB Entry Allocation Size" is defined as a number of 1024-bit + * (128 bytes) URB rows and the maximum allowed value is 5 rows. + */ +#define GFX6_MAX_GS_URB_ENTRY_SIZE_BYTES (5*128) + +/* GS Thread Payload + */ + +/* 3DSTATE_GS "Output Vertex Size" has an effective maximum of 62. It's + * counted in multiples of 16 bytes. 
+ */ +#define GFX7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES (62*16) + + +/* R0 */ +# define GFX7_GS_PAYLOAD_INSTANCE_ID_SHIFT 27 + +/* CR0.0[5:4] Floating-Point Rounding Modes + * Skylake PRM, Volume 7 Part 1, "Control Register", page 756 + */ + +#define BRW_CR0_RND_MODE_MASK 0x30 +#define BRW_CR0_RND_MODE_SHIFT 4 + +enum ENUM_PACKED brw_rnd_mode { + BRW_RND_MODE_RTNE = 0, /* Round to Nearest or Even */ + BRW_RND_MODE_RU = 1, /* Round Up, toward +inf */ + BRW_RND_MODE_RD = 2, /* Round Down, toward -inf */ + BRW_RND_MODE_RTZ = 3, /* Round Toward Zero */ + BRW_RND_MODE_UNSPECIFIED, /* Unspecified rounding mode */ +}; + +#define BRW_CR0_FP64_DENORM_PRESERVE (1 << 6) +#define BRW_CR0_FP32_DENORM_PRESERVE (1 << 7) +#define BRW_CR0_FP16_DENORM_PRESERVE (1 << 10) + +#define BRW_CR0_FP_MODE_MASK (BRW_CR0_FP64_DENORM_PRESERVE | \ + BRW_CR0_FP32_DENORM_PRESERVE | \ + BRW_CR0_FP16_DENORM_PRESERVE | \ + BRW_CR0_RND_MODE_MASK) + +/* MDC_DS - Data Size Message Descriptor Control Field + * Skylake PRM, Volume 2d, page 129 + * + * Specifies the number of Bytes to be read or written per Dword used at + * byte_scattered read/write and byte_scaled read/write messages. + */ +#define GFX7_BYTE_SCATTERED_DATA_ELEMENT_BYTE 0 +#define GFX7_BYTE_SCATTERED_DATA_ELEMENT_WORD 1 +#define GFX7_BYTE_SCATTERED_DATA_ELEMENT_DWORD 2 + +#define GEN_RT_BTD_MESSAGE_SPAWN 1 + +#define GEN_RT_TRACE_RAY_INITAL 0 +#define GEN_RT_TRACE_RAY_INSTANCE 1 +#define GEN_RT_TRACE_RAY_COMMIT 2 +#define GEN_RT_TRACE_RAY_CONTINUE 3 + +#define GEN_RT_BTD_SHADER_TYPE_ANY_HIT 0 +#define GEN_RT_BTD_SHADER_TYPE_CLOSEST_HIT 1 +#define GEN_RT_BTD_SHADER_TYPE_MISS 2 +#define GEN_RT_BTD_SHADER_TYPE_INTERSECTION 3 + +/* Starting with Xe-HPG, the old dataport was massively reworked dataport. + * The new thing, called Load/Store Cache or LSC, has a significantly improved + * interface. 
Instead of bespoke messages for every case, there's basically + * one or two messages with different bits to control things like address + * size, how much data is read/written, etc. It's way nicer but also means we + * get to rewrite all our dataport encoding/decoding code. This patch kicks + * off the party with all of the new enums. + */ +enum lsc_opcode { + LSC_OP_LOAD = 0, + LSC_OP_LOAD_CMASK = 2, + LSC_OP_STORE = 4, + LSC_OP_STORE_CMASK = 6, + LSC_OP_ATOMIC_INC = 8, + LSC_OP_ATOMIC_DEC = 9, + LSC_OP_ATOMIC_LOAD = 10, + LSC_OP_ATOMIC_STORE = 11, + LSC_OP_ATOMIC_ADD = 12, + LSC_OP_ATOMIC_SUB = 13, + LSC_OP_ATOMIC_MIN = 14, + LSC_OP_ATOMIC_MAX = 15, + LSC_OP_ATOMIC_UMIN = 16, + LSC_OP_ATOMIC_UMAX = 17, + LSC_OP_ATOMIC_CMPXCHG = 18, + LSC_OP_ATOMIC_FADD = 19, + LSC_OP_ATOMIC_FSUB = 20, + LSC_OP_ATOMIC_FMIN = 21, + LSC_OP_ATOMIC_FMAX = 22, + LSC_OP_ATOMIC_FCMPXCHG = 23, + LSC_OP_ATOMIC_AND = 24, + LSC_OP_ATOMIC_OR = 25, + LSC_OP_ATOMIC_XOR = 26, + LSC_OP_FENCE = 31 +}; + +/* + * Specifies the size of the dataport address payload in registers. + */ +enum ENUM_PACKED lsc_addr_reg_size { + LSC_ADDR_REG_SIZE_1 = 1, + LSC_ADDR_REG_SIZE_2 = 2, + LSC_ADDR_REG_SIZE_3 = 3, + LSC_ADDR_REG_SIZE_4 = 4, + LSC_ADDR_REG_SIZE_6 = 6, + LSC_ADDR_REG_SIZE_8 = 8, +}; + +/* + * Specifies the size of the address payload item in a dataport message. + */ +enum ENUM_PACKED lsc_addr_size { + LSC_ADDR_SIZE_A16 = 1, /* 16-bit address offset */ + LSC_ADDR_SIZE_A32 = 2, /* 32-bit address offset */ + LSC_ADDR_SIZE_A64 = 3, /* 64-bit address offset */ +}; + +/* + * Specifies the type of the address payload item in a dataport message. The + * address type specifies how the dataport message decodes the Extended + * Descriptor for the surface attributes and address calculation. 
+ */ +enum ENUM_PACKED lsc_addr_surface_type { + LSC_ADDR_SURFTYPE_FLAT = 0, /* Flat */ + LSC_ADDR_SURFTYPE_BSS = 1, /* Bindless surface state */ + LSC_ADDR_SURFTYPE_SS = 2, /* Surface state */ + LSC_ADDR_SURFTYPE_BTI = 3, /* Binding table index */ +}; + +/* + * Specifies the dataport message override to the default L1 and L3 memory + * cache policies. Dataport L1 cache policies are uncached (UC), cached (C), + * cache streaming (S) and invalidate-after-read (IAR). Dataport L3 cache + * policies are uncached (UC) and cached (C). + */ +enum lsc_cache_load { + /* No override. Use the non-pipelined state or surface state cache settings + * for L1 and L3. + */ + LSC_CACHE_LOAD_L1STATE_L3MOCS = 0, + /* Override to L1 uncached and L3 uncached */ + LSC_CACHE_LOAD_L1UC_L3UC = 1, + /* Override to L1 uncached and L3 cached */ + LSC_CACHE_LOAD_L1UC_L3C = 2, + /* Override to L1 cached and L3 uncached */ + LSC_CACHE_LOAD_L1C_L3UC = 3, + /* Override to cache at both L1 and L3 */ + LSC_CACHE_LOAD_L1C_L3C = 4, + /* Override to L1 streaming load and L3 uncached */ + LSC_CACHE_LOAD_L1S_L3UC = 5, + /* Override to L1 streaming load and L3 cached */ + LSC_CACHE_LOAD_L1S_L3C = 6, + /* For load messages, override to L1 invalidate-after-read, and L3 cached. */ + LSC_CACHE_LOAD_L1IAR_L3C = 7, +}; + +/* + * Specifies the dataport message override to the default L1 and L3 memory + * cache policies. Dataport L1 cache policies are uncached (UC), cached (C), + * streaming (S) and invalidate-after-read (IAR). Dataport L3 cache policies + * are uncached (UC), cached (C), cached-as-a-constand (CC) and + * invalidate-after-read (IAR). + */ +enum PACKED xe2_lsc_cache_load { + /* No override. Use the non-pipelined or surface state cache settings for L1 + * and L3. 
+ */ + XE2_LSC_CACHE_LOAD_L1STATE_L3MOCS = 0, + /* Override to L1 uncached and L3 uncached */ + XE2_LSC_CACHE_LOAD_L1UC_L3UC = 2, + /* Override to L1 uncached and L3 cached */ + XE2_LSC_CACHE_LOAD_L1UC_L3C = 4, + /* Override to L1 uncached and L3 cached as a constant */ + XE2_LSC_CACHE_LOAD_L1UC_L3CC = 5, + /* Override to L1 cached and L3 uncached */ + XE2_LSC_CACHE_LOAD_L1C_L3UC = 6, + /* Override to L1 cached and L3 cached */ + XE2_LSC_CACHE_LOAD_L1C_L3C = 8, + /* Override to L1 cached and L3 cached as a constant */ + XE2_LSC_CACHE_LOAD_L1C_L3CC = 9, + /* Override to L1 cached as streaming load and L3 uncached */ + XE2_LSC_CACHE_LOAD_L1S_L3UC = 10, + /* Override to L1 cached as streaming load and L3 cached */ + XE2_LSC_CACHE_LOAD_L1S_L3C = 12, + /* Override to L1 and L3 invalidate after read */ + XE2_LSC_CACHE_LOAD_L1IAR_L3IAR = 14, + +}; + +/* + * Specifies the dataport message override to the default L1 and L3 memory + * cache policies. Dataport L1 cache policies are uncached (UC), write-through + * (WT), write-back (WB) and streaming (S). Dataport L3 cache policies are + * uncached (UC) and cached (WB). + */ +enum ENUM_PACKED lsc_cache_store { + /* No override. Use the non-pipelined or surface state cache settings for L1 + * and L3. + */ + LSC_CACHE_STORE_L1STATE_L3MOCS = 0, + /* Override to L1 uncached and L3 uncached */ + LSC_CACHE_STORE_L1UC_L3UC = 1, + /* Override to L1 uncached and L3 cached */ + LSC_CACHE_STORE_L1UC_L3WB = 2, + /* Override to L1 write-through and L3 uncached */ + LSC_CACHE_STORE_L1WT_L3UC = 3, + /* Override to L1 write-through and L3 cached */ + LSC_CACHE_STORE_L1WT_L3WB = 4, + /* Override to L1 streaming and L3 uncached */ + LSC_CACHE_STORE_L1S_L3UC = 5, + /* Override to L1 streaming and L3 cached */ + LSC_CACHE_STORE_L1S_L3WB = 6, + /* Override to L1 write-back, and L3 cached */ + LSC_CACHE_STORE_L1WB_L3WB = 7, + +}; + +/* + * Specifies the dataport message override to the default L1 and L3 memory + * cache policies. 
Dataport L1 cache policies are uncached (UC), write-through + * (WT), write-back (WB) and streaming (S). Dataport L3 cache policies are + * uncached (UC) and cached (WB). + */ +enum PACKED xe2_lsc_cache_store { + /* No override. Use the non-pipelined or surface state cache settings for L1 + * and L3. + */ + XE2_LSC_CACHE_STORE_L1STATE_L3MOCS = 0, + /* Override to L1 uncached and L3 uncached */ + XE2_LSC_CACHE_STORE_L1UC_L3UC = 2, + /* Override to L1 uncached and L3 cached */ + XE2_LSC_CACHE_STORE_L1UC_L3WB = 4, + /* Override to L1 write-through and L3 uncached */ + XE2_LSC_CACHE_STORE_L1WT_L3UC = 6, + /* Override to L1 write-through and L3 cached */ + XE2_LSC_CACHE_STORE_L1WT_L3WB = 8, + /* Override to L1 streaming and L3 uncached */ + XE2_LSC_CACHE_STORE_L1S_L3UC = 10, + /* Override to L1 streaming and L3 cached */ + XE2_LSC_CACHE_STORE_L1S_L3WB = 12, + /* Override to L1 write-back and L3 cached */ + XE2_LSC_CACHE_STORE_L1WB_L3WB = 14, + +}; + +#define LSC_CACHE(devinfo, l_or_s, cc) \ + ((devinfo)->ver < 20 ? (unsigned)LSC_CACHE_ ## l_or_s ## _ ## cc : \ + (unsigned)XE2_LSC_CACHE_ ## l_or_s ## _ ## cc) + +/* + * Specifies which components of the data payload 4-element vector (X,Y,Z,W) is + * packed into the register payload. + */ +enum ENUM_PACKED lsc_cmask { + LSC_CMASK_X = 0x1, + LSC_CMASK_Y = 0x2, + LSC_CMASK_XY = 0x3, + LSC_CMASK_Z = 0x4, + LSC_CMASK_XZ = 0x5, + LSC_CMASK_YZ = 0x6, + LSC_CMASK_XYZ = 0x7, + LSC_CMASK_W = 0x8, + LSC_CMASK_XW = 0x9, + LSC_CMASK_YW = 0xa, + LSC_CMASK_XYW = 0xb, + LSC_CMASK_ZW = 0xc, + LSC_CMASK_XZW = 0xd, + LSC_CMASK_YZW = 0xe, + LSC_CMASK_XYZW = 0xf, +}; + +/* + * Specifies the size of the data payload item in a dataport message. + */ +enum ENUM_PACKED lsc_data_size { + /* 8-bit scalar data value in memory, packed into a 8-bit data value in + * register. + */ + LSC_DATA_SIZE_D8 = 0, + /* 16-bit scalar data value in memory, packed into a 16-bit data value in + * register. 
+ */ + LSC_DATA_SIZE_D16 = 1, + /* 32-bit scalar data value in memory, packed into 32-bit data value in + * register. + */ + LSC_DATA_SIZE_D32 = 2, + /* 64-bit scalar data value in memory, packed into 64-bit data value in + * register. + */ + LSC_DATA_SIZE_D64 = 3, + /* 8-bit scalar data value in memory, packed into 32-bit unsigned data value + * in register. + */ + LSC_DATA_SIZE_D8U32 = 4, + /* 16-bit scalar data value in memory, packed into 32-bit unsigned data + * value in register. + */ + LSC_DATA_SIZE_D16U32 = 5, + /* 16-bit scalar BigFloat data value in memory, packed into 32-bit float + * value in register. + */ + LSC_DATA_SIZE_D16BF32 = 6, +}; + +/* + * Enum specifies the scope of the fence. + */ +enum ENUM_PACKED lsc_fence_scope { + /* Wait until all previous memory transactions from this thread are observed + * within the local thread-group. + */ + LSC_FENCE_THREADGROUP = 0, + /* Wait until all previous memory transactions from this thread are observed + * within the local sub-slice. + */ + LSC_FENCE_LOCAL = 1, + /* Wait until all previous memory transactions from this thread are observed + * in the local tile. + */ + LSC_FENCE_TILE = 2, + /* Wait until all previous memory transactions from this thread are observed + * in the local GPU. + */ + LSC_FENCE_GPU = 3, + /* Wait until all previous memory transactions from this thread are observed + * across all GPUs in the system. + */ + LSC_FENCE_ALL_GPU = 4, + /* Wait until all previous memory transactions from this thread are observed + * at the "system" level. + */ + LSC_FENCE_SYSTEM_RELEASE = 5, + /* For GPUs that do not follow PCIe Write ordering for downstream writes + * targeting device memory, a fence message with scope=System_Acquire will + * commit to device memory all downstream and peer writes that have reached + * the device. + */ + LSC_FENCE_SYSTEM_ACQUIRE = 6, +}; + +/* + * Specifies the type of cache flush operation to perform after a fence is + * complete. 
+ */ +enum ENUM_PACKED lsc_flush_type { + LSC_FLUSH_TYPE_NONE = 0, + /* + * For a R/W cache, evict dirty lines (M to I state) and invalidate clean + * lines. For a RO cache, invalidate clean lines. + */ + LSC_FLUSH_TYPE_EVICT = 1, + /* + * For both R/W and RO cache, invalidate clean lines in the cache. + */ + LSC_FLUSH_TYPE_INVALIDATE = 2, + /* + * For a R/W cache, invalidate dirty lines (M to I state), without + * write-back to next level. This opcode does nothing for a RO cache. + */ + LSC_FLUSH_TYPE_DISCARD = 3, + /* + * For a R/W cache, write-back dirty lines to the next level, but kept in + * the cache as "clean" (M to V state). This opcode does nothing for a RO + * cache. + */ + LSC_FLUSH_TYPE_CLEAN = 4, + /* + * Flush "RW" section of the L3 cache, but leave L1 and L2 caches untouched. + */ + LSC_FLUSH_TYPE_L3ONLY = 5, + /* + * HW maps this flush type internally to NONE. + */ + LSC_FLUSH_TYPE_NONE_6 = 6, + +}; + +enum ENUM_PACKED lsc_backup_fence_routing { + /* Normal routing: UGM fence is routed to UGM pipeline. */ + LSC_NORMAL_ROUTING, + /* Route UGM fence to LSC unit. */ + LSC_ROUTE_TO_LSC, +}; + +/* + * Specifies the size of the vector in a dataport message. + */ +enum ENUM_PACKED lsc_vect_size { + LSC_VECT_SIZE_V1 = 0, /* vector length 1 */ + LSC_VECT_SIZE_V2 = 1, /* vector length 2 */ + LSC_VECT_SIZE_V3 = 2, /* Vector length 3 */ + LSC_VECT_SIZE_V4 = 3, /* Vector length 4 */ + LSC_VECT_SIZE_V8 = 4, /* Vector length 8 */ + LSC_VECT_SIZE_V16 = 5, /* Vector length 16 */ + LSC_VECT_SIZE_V32 = 6, /* Vector length 32 */ + LSC_VECT_SIZE_V64 = 7, /* Vector length 64 */ +}; + +#define LSC_ONE_ADDR_REG 1 + +#endif /* BRW_EU_DEFINES_H */ diff --git a/src/intel/compiler/elk/brw_eu_emit.c b/src/intel/compiler/elk/brw_eu_emit.c new file mode 100644 index 00000000000..74bd9c0ddbe --- /dev/null +++ b/src/intel/compiler/elk/brw_eu_emit.c @@ -0,0 +1,3770 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. 
+ Intel funded Tungsten Graphics to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + + +#include "brw_eu_defines.h" +#include "brw_eu.h" + +#include "util/ralloc.h" + +/** + * Prior to Sandybridge, the SEND instruction accepted non-MRF source + * registers, implicitly moving the operand to a message register. + * + * On Sandybridge, this is no longer the case. This function performs the + * explicit move; it should be called before emitting a SEND instruction. 
 + */ +void +gfx6_resolve_implied_move(struct brw_codegen *p, + struct brw_reg *src, + unsigned msg_reg_nr) +{ + const struct intel_device_info *devinfo = p->devinfo; + if (devinfo->ver < 6) + return; + + if (src->file == BRW_MESSAGE_REGISTER_FILE) + return; + + if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) { + assert(devinfo->ver < 12); + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD), + retype(*src, BRW_REGISTER_TYPE_UD)); + brw_pop_insn_state(p); + } + *src = brw_message_reg(msg_reg_nr); +} + +static void +gfx7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg) +{ + /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"): + * "The send with EOT should use register space R112-R127 for <src>. This is + * to enable loading of a new thread into the same slot while the message + * with EOT for current thread is pending dispatch." + * + * Since we're pretending to have 16 MRFs anyway, we may as well use the + * registers required for messages with EOT. + */ + const struct intel_device_info *devinfo = p->devinfo; + if (devinfo->ver >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) { + reg->file = BRW_GENERAL_REGISTER_FILE; + reg->nr += GFX7_MRF_HACK_START; + } +} + +void +brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest) +{ + const struct intel_device_info *devinfo = p->devinfo; + + if (dest.file == BRW_MESSAGE_REGISTER_FILE) + assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver)); + else if (dest.file == BRW_GENERAL_REGISTER_FILE) + assert(dest.nr < XE2_MAX_GRF); + + /* The hardware has a restriction where a destination of size Byte with + * a stride of 1 is only allowed for a packed byte MOV. 
For any other + * instruction, the stride must be at least 2, even when the destination + * is the NULL register. + */ + if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && + dest.nr == BRW_ARF_NULL && + type_sz(dest.type) == 1 && + dest.hstride == BRW_HORIZONTAL_STRIDE_1) { + dest.hstride = BRW_HORIZONTAL_STRIDE_2; + } + + gfx7_convert_mrf_to_grf(p, &dest); + + if (devinfo->ver >= 12 && + (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC)) { + assert(dest.file == BRW_GENERAL_REGISTER_FILE || + dest.file == BRW_ARCHITECTURE_REGISTER_FILE); + assert(dest.address_mode == BRW_ADDRESS_DIRECT); + assert(dest.subnr == 0); + assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 || + (dest.hstride == BRW_HORIZONTAL_STRIDE_1 && + dest.vstride == dest.width + 1)); + assert(!dest.negate && !dest.abs); + brw_inst_set_dst_reg_file(devinfo, inst, dest.file); + brw_inst_set_dst_da_reg_nr(devinfo, inst, phys_nr(devinfo, dest)); + + } else if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC) { + assert(devinfo->ver < 12); + assert(dest.file == BRW_GENERAL_REGISTER_FILE || + dest.file == BRW_ARCHITECTURE_REGISTER_FILE); + assert(dest.address_mode == BRW_ADDRESS_DIRECT); + assert(dest.subnr % 16 == 0); + assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 && + dest.vstride == dest.width + 1); + assert(!dest.negate && !dest.abs); + brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr); + brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16); + brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file); + } else { + brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type); + brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode); + + if (dest.address_mode == BRW_ADDRESS_DIRECT) { + brw_inst_set_dst_da_reg_nr(devinfo, inst, phys_nr(devinfo, dest)); + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + brw_inst_set_dst_da1_subreg_nr(devinfo, inst, 
phys_subnr(devinfo, dest)); + if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) + dest.hstride = BRW_HORIZONTAL_STRIDE_1; + brw_inst_set_dst_hstride(devinfo, inst, dest.hstride); + } else { + brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16); + brw_inst_set_da16_writemask(devinfo, inst, dest.writemask); + if (dest.file == BRW_GENERAL_REGISTER_FILE || + dest.file == BRW_MESSAGE_REGISTER_FILE) { + assert(dest.writemask != 0); + } + /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1: + * Although Dst.HorzStride is a don't care for Align16, HW needs + * this to be programmed as "01". + */ + brw_inst_set_dst_hstride(devinfo, inst, 1); + } + } else { + brw_inst_set_dst_ia_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest)); + + /* These are different sizes in align1 vs align16: + */ + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + brw_inst_set_dst_ia1_addr_imm(devinfo, inst, + dest.indirect_offset); + if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) + dest.hstride = BRW_HORIZONTAL_STRIDE_1; + brw_inst_set_dst_hstride(devinfo, inst, dest.hstride); + } else { + brw_inst_set_dst_ia16_addr_imm(devinfo, inst, + dest.indirect_offset); + /* even ignored in da16, still need to set as '01' */ + brw_inst_set_dst_hstride(devinfo, inst, 1); + } + } + } + + /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8) + * or 16 (SIMD16), as that's normally correct. However, when dealing with + * small registers, it can be useful for us to automatically reduce it to + * match the register size. + */ + if (p->automatic_exec_sizes) { + /* + * In platforms that support fp64 we can emit instructions with a width + * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In + * these cases we need to make sure that these instructions have their + * exec sizes set properly when they are emitted and we can't rely on + * this code to fix it. 
+ */ + bool fix_exec_size; + if (devinfo->ver >= 6) + fix_exec_size = dest.width < BRW_EXECUTE_4; + else + fix_exec_size = dest.width < BRW_EXECUTE_8; + + if (fix_exec_size) + brw_inst_set_exec_size(devinfo, inst, dest.width); + } +} + +void +brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) +{ + const struct intel_device_info *devinfo = p->devinfo; + + if (reg.file == BRW_MESSAGE_REGISTER_FILE) + assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver)); + else if (reg.file == BRW_GENERAL_REGISTER_FILE) + assert(reg.nr < XE2_MAX_GRF); + + gfx7_convert_mrf_to_grf(p, ®); + + if (devinfo->ver >= 6 && + (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC)) { + /* Any source modifiers or regions will be ignored, since this just + * identifies the MRF/GRF to start reading the message contents from. + * Check for some likely failures. 
+ */ + assert(!reg.negate); + assert(!reg.abs); + assert(reg.address_mode == BRW_ADDRESS_DIRECT); + } + + if (devinfo->ver >= 12 && + (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC)) { + assert(reg.file != BRW_IMMEDIATE_VALUE); + assert(reg.address_mode == BRW_ADDRESS_DIRECT); + assert(reg.subnr == 0); + assert(has_scalar_region(reg) || + (reg.hstride == BRW_HORIZONTAL_STRIDE_1 && + reg.vstride == reg.width + 1)); + assert(!reg.negate && !reg.abs); + brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file); + brw_inst_set_src0_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg)); + + } else if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC) { + assert(reg.file == BRW_GENERAL_REGISTER_FILE); + assert(reg.address_mode == BRW_ADDRESS_DIRECT); + assert(reg.subnr % 16 == 0); + assert(has_scalar_region(reg) || + (reg.hstride == BRW_HORIZONTAL_STRIDE_1 && + reg.vstride == reg.width + 1)); + assert(!reg.negate && !reg.abs); + brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr); + brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16); + } else { + brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type); + brw_inst_set_src0_abs(devinfo, inst, reg.abs); + brw_inst_set_src0_negate(devinfo, inst, reg.negate); + brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode); + + if (reg.file == BRW_IMMEDIATE_VALUE) { + if (reg.type == BRW_REGISTER_TYPE_DF || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_DIM) + brw_inst_set_imm_df(devinfo, inst, reg.df); + else if (reg.type == BRW_REGISTER_TYPE_UQ || + reg.type == BRW_REGISTER_TYPE_Q) + brw_inst_set_imm_uq(devinfo, inst, reg.u64); + else + brw_inst_set_imm_ud(devinfo, inst, reg.ud); + + if (devinfo->ver < 12 && type_sz(reg.type) < 8) { + brw_inst_set_src1_reg_file(devinfo, inst, + BRW_ARCHITECTURE_REGISTER_FILE); + brw_inst_set_src1_reg_hw_type(devinfo, inst, + brw_inst_src0_reg_hw_type(devinfo, 
inst)); + } + } else { + if (reg.address_mode == BRW_ADDRESS_DIRECT) { + brw_inst_set_src0_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg)); + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + brw_inst_set_src0_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg)); + } else { + brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16); + } + } else { + brw_inst_set_src0_ia_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg)); + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset); + } else { + brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset); + } + } + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + if (reg.width == BRW_WIDTH_1 && + brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) { + brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0); + brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1); + brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0); + } else { + brw_inst_set_src0_hstride(devinfo, inst, reg.hstride); + brw_inst_set_src0_width(devinfo, inst, reg.width); + brw_inst_set_src0_vstride(devinfo, inst, reg.vstride); + } + } else { + brw_inst_set_src0_da16_swiz_x(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X)); + brw_inst_set_src0_da16_swiz_y(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y)); + brw_inst_set_src0_da16_swiz_z(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z)); + brw_inst_set_src0_da16_swiz_w(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W)); + + if (reg.vstride == BRW_VERTICAL_STRIDE_8) { + /* This is an oddity of the fact we're using the same + * descriptions for registers in align_16 as align_1: + */ + brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); + } else if (devinfo->verx10 == 70 && + reg.type == BRW_REGISTER_TYPE_DF && + reg.vstride == BRW_VERTICAL_STRIDE_2) { + /* From SNB PRM: + * + * "For Align16 access mode, only encodings of 0000 and 0011 
+ * are allowed. Other codes are reserved." + * + * Presumably the DevSNB behavior applies to IVB as well. + */ + brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); + } else { + brw_inst_set_src0_vstride(devinfo, inst, reg.vstride); + } + } + } + } +} + + +void +brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) +{ + const struct intel_device_info *devinfo = p->devinfo; + + if (reg.file == BRW_GENERAL_REGISTER_FILE) + assert(reg.nr < XE2_MAX_GRF); + + if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC || + (devinfo->ver >= 12 && + (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC))) { + assert(reg.file == BRW_GENERAL_REGISTER_FILE || + reg.file == BRW_ARCHITECTURE_REGISTER_FILE); + assert(reg.address_mode == BRW_ADDRESS_DIRECT); + assert(reg.subnr == 0); + assert(has_scalar_region(reg) || + (reg.hstride == BRW_HORIZONTAL_STRIDE_1 && + reg.vstride == reg.width + 1)); + assert(!reg.negate && !reg.abs); + brw_inst_set_send_src1_reg_nr(devinfo, inst, phys_nr(devinfo, reg)); + brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file); + } else { + /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5: + * + * "Accumulator registers may be accessed explicitly as src0 + * operands only." + */ + assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE || + reg.nr != BRW_ARF_ACCUMULATOR); + + gfx7_convert_mrf_to_grf(p, ®); + assert(reg.file != BRW_MESSAGE_REGISTER_FILE); + + brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type); + brw_inst_set_src1_abs(devinfo, inst, reg.abs); + brw_inst_set_src1_negate(devinfo, inst, reg.negate); + + /* Only src1 can be immediate in two-argument instructions. 
+ */ + assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE); + + if (reg.file == BRW_IMMEDIATE_VALUE) { + /* two-argument instructions can only use 32-bit immediates */ + assert(type_sz(reg.type) < 8); + brw_inst_set_imm_ud(devinfo, inst, reg.ud); + } else { + /* This is a hardware restriction, which may or may not be lifted + * in the future: + */ + assert (reg.address_mode == BRW_ADDRESS_DIRECT); + /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */ + + brw_inst_set_src1_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg)); + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + brw_inst_set_src1_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg)); + } else { + brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16); + } + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + if (reg.width == BRW_WIDTH_1 && + brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) { + brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0); + brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1); + brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0); + } else { + brw_inst_set_src1_hstride(devinfo, inst, reg.hstride); + brw_inst_set_src1_width(devinfo, inst, reg.width); + brw_inst_set_src1_vstride(devinfo, inst, reg.vstride); + } + } else { + brw_inst_set_src1_da16_swiz_x(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X)); + brw_inst_set_src1_da16_swiz_y(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y)); + brw_inst_set_src1_da16_swiz_z(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z)); + brw_inst_set_src1_da16_swiz_w(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W)); + + if (reg.vstride == BRW_VERTICAL_STRIDE_8) { + /* This is an oddity of the fact we're using the same + * descriptions for registers in align_16 as align_1: + */ + brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); + } else if (devinfo->verx10 == 70 && + reg.type == BRW_REGISTER_TYPE_DF && + reg.vstride == 
BRW_VERTICAL_STRIDE_2) { + /* From SNB PRM: + * + * "For Align16 access mode, only encodings of 0000 and 0011 + * are allowed. Other codes are reserved." + * + * Presumably the DevSNB behavior applies to IVB as well. + */ + brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); + } else { + brw_inst_set_src1_vstride(devinfo, inst, reg.vstride); + } + } + } + } +} + +/** + * Specify the descriptor and extended descriptor immediate for a SEND(C) + * message instruction. + */ +void +brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst, + unsigned desc, unsigned ex_desc) +{ + const struct intel_device_info *devinfo = p->devinfo; + assert(brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC); + if (devinfo->ver < 12) + brw_inst_set_src1_file_type(devinfo, inst, + BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD); + brw_inst_set_send_desc(devinfo, inst, desc); + if (devinfo->ver >= 9) + brw_inst_set_send_ex_desc(devinfo, inst, ex_desc); +} + +static void brw_set_math_message( struct brw_codegen *p, + brw_inst *inst, + unsigned function, + unsigned integer_type, + bool low_precision, + unsigned dataType ) +{ + const struct intel_device_info *devinfo = p->devinfo; + unsigned msg_length; + unsigned response_length; + + /* Infer message length from the function */ + switch (function) { + case BRW_MATH_FUNCTION_POW: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT: + case BRW_MATH_FUNCTION_INT_DIV_REMAINDER: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: + msg_length = 2; + break; + default: + msg_length = 1; + break; + } + + /* Infer response length from the function */ + switch (function) { + case BRW_MATH_FUNCTION_SINCOS: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: + response_length = 2; + break; + default: + response_length = 1; + break; + } + + brw_set_desc(p, inst, brw_message_desc( + devinfo, msg_length, response_length, false)); + + brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH); + 
brw_inst_set_math_msg_function(devinfo, inst, function); + brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type); + brw_inst_set_math_msg_precision(devinfo, inst, low_precision); + brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst)); + brw_inst_set_math_msg_data_type(devinfo, inst, dataType); + brw_inst_set_saturate(devinfo, inst, 0); +} + + +static void brw_set_ff_sync_message(struct brw_codegen *p, + brw_inst *insn, + bool allocate, + unsigned response_length, + bool end_of_thread) +{ + const struct intel_device_info *devinfo = p->devinfo; + + brw_set_desc(p, insn, brw_message_desc( + devinfo, 1, response_length, true)); + + brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB); + brw_inst_set_eot(devinfo, insn, end_of_thread); + brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */ + brw_inst_set_urb_allocate(devinfo, insn, allocate); + /* The following fields are not used by FF_SYNC: */ + brw_inst_set_urb_global_offset(devinfo, insn, 0); + brw_inst_set_urb_swizzle_control(devinfo, insn, 0); + brw_inst_set_urb_used(devinfo, insn, 0); + brw_inst_set_urb_complete(devinfo, insn, 0); +} + +static void brw_set_urb_message( struct brw_codegen *p, + brw_inst *insn, + enum brw_urb_write_flags flags, + unsigned msg_length, + unsigned response_length, + unsigned offset, + unsigned swizzle_control ) +{ + const struct intel_device_info *devinfo = p->devinfo; + + assert(devinfo->ver < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE); + assert(devinfo->ver < 7 || !(flags & BRW_URB_WRITE_ALLOCATE)); + assert(devinfo->ver >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET)); + + brw_set_desc(p, insn, brw_message_desc( + devinfo, msg_length, response_length, true)); + + brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB); + brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT)); + + if (flags & BRW_URB_WRITE_OWORD) { + assert(msg_length == 2); /* header + one OWORD of data */ + brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD); 
+ } else { + brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD); + } + + brw_inst_set_urb_global_offset(devinfo, insn, offset); + brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control); + + if (devinfo->ver < 8) { + brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE)); + } + + if (devinfo->ver < 7) { + brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE)); + brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED)); + } else { + brw_inst_set_urb_per_slot_offset(devinfo, insn, + !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET)); + } +} + +static void +gfx7_set_dp_scratch_message(struct brw_codegen *p, + brw_inst *inst, + bool write, + bool dword, + bool invalidate_after_read, + unsigned num_regs, + unsigned addr_offset, + unsigned mlen, + unsigned rlen, + bool header_present) +{ + const struct intel_device_info *devinfo = p->devinfo; + assert(num_regs == 1 || num_regs == 2 || num_regs == 4 || + (devinfo->ver >= 8 && num_regs == 8)); + const unsigned block_size = (devinfo->ver >= 8 ? 
util_logbase2(num_regs) : + num_regs - 1); + + brw_set_desc(p, inst, brw_message_desc( + devinfo, mlen, rlen, header_present)); + + brw_inst_set_sfid(devinfo, inst, GFX7_SFID_DATAPORT_DATA_CACHE); + brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */ + brw_inst_set_scratch_read_write(devinfo, inst, write); + brw_inst_set_scratch_type(devinfo, inst, dword); + brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read); + brw_inst_set_scratch_block_size(devinfo, inst, block_size); + brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset); +} + +static void +brw_inst_set_state(const struct brw_isa_info *isa, + brw_inst *insn, + const struct brw_insn_state *state) +{ + const struct intel_device_info *devinfo = isa->devinfo; + + brw_inst_set_exec_size(devinfo, insn, state->exec_size); + brw_inst_set_group(devinfo, insn, state->group); + brw_inst_set_compression(devinfo, insn, state->compressed); + brw_inst_set_access_mode(devinfo, insn, state->access_mode); + brw_inst_set_mask_control(devinfo, insn, state->mask_control); + if (devinfo->ver >= 12) + brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(devinfo, state->swsb)); + brw_inst_set_saturate(devinfo, insn, state->saturate); + brw_inst_set_pred_control(devinfo, insn, state->predicate); + brw_inst_set_pred_inv(devinfo, insn, state->pred_inv); + + if (is_3src(isa, brw_inst_opcode(isa, insn)) && + state->access_mode == BRW_ALIGN_16) { + brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2); + if (devinfo->ver >= 7) + brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2); + } else { + brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2); + if (devinfo->ver >= 7) + brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2); + } + + if (devinfo->ver >= 6 && devinfo->ver < 20) + brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control); +} + +static brw_inst * +brw_append_insns(struct brw_codegen *p, 
unsigned nr_insn, unsigned alignment) +{ + assert(util_is_power_of_two_or_zero(sizeof(brw_inst))); + assert(util_is_power_of_two_or_zero(alignment)); + const unsigned align_insn = MAX2(alignment / sizeof(brw_inst), 1); + const unsigned start_insn = ALIGN(p->nr_insn, align_insn); + const unsigned new_nr_insn = start_insn + nr_insn; + + if (p->store_size < new_nr_insn) { + p->store_size = util_next_power_of_two(new_nr_insn * sizeof(brw_inst)); + p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size); + } + + /* Memset any padding due to alignment to 0. We don't want to be hashing + * or caching a bunch of random bits we got from a memory allocation. + */ + if (p->nr_insn < start_insn) { + memset(&p->store[p->nr_insn], 0, + (start_insn - p->nr_insn) * sizeof(brw_inst)); + } + + assert(p->next_insn_offset == p->nr_insn * sizeof(brw_inst)); + p->nr_insn = new_nr_insn; + p->next_insn_offset = new_nr_insn * sizeof(brw_inst); + + return &p->store[start_insn]; +} + +void +brw_realign(struct brw_codegen *p, unsigned alignment) +{ + brw_append_insns(p, 0, alignment); +} + +int +brw_append_data(struct brw_codegen *p, void *data, + unsigned size, unsigned alignment) +{ + unsigned nr_insn = DIV_ROUND_UP(size, sizeof(brw_inst)); + void *dst = brw_append_insns(p, nr_insn, alignment); + memcpy(dst, data, size); + + /* If it's not a whole number of instructions, memset the end */ + if (size < nr_insn * sizeof(brw_inst)) + memset(dst + size, 0, nr_insn * sizeof(brw_inst) - size); + + return dst - (void *)p->store; +} + +#define next_insn brw_next_insn +brw_inst * +brw_next_insn(struct brw_codegen *p, unsigned opcode) +{ + brw_inst *insn = brw_append_insns(p, 1, sizeof(brw_inst)); + + memset(insn, 0, sizeof(*insn)); + brw_inst_set_opcode(p->isa, insn, opcode); + + /* Apply the default instruction state */ + brw_inst_set_state(p->isa, insn, p->current); + + return insn; +} + +void +brw_add_reloc(struct brw_codegen *p, uint32_t id, + enum brw_shader_reloc_type type, + 
uint32_t offset, uint32_t delta) +{ + if (p->num_relocs + 1 > p->reloc_array_size) { + p->reloc_array_size = MAX2(16, p->reloc_array_size * 2); + p->relocs = reralloc(p->mem_ctx, p->relocs, + struct brw_shader_reloc, p->reloc_array_size); + } + + p->relocs[p->num_relocs++] = (struct brw_shader_reloc) { + .id = id, + .type = type, + .offset = offset, + .delta = delta, + }; +} + +static brw_inst * +brw_alu1(struct brw_codegen *p, unsigned opcode, + struct brw_reg dest, struct brw_reg src) +{ + brw_inst *insn = next_insn(p, opcode); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src); + return insn; +} + +static brw_inst * +brw_alu2(struct brw_codegen *p, unsigned opcode, + struct brw_reg dest, struct brw_reg src0, struct brw_reg src1) +{ + /* 64-bit immediates are only supported on 1-src instructions */ + assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4); + assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4); + + brw_inst *insn = next_insn(p, opcode); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, src1); + return insn; +} + +static int +get_3src_subreg_nr(struct brw_reg reg) +{ + /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions + * use 32-bit units (components 0..7). Since they only support F/D/UD + * types, this doesn't lose any flexibility, but uses fewer bits. 
+ */ + return reg.subnr / 4; +} + +static enum gfx10_align1_3src_vertical_stride +to_3src_align1_vstride(const struct intel_device_info *devinfo, + enum brw_vertical_stride vstride) +{ + switch (vstride) { + case BRW_VERTICAL_STRIDE_0: + return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0; + case BRW_VERTICAL_STRIDE_1: + assert(devinfo->ver >= 12); + return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1; + case BRW_VERTICAL_STRIDE_2: + assert(devinfo->ver < 12); + return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2; + case BRW_VERTICAL_STRIDE_4: + return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4; + case BRW_VERTICAL_STRIDE_8: + case BRW_VERTICAL_STRIDE_16: + return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8; + default: + unreachable("invalid vstride"); + } +} + + +static enum gfx10_align1_3src_src_horizontal_stride +to_3src_align1_hstride(enum brw_horizontal_stride hstride) +{ + switch (hstride) { + case BRW_HORIZONTAL_STRIDE_0: + return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0; + case BRW_HORIZONTAL_STRIDE_1: + return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1; + case BRW_HORIZONTAL_STRIDE_2: + return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2; + case BRW_HORIZONTAL_STRIDE_4: + return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4; + default: + unreachable("invalid hstride"); + } +} + +static brw_inst * +brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1, struct brw_reg src2) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *inst = next_insn(p, opcode); + + gfx7_convert_mrf_to_grf(p, &dest); + + assert(dest.nr < XE2_MAX_GRF); + + if (devinfo->ver >= 10) + assert(!(src0.file == BRW_IMMEDIATE_VALUE && + src2.file == BRW_IMMEDIATE_VALUE)); + + assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < XE2_MAX_GRF); + assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < XE2_MAX_GRF); + assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < XE2_MAX_GRF); + assert(dest.address_mode == BRW_ADDRESS_DIRECT); + assert(src0.address_mode == BRW_ADDRESS_DIRECT); + 
assert(src1.address_mode == BRW_ADDRESS_DIRECT); + assert(src2.address_mode == BRW_ADDRESS_DIRECT); + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + assert(dest.file == BRW_GENERAL_REGISTER_FILE || + (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && + dest.nr == BRW_ARF_ACCUMULATOR)); + + if (devinfo->ver >= 12) { + brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file); + brw_inst_set_3src_dst_reg_nr(devinfo, inst, phys_nr(devinfo, dest)); + } else { + if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) { + brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, + BRW_ALIGN1_3SRC_ACCUMULATOR); + brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR); + } else { + brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE); + brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr); + } + } + brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest) / 8); + + brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1); + + if (brw_reg_type_is_floating_point(dest.type)) { + brw_inst_set_3src_a1_exec_type(devinfo, inst, + BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT); + } else { + brw_inst_set_3src_a1_exec_type(devinfo, inst, + BRW_ALIGN1_3SRC_EXEC_TYPE_INT); + } + + brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type); + brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type); + brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type); + brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type); + + if (src0.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud); + } else { + brw_inst_set_3src_a1_src0_vstride( + devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride)); + brw_inst_set_3src_a1_src0_hstride(devinfo, inst, + to_3src_align1_hstride(src0.hstride)); + brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, phys_subnr(devinfo, src0)); + if (src0.type == BRW_REGISTER_TYPE_NF) { + brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR); + 
} else { + brw_inst_set_3src_src0_reg_nr(devinfo, inst, phys_nr(devinfo, src0)); + } + brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs); + brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate); + } + brw_inst_set_3src_a1_src1_vstride( + devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride)); + brw_inst_set_3src_a1_src1_hstride(devinfo, inst, + to_3src_align1_hstride(src1.hstride)); + + brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, phys_subnr(devinfo, src1)); + if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) { + brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR); + } else { + brw_inst_set_3src_src1_reg_nr(devinfo, inst, phys_nr(devinfo, src1)); + } + brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs); + brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate); + + if (src2.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud); + } else { + brw_inst_set_3src_a1_src2_hstride(devinfo, inst, + to_3src_align1_hstride(src2.hstride)); + /* no vstride on src2 */ + brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, phys_subnr(devinfo, src2)); + brw_inst_set_3src_src2_reg_nr(devinfo, inst, phys_nr(devinfo, src2)); + brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs); + brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate); + } + + assert(src0.file == BRW_GENERAL_REGISTER_FILE || + src0.file == BRW_IMMEDIATE_VALUE || + (src0.file == BRW_ARCHITECTURE_REGISTER_FILE && + src0.type == BRW_REGISTER_TYPE_NF)); + assert(src1.file == BRW_GENERAL_REGISTER_FILE || + (src1.file == BRW_ARCHITECTURE_REGISTER_FILE && + src1.nr == BRW_ARF_ACCUMULATOR)); + assert(src2.file == BRW_GENERAL_REGISTER_FILE || + src2.file == BRW_IMMEDIATE_VALUE); + + if (devinfo->ver >= 12) { + if (src0.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1); + } else { + brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file); + } + + brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file); + + if 
(src2.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1); + } else { + brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file); + } + } else { + brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, + src0.file == BRW_GENERAL_REGISTER_FILE ? + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE : + BRW_ALIGN1_3SRC_IMMEDIATE_VALUE); + brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, + src1.file == BRW_GENERAL_REGISTER_FILE ? + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE : + BRW_ALIGN1_3SRC_ACCUMULATOR); + brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, + src2.file == BRW_GENERAL_REGISTER_FILE ? + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE : + BRW_ALIGN1_3SRC_IMMEDIATE_VALUE); + } + + } else { + assert(dest.file == BRW_GENERAL_REGISTER_FILE || + dest.file == BRW_MESSAGE_REGISTER_FILE); + assert(dest.type == BRW_REGISTER_TYPE_F || + dest.type == BRW_REGISTER_TYPE_DF || + dest.type == BRW_REGISTER_TYPE_D || + dest.type == BRW_REGISTER_TYPE_UD || + (dest.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 8)); + if (devinfo->ver == 6) { + brw_inst_set_3src_a16_dst_reg_file(devinfo, inst, + dest.file == BRW_MESSAGE_REGISTER_FILE); + } + brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr); + brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4); + brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask); + + assert(src0.file == BRW_GENERAL_REGISTER_FILE); + brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle); + brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0)); + brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr); + brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs); + brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate); + brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst, + src0.vstride == BRW_VERTICAL_STRIDE_0); + + assert(src1.file == BRW_GENERAL_REGISTER_FILE); + brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle); + brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, 
get_3src_subreg_nr(src1)); + brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr); + brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs); + brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate); + brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst, + src1.vstride == BRW_VERTICAL_STRIDE_0); + + assert(src2.file == BRW_GENERAL_REGISTER_FILE); + brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle); + brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2)); + brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr); + brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs); + brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate); + brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst, + src2.vstride == BRW_VERTICAL_STRIDE_0); + + if (devinfo->ver >= 7) { + /* Set both the source and destination types based on dest.type, + * ignoring the source register types. The MAD and LRP emitters ensure + * that all four types are float. The BFE and BFI2 emitters, however, + * may send us mixed D and UD types and want us to ignore that and use + * the destination type. + */ + brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type); + brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type); + + /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType: + * + * "Three source instructions can use operands with mixed-mode + * precision. When SrcType field is set to :f or :hf it defines + * precision for source 0 only, and fields Src1Type and Src2Type + * define precision for other source operands: + * + * 0b = :f. Single precision Float (32-bit). + * 1b = :hf. Half precision Float (16-bit)." 
+ */ + if (src1.type == BRW_REGISTER_TYPE_HF) + brw_inst_set_3src_a16_src1_type(devinfo, inst, 1); + + if (src2.type == BRW_REGISTER_TYPE_HF) + brw_inst_set_3src_a16_src2_type(devinfo, inst, 1); + } + } + + return inst; +} + +static brw_inst * +brw_dpas_three_src(struct brw_codegen *p, enum gfx12_systolic_depth opcode, + unsigned sdepth, unsigned rcount, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1, struct brw_reg src2) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *inst = next_insn(p, opcode); + + assert(dest.file == BRW_GENERAL_REGISTER_FILE); + brw_inst_set_dpas_3src_dst_reg_file(devinfo, inst, + BRW_GENERAL_REGISTER_FILE); + brw_inst_set_dpas_3src_dst_reg_nr(devinfo, inst, dest.nr); + brw_inst_set_dpas_3src_dst_subreg_nr(devinfo, inst, dest.subnr); + + if (brw_reg_type_is_floating_point(dest.type)) { + brw_inst_set_dpas_3src_exec_type(devinfo, inst, + BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT); + } else { + brw_inst_set_dpas_3src_exec_type(devinfo, inst, + BRW_ALIGN1_3SRC_EXEC_TYPE_INT); + } + + brw_inst_set_dpas_3src_sdepth(devinfo, inst, sdepth); + brw_inst_set_dpas_3src_rcount(devinfo, inst, rcount - 1); + + brw_inst_set_dpas_3src_dst_type(devinfo, inst, dest.type); + brw_inst_set_dpas_3src_src0_type(devinfo, inst, src0.type); + brw_inst_set_dpas_3src_src1_type(devinfo, inst, src1.type); + brw_inst_set_dpas_3src_src2_type(devinfo, inst, src2.type); + + assert(src0.file == BRW_GENERAL_REGISTER_FILE || + (src0.file == BRW_ARCHITECTURE_REGISTER_FILE && + src0.nr == BRW_ARF_NULL)); + + brw_inst_set_dpas_3src_src0_reg_file(devinfo, inst, src0.file); + brw_inst_set_dpas_3src_src0_reg_nr(devinfo, inst, src0.nr); + brw_inst_set_dpas_3src_src0_subreg_nr(devinfo, inst, src0.subnr); + + assert(src1.file == BRW_GENERAL_REGISTER_FILE); + + brw_inst_set_dpas_3src_src1_reg_file(devinfo, inst, src1.file); + brw_inst_set_dpas_3src_src1_reg_nr(devinfo, inst, src1.nr); + brw_inst_set_dpas_3src_src1_subreg_nr(devinfo, inst, src1.subnr); + 
brw_inst_set_dpas_3src_src1_subbyte(devinfo, inst, BRW_SUB_BYTE_PRECISION_NONE); + + assert(src2.file == BRW_GENERAL_REGISTER_FILE); + + brw_inst_set_dpas_3src_src2_reg_file(devinfo, inst, src2.file); + brw_inst_set_dpas_3src_src2_reg_nr(devinfo, inst, src2.nr); + brw_inst_set_dpas_3src_src2_subreg_nr(devinfo, inst, src2.subnr); + brw_inst_set_dpas_3src_src2_subbyte(devinfo, inst, BRW_SUB_BYTE_PRECISION_NONE); + + return inst; +} + +/*********************************************************************** + * Convenience routines. + */ +#define ALU1(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0) \ +{ \ + return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \ +} + +#define ALU2(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1) \ +{ \ + return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \ +} + +#define ALU3(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1, \ + struct brw_reg src2) \ +{ \ + if (p->current->access_mode == BRW_ALIGN_16) { \ + if (src0.vstride == BRW_VERTICAL_STRIDE_0) \ + src0.swizzle = BRW_SWIZZLE_XXXX; \ + if (src1.vstride == BRW_VERTICAL_STRIDE_0) \ + src1.swizzle = BRW_SWIZZLE_XXXX; \ + if (src2.vstride == BRW_VERTICAL_STRIDE_0) \ + src2.swizzle = BRW_SWIZZLE_XXXX; \ + } \ + return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \ +} + +#define ALU3F(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1, \ + struct brw_reg src2) \ +{ \ + assert(dest.type == BRW_REGISTER_TYPE_F || \ + dest.type == BRW_REGISTER_TYPE_DF); \ + if (dest.type == BRW_REGISTER_TYPE_F) { \ + assert(src0.type == BRW_REGISTER_TYPE_F); \ + assert(src1.type == BRW_REGISTER_TYPE_F); \ + assert(src2.type == BRW_REGISTER_TYPE_F); \ + } else if (dest.type == BRW_REGISTER_TYPE_DF) { \ + assert(src0.type == 
BRW_REGISTER_TYPE_DF); \ + assert(src1.type == BRW_REGISTER_TYPE_DF); \ + assert(src2.type == BRW_REGISTER_TYPE_DF); \ + } \ + \ + if (p->current->access_mode == BRW_ALIGN_16) { \ + if (src0.vstride == BRW_VERTICAL_STRIDE_0) \ + src0.swizzle = BRW_SWIZZLE_XXXX; \ + if (src1.vstride == BRW_VERTICAL_STRIDE_0) \ + src1.swizzle = BRW_SWIZZLE_XXXX; \ + if (src2.vstride == BRW_VERTICAL_STRIDE_0) \ + src2.swizzle = BRW_SWIZZLE_XXXX; \ + } \ + return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \ +} + +ALU2(SEL) +ALU1(NOT) +ALU2(AND) +ALU2(OR) +ALU2(XOR) +ALU2(SHR) +ALU2(SHL) +ALU1(DIM) +ALU2(ASR) +ALU2(ROL) +ALU2(ROR) +ALU3(CSEL) +ALU1(FRC) +ALU1(RNDD) +ALU1(RNDE) +ALU1(RNDU) +ALU1(RNDZ) +ALU2(MAC) +ALU2(MACH) +ALU1(LZD) +ALU2(DP4) +ALU2(DPH) +ALU2(DP3) +ALU2(DP2) +ALU3(DP4A) +ALU3(MAD) +ALU3F(LRP) +ALU1(BFREV) +ALU3(BFE) +ALU2(BFI1) +ALU3(BFI2) +ALU1(FBH) +ALU1(FBL) +ALU1(CBIT) +ALU2(ADDC) +ALU2(SUBB) +ALU3(ADD3) + +brw_inst * +brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0) +{ + const struct intel_device_info *devinfo = p->devinfo; + + /* When converting F->DF on IVB/BYT, every odd source channel is ignored. + * To avoid the problems that causes, we use an source region to + * read each element twice. 
+ */ + if (devinfo->verx10 == 70 && + brw_get_default_access_mode(p) == BRW_ALIGN_1 && + dest.type == BRW_REGISTER_TYPE_DF && + (src0.type == BRW_REGISTER_TYPE_F || + src0.type == BRW_REGISTER_TYPE_D || + src0.type == BRW_REGISTER_TYPE_UD) && + !has_scalar_region(src0)) { + assert(src0.vstride == src0.width + src0.hstride); + src0.vstride = src0.hstride; + src0.width = BRW_WIDTH_2; + src0.hstride = BRW_HORIZONTAL_STRIDE_0; + } + + return brw_alu1(p, BRW_OPCODE_MOV, dest, src0); +} + +brw_inst * +brw_ADD(struct brw_codegen *p, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1) +{ + /* 6.2.2: add */ + if (src0.type == BRW_REGISTER_TYPE_F || + (src0.file == BRW_IMMEDIATE_VALUE && + src0.type == BRW_REGISTER_TYPE_VF)) { + assert(src1.type != BRW_REGISTER_TYPE_UD); + assert(src1.type != BRW_REGISTER_TYPE_D); + } + + if (src1.type == BRW_REGISTER_TYPE_F || + (src1.file == BRW_IMMEDIATE_VALUE && + src1.type == BRW_REGISTER_TYPE_VF)) { + assert(src0.type != BRW_REGISTER_TYPE_UD); + assert(src0.type != BRW_REGISTER_TYPE_D); + } + + return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1); +} + +brw_inst * +brw_AVG(struct brw_codegen *p, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1) +{ + assert(dest.type == src0.type); + assert(src0.type == src1.type); + switch (src0.type) { + case BRW_REGISTER_TYPE_B: + case BRW_REGISTER_TYPE_UB: + case BRW_REGISTER_TYPE_W: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_D: + case BRW_REGISTER_TYPE_UD: + break; + default: + unreachable("Bad type for brw_AVG"); + } + + return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1); +} + +brw_inst * +brw_MUL(struct brw_codegen *p, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1) +{ + /* 6.32.38: mul */ + if (src0.type == BRW_REGISTER_TYPE_D || + src0.type == BRW_REGISTER_TYPE_UD || + src1.type == BRW_REGISTER_TYPE_D || + src1.type == BRW_REGISTER_TYPE_UD) { + assert(dest.type != BRW_REGISTER_TYPE_F); + } + + if (src0.type == BRW_REGISTER_TYPE_F || + 
(src0.file == BRW_IMMEDIATE_VALUE && + src0.type == BRW_REGISTER_TYPE_VF)) { + assert(src1.type != BRW_REGISTER_TYPE_UD); + assert(src1.type != BRW_REGISTER_TYPE_D); + } + + if (src1.type == BRW_REGISTER_TYPE_F || + (src1.file == BRW_IMMEDIATE_VALUE && + src1.type == BRW_REGISTER_TYPE_VF)) { + assert(src0.type != BRW_REGISTER_TYPE_UD); + assert(src0.type != BRW_REGISTER_TYPE_D); + } + + assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE || + src0.nr != BRW_ARF_ACCUMULATOR); + assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE || + src1.nr != BRW_ARF_ACCUMULATOR); + + return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1); +} + +brw_inst * +brw_LINE(struct brw_codegen *p, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1) +{ + src0.vstride = BRW_VERTICAL_STRIDE_0; + src0.width = BRW_WIDTH_1; + src0.hstride = BRW_HORIZONTAL_STRIDE_0; + return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1); +} + +brw_inst * +brw_PLN(struct brw_codegen *p, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1) +{ + src0.vstride = BRW_VERTICAL_STRIDE_0; + src0.width = BRW_WIDTH_1; + src0.hstride = BRW_HORIZONTAL_STRIDE_0; + src1.vstride = BRW_VERTICAL_STRIDE_8; + src1.width = BRW_WIDTH_8; + src1.hstride = BRW_HORIZONTAL_STRIDE_1; + return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1); +} + +brw_inst * +brw_DPAS(struct brw_codegen *p, enum gfx12_systolic_depth sdepth, + unsigned rcount, struct brw_reg dest, struct brw_reg src0, + struct brw_reg src1, struct brw_reg src2) +{ + return brw_dpas_three_src(p, BRW_OPCODE_DPAS, sdepth, rcount, dest, src0, + src1, src2); +} + +brw_inst * +brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src) +{ + assert(p->devinfo->ver == 7); + + /* The F32TO16 instruction doesn't support 32-bit destination types in + * Align1 mode. Gfx7 (only) does zero out the high 16 bits in Align16 + * mode as an undocumented feature. 
+ */ + if (BRW_ALIGN_16 == brw_get_default_access_mode(p)) { + assert(dst.type == BRW_REGISTER_TYPE_UD); + } else { + assert(dst.type == BRW_REGISTER_TYPE_W || + dst.type == BRW_REGISTER_TYPE_UW); + } + + return brw_alu1(p, BRW_OPCODE_F32TO16, dst, src); +} + +brw_inst * +brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src) +{ + assert(p->devinfo->ver == 7); + + if (BRW_ALIGN_16 == brw_get_default_access_mode(p)) { + assert(src.type == BRW_REGISTER_TYPE_UD); + } else { + /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32: + * + * Because this instruction does not have a 16-bit floating-point + * type, the source data type must be Word (W). The destination type + * must be F (Float). + */ + assert(src.type == BRW_REGISTER_TYPE_W || + src.type == BRW_REGISTER_TYPE_UW); + } + + return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src); +} + + +void brw_NOP(struct brw_codegen *p) +{ + brw_inst *insn = next_insn(p, BRW_OPCODE_NOP); + memset(insn, 0, sizeof(*insn)); + brw_inst_set_opcode(p->isa, insn, BRW_OPCODE_NOP); +} + +void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func) +{ + brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC); + brw_inst_set_cond_modifier(p->devinfo, insn, func); +} + +/*********************************************************************** + * Comparisons, if/else/endif + */ + +brw_inst * +brw_JMPI(struct brw_codegen *p, struct brw_reg index, + unsigned predicate_control) +{ + const struct intel_device_info *devinfo = p->devinfo; + struct brw_reg ip = brw_ip_reg(); + brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index); + + brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1); + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE); + brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE); + brw_inst_set_pred_control(devinfo, inst, predicate_control); + + return inst; +} + +static void +push_if_stack(struct brw_codegen *p, brw_inst *inst) +{ + p->if_stack[p->if_stack_depth] = inst - p->store; + + 
p->if_stack_depth++; + if (p->if_stack_array_size <= p->if_stack_depth) { + p->if_stack_array_size *= 2; + p->if_stack = reralloc(p->mem_ctx, p->if_stack, int, + p->if_stack_array_size); + } +} + +static brw_inst * +pop_if_stack(struct brw_codegen *p) +{ + p->if_stack_depth--; + return &p->store[p->if_stack[p->if_stack_depth]]; +} + +static void +push_loop_stack(struct brw_codegen *p, brw_inst *inst) +{ + if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) { + p->loop_stack_array_size *= 2; + p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int, + p->loop_stack_array_size); + p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int, + p->loop_stack_array_size); + } + + p->loop_stack[p->loop_stack_depth] = inst - p->store; + p->loop_stack_depth++; + p->if_depth_in_loop[p->loop_stack_depth] = 0; +} + +static brw_inst * +get_inner_do_insn(struct brw_codegen *p) +{ + return &p->store[p->loop_stack[p->loop_stack_depth - 1]]; +} + +/* EU takes the value from the flag register and pushes it onto some + * sort of a stack (presumably merging with any flag value already on + * the stack). Within an if block, the flags at the top of the stack + * control execution on each channel of the unit, eg. on each of the + * 16 pixel values in our wm programs. + * + * When the matching 'else' instruction is reached (presumably by + * countdown of the instruction count patched in by our ELSE/ENDIF + * functions), the relevant flags are inverted. + * + * When the matching 'endif' instruction is reached, the flags are + * popped off. If the stack is now empty, normal execution resumes. 
+ */ +brw_inst * +brw_IF(struct brw_codegen *p, unsigned execute_size) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + insn = next_insn(p, BRW_OPCODE_IF); + + /* Override the defaults for this instruction: + */ + if (devinfo->ver < 6) { + brw_set_dest(p, insn, brw_ip_reg()); + brw_set_src0(p, insn, brw_ip_reg()); + brw_set_src1(p, insn, brw_imm_d(0x0)); + } else if (devinfo->ver == 6) { + brw_set_dest(p, insn, brw_imm_w(0)); + brw_inst_set_gfx6_jump_count(devinfo, insn, 0); + brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); + brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); + } else if (devinfo->ver == 7) { + brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); + brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); + brw_set_src1(p, insn, brw_imm_w(0)); + brw_inst_set_jip(devinfo, insn, 0); + brw_inst_set_uip(devinfo, insn, 0); + } else { + brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); + if (devinfo->ver < 12) + brw_set_src0(p, insn, brw_imm_d(0)); + brw_inst_set_jip(devinfo, insn, 0); + brw_inst_set_uip(devinfo, insn, 0); + } + + brw_inst_set_exec_size(devinfo, insn, execute_size); + brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); + brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE); + if (!p->single_program_flow && devinfo->ver < 6) + brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH); + + push_if_stack(p, insn); + p->if_depth_in_loop[p->loop_stack_depth]++; + return insn; +} + +/* This function is only used for gfx6-style IF instructions with an + * embedded comparison (conditional modifier). It is not used on gfx7. 
+ */ +brw_inst * +gfx6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional, + struct brw_reg src0, struct brw_reg src1) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + insn = next_insn(p, BRW_OPCODE_IF); + + brw_set_dest(p, insn, brw_imm_w(0)); + brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p)); + brw_inst_set_gfx6_jump_count(devinfo, insn, 0); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, src1); + + assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE); + assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE); + brw_inst_set_cond_modifier(devinfo, insn, conditional); + + push_if_stack(p, insn); + return insn; +} + +/** + * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs. + */ +static void +convert_IF_ELSE_to_ADD(struct brw_codegen *p, + brw_inst *if_inst, brw_inst *else_inst) +{ + const struct intel_device_info *devinfo = p->devinfo; + + /* The next instruction (where the ENDIF would be, if it existed) */ + brw_inst *next_inst = &p->store[p->nr_insn]; + + assert(p->single_program_flow); + assert(if_inst != NULL && brw_inst_opcode(p->isa, if_inst) == BRW_OPCODE_IF); + assert(else_inst == NULL || brw_inst_opcode(p->isa, else_inst) == BRW_OPCODE_ELSE); + assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1); + + /* Convert IF to an ADD instruction that moves the instruction pointer + * to the first instruction of the ELSE block. If there is no ELSE + * block, point to where ENDIF would be. Reverse the predicate. + * + * There's no need to execute an ENDIF since we don't need to do any + * stack operations, and if we're currently executing, we just want to + * continue normally. + */ + brw_inst_set_opcode(p->isa, if_inst, BRW_OPCODE_ADD); + brw_inst_set_pred_inv(devinfo, if_inst, true); + + if (else_inst != NULL) { + /* Convert ELSE to an ADD instruction that points where the ENDIF + * would be. 
+ */ + brw_inst_set_opcode(p->isa, else_inst, BRW_OPCODE_ADD); + + brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16); + brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16); + } else { + brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16); + } +} + +/** + * Patch IF and ELSE instructions with appropriate jump targets. + */ +static void +patch_IF_ELSE(struct brw_codegen *p, + brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst) +{ + const struct intel_device_info *devinfo = p->devinfo; + + /* We shouldn't be patching IF and ELSE instructions in single program flow + * mode when gen < 6, because in single program flow mode on those + * platforms, we convert flow control instructions to conditional ADDs that + * operate on IP (see brw_ENDIF). + * + * However, on Gfx6, writing to IP doesn't work in single program flow mode + * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may + * not be updated by non-flow control instructions."). And on later + * platforms, there is no significant benefit to converting control flow + * instructions to conditional ADDs. So we do patch IF and ELSE + * instructions in single program flow mode on those platforms. + */ + if (devinfo->ver < 6) + assert(!p->single_program_flow); + + assert(if_inst != NULL && brw_inst_opcode(p->isa, if_inst) == BRW_OPCODE_IF); + assert(endif_inst != NULL); + assert(else_inst == NULL || brw_inst_opcode(p->isa, else_inst) == BRW_OPCODE_ELSE); + + unsigned br = brw_jump_scale(devinfo); + + assert(brw_inst_opcode(p->isa, endif_inst) == BRW_OPCODE_ENDIF); + brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst)); + + if (else_inst == NULL) { + /* Patch IF -> ENDIF */ + if (devinfo->ver < 6) { + /* Turn it into an IFF, which means no mask stack operations for + * all-false and jumping past the ENDIF. 
+ */ + brw_inst_set_opcode(p->isa, if_inst, BRW_OPCODE_IFF); + brw_inst_set_gfx4_jump_count(devinfo, if_inst, + br * (endif_inst - if_inst + 1)); + brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0); + } else if (devinfo->ver == 6) { + /* As of gfx6, there is no IFF and IF must point to the ENDIF. */ + brw_inst_set_gfx6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst)); + } else { + brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst)); + brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst)); + } + } else { + brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst)); + + /* Patch IF -> ELSE */ + if (devinfo->ver < 6) { + brw_inst_set_gfx4_jump_count(devinfo, if_inst, + br * (else_inst - if_inst)); + brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0); + } else if (devinfo->ver == 6) { + brw_inst_set_gfx6_jump_count(devinfo, if_inst, + br * (else_inst - if_inst + 1)); + } + + /* Patch ELSE -> ENDIF */ + if (devinfo->ver < 6) { + /* BRW_OPCODE_ELSE pre-gfx6 should point just past the + * matching ENDIF. + */ + brw_inst_set_gfx4_jump_count(devinfo, else_inst, + br * (endif_inst - else_inst + 1)); + brw_inst_set_gfx4_pop_count(devinfo, else_inst, 1); + } else if (devinfo->ver == 6) { + /* BRW_OPCODE_ELSE on gfx6 should point to the matching ENDIF. 
*/ + brw_inst_set_gfx6_jump_count(devinfo, else_inst, + br * (endif_inst - else_inst)); + } else { + /* The IF instruction's JIP should point just past the ELSE */ + brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1)); + /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */ + brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst)); + + if (devinfo->ver >= 8 && devinfo->ver < 11) { + /* Set the ELSE instruction to use branch_ctrl with a join + * jump target pointing at the NOP inserted right before + * the ENDIF instruction in order to make sure it is + * executed in all cases, since attempting to do the same + * as on other generations could cause the EU to jump at + * the instruction immediately after the ENDIF due to + * Wa_220160235, which could cause the program to continue + * running with all channels disabled. + */ + brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst - 1)); + brw_inst_set_branch_control(devinfo, else_inst, true); + } else { + brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst)); + } + + if (devinfo->ver >= 8) { + /* Since we don't set branch_ctrl on Gfx11+, the ELSE's + * JIP and UIP both should point to ENDIF on those + * platforms. 
+ */ + brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst)); + } + } + } +} + +void +brw_ELSE(struct brw_codegen *p) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + insn = next_insn(p, BRW_OPCODE_ELSE); + + if (devinfo->ver < 6) { + brw_set_dest(p, insn, brw_ip_reg()); + brw_set_src0(p, insn, brw_ip_reg()); + brw_set_src1(p, insn, brw_imm_d(0x0)); + } else if (devinfo->ver == 6) { + brw_set_dest(p, insn, brw_imm_w(0)); + brw_inst_set_gfx6_jump_count(devinfo, insn, 0); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + } else if (devinfo->ver == 7) { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, brw_imm_w(0)); + brw_inst_set_jip(devinfo, insn, 0); + brw_inst_set_uip(devinfo, insn, 0); + } else { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + if (devinfo->ver < 12) + brw_set_src0(p, insn, brw_imm_d(0)); + brw_inst_set_jip(devinfo, insn, 0); + brw_inst_set_uip(devinfo, insn, 0); + } + + brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE); + if (!p->single_program_flow && devinfo->ver < 6) + brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH); + + push_if_stack(p, insn); +} + +void +brw_ENDIF(struct brw_codegen *p) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn = NULL; + brw_inst *else_inst = NULL; + brw_inst *if_inst = NULL; + brw_inst *tmp; + bool emit_endif = true; + + assert(p->if_stack_depth > 0); + + if (devinfo->ver >= 8 && devinfo->ver < 11 && + brw_inst_opcode(p->isa, &p->store[p->if_stack[ + p->if_stack_depth - 1]]) == BRW_OPCODE_ELSE) { + /* Insert a NOP to be specified as join instruction within the + * ELSE block, which is valid for an ELSE instruction with + * 
branch_ctrl on. The ELSE instruction will be set to jump + * here instead of to the ENDIF instruction, since attempting to + * do the latter would prevent the ENDIF from being executed in + * some cases due to Wa_220160235, which could cause the program + * to continue running with all channels disabled. + */ + brw_NOP(p); + } + + /* In single program flow mode, we can express IF and ELSE instructions + * equivalently as ADD instructions that operate on IP. On platforms prior + * to Gfx6, flow control instructions cause an implied thread switch, so + * this is a significant savings. + * + * However, on Gfx6, writing to IP doesn't work in single program flow mode + * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may + * not be updated by non-flow control instructions."). And on later + * platforms, there is no significant benefit to converting control flow + * instructions to conditional ADDs. So we only do this trick on Gfx4 and + * Gfx5. + */ + if (devinfo->ver < 6 && p->single_program_flow) + emit_endif = false; + + /* + * A single next_insn() may change the base address of instruction store + * memory(p->store), so call it first before referencing the instruction + * store pointer from an index + */ + if (emit_endif) + insn = next_insn(p, BRW_OPCODE_ENDIF); + + /* Pop the IF and (optional) ELSE instructions from the stack */ + p->if_depth_in_loop[p->loop_stack_depth]--; + tmp = pop_if_stack(p); + if (brw_inst_opcode(p->isa, tmp) == BRW_OPCODE_ELSE) { + else_inst = tmp; + tmp = pop_if_stack(p); + } + if_inst = tmp; + + if (!emit_endif) { + /* ENDIF is useless; don't bother emitting it. 
*/ + convert_IF_ELSE_to_ADD(p, if_inst, else_inst); + return; + } + + if (devinfo->ver < 6) { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, brw_imm_d(0x0)); + } else if (devinfo->ver == 6) { + brw_set_dest(p, insn, brw_imm_w(0)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + } else if (devinfo->ver == 7) { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, brw_imm_w(0)); + } else { + brw_set_src0(p, insn, brw_imm_d(0)); + } + + brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE); + if (devinfo->ver < 6) + brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH); + + /* Also pop item off the stack in the endif instruction: */ + if (devinfo->ver < 6) { + brw_inst_set_gfx4_jump_count(devinfo, insn, 0); + brw_inst_set_gfx4_pop_count(devinfo, insn, 1); + } else if (devinfo->ver == 6) { + brw_inst_set_gfx6_jump_count(devinfo, insn, 2); + } else { + brw_inst_set_jip(devinfo, insn, 2); + } + patch_IF_ELSE(p, if_inst, else_inst, insn); +} + +brw_inst * +brw_BREAK(struct brw_codegen *p) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + insn = next_insn(p, BRW_OPCODE_BREAK); + if (devinfo->ver >= 8) { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src0(p, insn, brw_imm_d(0x0)); + } else if (devinfo->ver >= 6) { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, brw_imm_d(0x0)); + } else { + brw_set_dest(p, insn, brw_ip_reg()); + brw_set_src0(p, insn, brw_ip_reg()); + brw_set_src1(p, insn, brw_imm_d(0x0)); + 
brw_inst_set_gfx4_pop_count(devinfo, insn, + p->if_depth_in_loop[p->loop_stack_depth]); + } + brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); + brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p)); + + return insn; +} + +brw_inst * +brw_CONT(struct brw_codegen *p) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + insn = next_insn(p, BRW_OPCODE_CONTINUE); + brw_set_dest(p, insn, brw_ip_reg()); + if (devinfo->ver >= 8) { + brw_set_src0(p, insn, brw_imm_d(0x0)); + } else { + brw_set_src0(p, insn, brw_ip_reg()); + brw_set_src1(p, insn, brw_imm_d(0x0)); + } + + if (devinfo->ver < 6) { + brw_inst_set_gfx4_pop_count(devinfo, insn, + p->if_depth_in_loop[p->loop_stack_depth]); + } + brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); + brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p)); + return insn; +} + +brw_inst * +brw_HALT(struct brw_codegen *p) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + insn = next_insn(p, BRW_OPCODE_HALT); + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + if (devinfo->ver < 6) { + /* From the Gfx4 PRM: + * + * "IP register must be put (for example, by the assembler) at + * and locations. + */ + brw_set_dest(p, insn, brw_ip_reg()); + brw_set_src0(p, insn, brw_ip_reg()); + brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */ + } else if (devinfo->ver < 8) { + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */ + } else if (devinfo->ver < 12) { + brw_set_src0(p, insn, brw_imm_d(0x0)); + } + + brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); + brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p)); + return insn; +} + +/* DO/WHILE loop: + * + * The DO/WHILE is just an unterminated loop -- break or continue are + * used for control within the loop. 
We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gfx6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gfx6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (devinfo->ver >= 6 || p->single_program_flow) {
      /* No DO instruction is emitted on gfx6+ (or in single-program-flow
       * mode); just push the loop start onto the loop stack.  Note the
       * returned pointer is the store slot of the *next* instruction to
       * be emitted, not an actual DO.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}

/**
 * For pre-gfx6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gfx6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->ver < 6);

   /* Walk backwards from the WHILE to the matching DO, patching every
    * unpatched BREAK/CONTINUE in between.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
+ */ + if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_BREAK && + brw_inst_gfx4_jump_count(devinfo, inst) == 0) { + brw_inst_set_gfx4_jump_count(devinfo, inst, br*((while_inst - inst) + 1)); + } else if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_CONTINUE && + brw_inst_gfx4_jump_count(devinfo, inst) == 0) { + brw_inst_set_gfx4_jump_count(devinfo, inst, br * (while_inst - inst)); + } + } +} + +brw_inst * +brw_WHILE(struct brw_codegen *p) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn, *do_insn; + unsigned br = brw_jump_scale(devinfo); + + if (devinfo->ver >= 6) { + insn = next_insn(p, BRW_OPCODE_WHILE); + do_insn = get_inner_do_insn(p); + + if (devinfo->ver >= 8) { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + if (devinfo->ver < 12) + brw_set_src0(p, insn, brw_imm_d(0)); + brw_inst_set_jip(devinfo, insn, br * (do_insn - insn)); + } else if (devinfo->ver == 7) { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, brw_imm_w(0)); + brw_inst_set_jip(devinfo, insn, br * (do_insn - insn)); + } else { + brw_set_dest(p, insn, brw_imm_w(0)); + brw_inst_set_gfx6_jump_count(devinfo, insn, br * (do_insn - insn)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + } + + brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p)); + + } else { + if (p->single_program_flow) { + insn = next_insn(p, BRW_OPCODE_ADD); + do_insn = get_inner_do_insn(p); + + brw_set_dest(p, insn, brw_ip_reg()); + brw_set_src0(p, insn, brw_ip_reg()); + brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16)); + brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1); + } else { + insn = next_insn(p, BRW_OPCODE_WHILE); + do_insn = get_inner_do_insn(p); + + assert(brw_inst_opcode(p->isa, do_insn) == BRW_OPCODE_DO); + + brw_set_dest(p, insn, 
brw_ip_reg()); + brw_set_src0(p, insn, brw_ip_reg()); + brw_set_src1(p, insn, brw_imm_d(0)); + + brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn)); + brw_inst_set_gfx4_jump_count(devinfo, insn, br * (do_insn - insn + 1)); + brw_inst_set_gfx4_pop_count(devinfo, insn, 0); + + brw_patch_break_cont(p, insn); + } + } + brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); + + p->loop_stack_depth--; + + return insn; +} + +/* FORWARD JUMPS: + */ +void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *jmp_insn = &p->store[jmp_insn_idx]; + unsigned jmpi = 1; + + if (devinfo->ver >= 5) + jmpi = 2; + + assert(brw_inst_opcode(p->isa, jmp_insn) == BRW_OPCODE_JMPI); + assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE); + + brw_inst_set_gfx4_jump_count(devinfo, jmp_insn, + jmpi * (p->nr_insn - jmp_insn_idx - 1)); +} + +/* To integrate with the above, it makes sense that the comparison + * instruction should populate the flag register. It might be simpler + * just to use the flag reg for most WM tasks? + */ +void brw_CMP(struct brw_codegen *p, + struct brw_reg dest, + unsigned conditional, + struct brw_reg src0, + struct brw_reg src1) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn = next_insn(p, BRW_OPCODE_CMP); + + brw_inst_set_cond_modifier(devinfo, insn, conditional); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, src1); + + /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds + * page says: + * "Any CMP instruction with a null destination must use a {switch}." + * + * It also applies to other Gfx7 platforms (IVB, BYT) even though it isn't + * mentioned on their work-arounds pages. 
+ */ + if (devinfo->ver == 7) { + if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && + dest.nr == BRW_ARF_NULL) { + brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH); + } + } +} + +void brw_CMPN(struct brw_codegen *p, + struct brw_reg dest, + unsigned conditional, + struct brw_reg src0, + struct brw_reg src1) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn = next_insn(p, BRW_OPCODE_CMPN); + + brw_inst_set_cond_modifier(devinfo, insn, conditional); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, src1); + + /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA) + * says: + * + * If the destination is the null register, the {Switch} instruction + * option must be used. + * + * Page 77 of the Haswell PRM Volume 2b contains the same text. + */ + if (devinfo->ver == 7) { + if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && + dest.nr == BRW_ARF_NULL) { + brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH); + } + } +} + +/*********************************************************************** + * Helpers for the various SEND message types: + */ + +/** Extended math function, float[8]. + */ +void gfx4_math(struct brw_codegen *p, + struct brw_reg dest, + unsigned function, + unsigned msg_reg_nr, + struct brw_reg src, + unsigned precision ) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); + unsigned data_type; + if (has_scalar_region(src)) { + data_type = BRW_MATH_DATA_SCALAR; + } else { + data_type = BRW_MATH_DATA_VECTOR; + } + + assert(devinfo->ver < 6); + + /* Example code doesn't set predicate_control for send + * instructions. 
+ */ + brw_inst_set_pred_control(devinfo, insn, 0); + brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr); + + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src); + brw_set_math_message(p, + insn, + function, + src.type == BRW_REGISTER_TYPE_D, + precision, + data_type); +} + +void gfx6_math(struct brw_codegen *p, + struct brw_reg dest, + unsigned function, + struct brw_reg src0, + struct brw_reg src1) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn = next_insn(p, BRW_OPCODE_MATH); + + assert(devinfo->ver >= 6); + + assert(dest.file == BRW_GENERAL_REGISTER_FILE || + (devinfo->ver >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE)); + + assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1); + if (devinfo->ver == 6) { + assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1); + assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1); + } + + if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT || + function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER || + function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) { + assert(src0.type != BRW_REGISTER_TYPE_F); + assert(src1.type != BRW_REGISTER_TYPE_F); + assert(src1.file == BRW_GENERAL_REGISTER_FILE || + (devinfo->ver >= 8 && src1.file == BRW_IMMEDIATE_VALUE)); + /* From BSpec 6647/47428 "[Instruction] Extended Math Function": + * INT DIV function does not support source modifiers. + */ + assert(!src0.negate); + assert(!src0.abs); + assert(!src1.negate); + assert(!src1.abs); + } else { + assert(src0.type == BRW_REGISTER_TYPE_F || + (src0.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9)); + assert(src1.type == BRW_REGISTER_TYPE_F || + (src1.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9)); + } + + /* Source modifiers are ignored for extended math instructions on Gfx6. 
 */
   if (devinfo->ver == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}

/**
 * Return the right surface index to access the thread scratch space using
 * stateless dataport messages.
 */
unsigned
brw_scratch_surface_idx(const struct brw_codegen *p)
{
   /* The scratch space is thread-local so IA coherency is unnecessary. */
   if (p->devinfo->ver >= 8)
      return GFX8_BTI_STATELESS_NON_COHERENT;
   else
      return BRW_BTI_STATELESS;
}

/**
 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
       devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);
   uint32_t msg_type;

   /* gfx6+ takes the offset in owords rather than bytes. */
   if (devinfo->ver >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One header register plus the payload registers. */
   const unsigned mlen = 1 + num_regs;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
+ */ + { + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + + brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + /* set message header global offset field (reg 0, element 2) */ + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_swsb(p, tgl_swsb_null()); + brw_MOV(p, + retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, + mrf.nr, + 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(offset)); + + brw_pop_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + } + + { + struct brw_reg dest; + brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); + int send_commit_msg; + struct brw_reg src_header = retype(brw_vec8_grf(0, 0), + BRW_REGISTER_TYPE_UW); + + brw_inst_set_sfid(devinfo, insn, target_cache); + brw_inst_set_compression(devinfo, insn, false); + + if (brw_inst_exec_size(devinfo, insn) >= 16) + src_header = vec16(src_header); + + assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE); + if (devinfo->ver < 6) + brw_inst_set_base_mrf(devinfo, insn, mrf.nr); + + /* Until gfx6, writes followed by reads from the same location + * are not guaranteed to be ordered unless write_commit is set. + * If set, then a no-op write is issued to the destination + * register to set a dependency, and a read from the destination + * can be used to ensure the ordering. + * + * For gfx6, only writes between different threads need ordering + * protection. Our use of DP writes is all about register + * spilling within a thread. 
+ */ + if (devinfo->ver >= 6) { + dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW); + send_commit_msg = 0; + } else { + dest = src_header; + send_commit_msg = 1; + } + + brw_set_dest(p, insn, dest); + if (devinfo->ver >= 6) { + brw_set_src0(p, insn, mrf); + } else { + brw_set_src0(p, insn, brw_null_reg()); + } + + if (devinfo->ver >= 6) + msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE; + else + msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE; + + brw_set_desc(p, insn, + brw_message_desc(devinfo, mlen, send_commit_msg, true) | + brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p), + BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8), + msg_type, send_commit_msg)); + } +} + + +/** + * Read a block of owords (half a GRF each) from the scratch buffer + * using a constant index per channel. + * + * Offset must be aligned to oword size (16 bytes). Used for register + * spilling. + */ +void +brw_oword_block_read_scratch(struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg mrf, + int num_regs, + unsigned offset) +{ + const struct intel_device_info *devinfo = p->devinfo; + const struct tgl_swsb swsb = brw_get_default_swsb(p); + + if (devinfo->ver >= 6) + offset /= 16; + + if (p->devinfo->ver >= 7) { + /* On gen 7 and above, we no longer have message registers and we can + * send from any register we want. By using the destination register + * for the message, we guarantee that the implied message write won't + * accidentally overwrite anything. This has been a problem because + * the MRF registers and source for the final FB write are both fixed + * and may overlap. + */ + mrf = retype(dest, BRW_REGISTER_TYPE_UD); + } else { + mrf = retype(mrf, BRW_REGISTER_TYPE_UD); + } + dest = retype(dest, BRW_REGISTER_TYPE_UW); + + const unsigned rlen = num_regs; + const unsigned target_cache = + (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE : + devinfo->ver >= 6 ? 
GFX6_SFID_DATAPORT_RENDER_CACHE : + BRW_SFID_DATAPORT_READ); + + { + brw_push_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + /* set message header global offset field (reg 0, element 2) */ + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_swsb(p, tgl_swsb_null()); + brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset)); + + brw_pop_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + } + + { + brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); + + brw_inst_set_sfid(devinfo, insn, target_cache); + assert(brw_inst_pred_control(devinfo, insn) == 0); + brw_inst_set_compression(devinfo, insn, false); + + brw_set_dest(p, insn, dest); /* UW? */ + if (devinfo->ver >= 6) { + brw_set_src0(p, insn, mrf); + } else { + brw_set_src0(p, insn, brw_null_reg()); + brw_inst_set_base_mrf(devinfo, insn, mrf.nr); + } + + brw_set_desc(p, insn, + brw_message_desc(devinfo, 1, rlen, true) | + brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p), + BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8), + BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, + BRW_DATAPORT_READ_TARGET_RENDER_CACHE)); + } +} + +void +gfx7_block_read_scratch(struct brw_codegen *p, + struct brw_reg dest, + int num_regs, + unsigned offset) +{ + brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); + assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE); + + brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW)); + + /* The HW requires that the header is present; this is to get the g0.5 + * scratch offset. + */ + brw_set_src0(p, insn, brw_vec8_grf(0, 0)); + + /* According to the docs, offset is "A 12-bit HWord offset into the memory + * Immediate Memory buffer as specified by binding table 0xFF." 
An HWORD + * is 32 bytes, which happens to be the size of a register. + */ + offset /= REG_SIZE; + assert(offset < (1 << 12)); + + gfx7_set_dp_scratch_message(p, insn, + false, /* scratch read */ + false, /* OWords */ + false, /* invalidate after read */ + num_regs, + offset, + 1, /* mlen: just g0 */ + num_regs, /* rlen */ + true); /* header present */ +} + +/** + * Read float[4] vectors from the data port constant cache. + * Location (in buffer) should be a multiple of 16. + * Used for fetching shader constants. + */ +void brw_oword_block_read(struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg mrf, + uint32_t offset, + uint32_t bind_table_index) +{ + const struct intel_device_info *devinfo = p->devinfo; + const unsigned target_cache = + (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_CONSTANT_CACHE : + BRW_SFID_DATAPORT_READ); + const unsigned exec_size = 1 << brw_get_default_exec_size(p); + const struct tgl_swsb swsb = brw_get_default_swsb(p); + + /* On newer hardware, offset is in units of owords. 
*/ + if (devinfo->ver >= 6) + offset /= 16; + + mrf = retype(mrf, BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + /* set message header global offset field (reg 0, element 2) */ + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_swsb(p, tgl_swsb_null()); + brw_MOV(p, + retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, + mrf.nr, + 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(offset)); + brw_pop_insn_state(p); + + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + + brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); + + brw_inst_set_sfid(devinfo, insn, target_cache); + + /* cast dest to a uword[8] vector */ + dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW); + + brw_set_dest(p, insn, dest); + if (devinfo->ver >= 6) { + brw_set_src0(p, insn, mrf); + } else { + brw_set_src0(p, insn, brw_null_reg()); + brw_inst_set_base_mrf(devinfo, insn, mrf.nr); + } + + brw_set_desc(p, insn, + brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) | + brw_dp_read_desc(devinfo, bind_table_index, + BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size), + BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, + BRW_DATAPORT_READ_TARGET_DATA_CACHE)); + + brw_pop_insn_state(p); +} + +brw_inst * +brw_fb_WRITE(struct brw_codegen *p, + struct brw_reg payload, + struct brw_reg implied_header, + unsigned msg_control, + unsigned binding_table_index, + unsigned msg_length, + unsigned response_length, + bool eot, + bool last_render_target, + bool header_present) +{ + const struct intel_device_info *devinfo = p->devinfo; + const unsigned target_cache = + (devinfo->ver >= 6 ? 
GFX6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;
   struct brw_reg dest, src0;

   if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->ver >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->ver >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;
   } else {
      /* Pre-gfx6 the payload lives in the MRF and the implied header is
       * sent as src0.
       */
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_fb_write_desc(devinfo, binding_table_index, msg_control,
                                  last_render_target,
                                  false /* coarse_write */));
   brw_inst_set_eot(devinfo, insn, eot);

   return insn;
}

/**
 * Emit a Gfx9+ render-target read message (SENDC to the render cache).
 * The response lands in \p dst; the RT slot group is derived from the
 * current default quarter-control group.
 */
brw_inst *
gfx9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(devinfo->ver >= 9);
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_inst_set_sfid(devinfo, insn, GFX6_SFID_DATAPORT_RENDER_CACHE);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_desc(
      p, insn,
      brw_message_desc(devinfo, msg_length, response_length, true) |
      brw_fb_read_desc(devinfo, binding_table_index, 0 /* msg_control */,
                       1 << brw_get_default_exec_size(p), per_sample));
   brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);

   return insn;
}

/**
 * Texture sample instruction.
+ * Note: the msg_type plus msg_length values determine exactly what kind + * of sampling operation is performed. See volume 4, page 161 of docs. + */ +void brw_SAMPLE(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + unsigned sampler, + unsigned msg_type, + unsigned response_length, + unsigned msg_length, + unsigned header_present, + unsigned simd_mode, + unsigned return_format) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + if (msg_reg_nr != -1) + gfx6_resolve_implied_move(p, &src0, msg_reg_nr); + + insn = next_insn(p, BRW_OPCODE_SEND); + brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER); + brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */ + + /* From the 965 PRM (volume 4, part 1, section 14.2.41): + * + * "Instruction compression is not allowed for this instruction (that + * is, send). The hardware behavior is undefined if this instruction is + * set as compressed. However, compress control can be set to "SecHalf" + * to affect the EMask generation." + * + * No similar wording is found in later PRMs, but there are examples + * utilizing send with SecHalf. More importantly, SIMD8 sampler messages + * are allowed in SIMD16 mode and they could not work without SecHalf. For + * these reasons, we allow BRW_COMPRESSION_2NDHALF here. + */ + brw_inst_set_compression(devinfo, insn, false); + + if (devinfo->ver < 6) + brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr); + + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_desc(p, insn, + brw_message_desc(devinfo, msg_length, response_length, + header_present) | + brw_sampler_desc(devinfo, binding_table_index, sampler, + msg_type, simd_mode, return_format)); +} + +/* Adjust the message header's sampler state pointer to + * select the correct group of 16 samplers. 
+ */ +void brw_adjust_sampler_state_pointer(struct brw_codegen *p, + struct brw_reg header, + struct brw_reg sampler_index) +{ + /* The "Sampler Index" field can only store values between 0 and 15. + * However, we can add an offset to the "Sampler State Pointer" + * field, effectively selecting a different set of 16 samplers. + * + * The "Sampler State Pointer" needs to be aligned to a 32-byte + * offset, and each sampler state is only 16-bytes, so we can't + * exclusively use the offset - we have to use both. + */ + + const struct intel_device_info *devinfo = p->devinfo; + + if (sampler_index.file == BRW_IMMEDIATE_VALUE) { + const int sampler_state_size = 16; /* 16 bytes */ + uint32_t sampler = sampler_index.ud; + + if (sampler >= 16) { + assert(devinfo->verx10 >= 75); + brw_ADD(p, + get_element_ud(header, 3), + get_element_ud(brw_vec8_grf(0, 0), 3), + brw_imm_ud(16 * (sampler / 16) * sampler_state_size)); + } + } else { + /* Non-const sampler array indexing case */ + if (devinfo->verx10 <= 70) { + return; + } + + struct brw_reg temp = get_element_ud(header, 3); + + brw_push_insn_state(p); + brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0)); + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + brw_SHL(p, temp, temp, brw_imm_ud(4)); + brw_ADD(p, + get_element_ud(header, 3), + get_element_ud(brw_vec8_grf(0, 0), 3), + temp); + brw_pop_insn_state(p); + } +} + +/* All these variables are pretty confusing - we might be better off + * using bitmasks and macros for this, in the old style. Or perhaps + * just having the caller instantiate the fields in dword3 itself. 
+ */ +void brw_urb_WRITE(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + enum brw_urb_write_flags flags, + unsigned msg_length, + unsigned response_length, + unsigned offset, + unsigned swizzle) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + gfx6_resolve_implied_move(p, &src0, msg_reg_nr); + + if (devinfo->ver >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) { + /* Enable Channel Masks in the URB_WRITE_HWORD message header */ + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5), + BRW_REGISTER_TYPE_UD), + retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0xff00)); + brw_pop_insn_state(p); + } + + insn = next_insn(p, BRW_OPCODE_SEND); + + assert(msg_length < BRW_MAX_MRF(devinfo->ver)); + + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, brw_imm_d(0)); + + if (devinfo->ver < 6) + brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr); + + brw_set_urb_message(p, + insn, + flags, + msg_length, + response_length, + offset, + swizzle); +} + +void +brw_send_indirect_message(struct brw_codegen *p, + unsigned sfid, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg desc, + unsigned desc_imm, + bool eot) +{ + const struct intel_device_info *devinfo = p->devinfo; + struct brw_inst *send; + + dst = retype(dst, BRW_REGISTER_TYPE_UW); + + assert(desc.type == BRW_REGISTER_TYPE_UD); + + if (desc.file == BRW_IMMEDIATE_VALUE) { + send = next_insn(p, BRW_OPCODE_SEND); + brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); + brw_set_desc(p, send, desc.ud | desc_imm); + } else { + const struct tgl_swsb swsb = brw_get_default_swsb(p); + struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + 
brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + + /* Load the indirect descriptor to an address register using OR so the + * caller can specify additional descriptor bits with the desc_imm + * immediate. + */ + brw_OR(p, addr, desc, brw_imm_ud(desc_imm)); + + brw_pop_insn_state(p); + + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + send = next_insn(p, BRW_OPCODE_SEND); + brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); + + if (devinfo->ver >= 12) + brw_inst_set_send_sel_reg32_desc(devinfo, send, true); + else + brw_set_src1(p, send, addr); + } + + brw_set_dest(p, send, dst); + brw_inst_set_sfid(devinfo, send, sfid); + brw_inst_set_eot(devinfo, send, eot); +} + +void +brw_send_indirect_split_message(struct brw_codegen *p, + unsigned sfid, + struct brw_reg dst, + struct brw_reg payload0, + struct brw_reg payload1, + struct brw_reg desc, + unsigned desc_imm, + struct brw_reg ex_desc, + unsigned ex_desc_imm, + bool ex_desc_scratch, + bool ex_bso, + bool eot) +{ + const struct intel_device_info *devinfo = p->devinfo; + struct brw_inst *send; + + dst = retype(dst, BRW_REGISTER_TYPE_UW); + + assert(desc.type == BRW_REGISTER_TYPE_UD); + + if (desc.file == BRW_IMMEDIATE_VALUE) { + desc.ud |= desc_imm; + } else { + const struct tgl_swsb swsb = brw_get_default_swsb(p); + struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + + /* Load the indirect descriptor to an address 
register using OR so the + * caller can specify additional descriptor bits with the desc_imm + * immediate. + */ + brw_OR(p, addr, desc, brw_imm_ud(desc_imm)); + + brw_pop_insn_state(p); + desc = addr; + + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + } + + if (ex_desc.file == BRW_IMMEDIATE_VALUE && + !ex_desc_scratch && + (devinfo->ver >= 12 || + ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) { + /* ATS-M PRMs, Volume 2d: Command Reference: Structures, + * EU_INSTRUCTION_SEND instruction + * + * "ExBSO: Exists If: ([ExDesc.IsReg]==true)" + */ + assert(!ex_bso); + ex_desc.ud |= ex_desc_imm; + } else { + const struct tgl_swsb swsb = brw_get_default_swsb(p); + struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + + /* Load the indirect extended descriptor to an address register using OR + * so the caller can specify additional descriptor bits with the + * desc_imm immediate. + * + * Even though the instruction dispatcher always pulls the SFID and EOT + * fields from the instruction itself, actual external unit which + * processes the message gets the SFID and EOT from the extended + * descriptor which comes from the address register. If we don't OR + * those two bits in, the external unit may get confused and hang. + */ + unsigned imm_part = ex_bso ? 0 : (ex_desc_imm | sfid | eot << 5); + + if (ex_desc_scratch) { + /* Or the scratch surface offset together with the immediate part of + * the extended descriptor. 
+ */ + assert(devinfo->verx10 >= 125); + brw_AND(p, addr, + retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(31, 10))); + brw_OR(p, addr, addr, brw_imm_ud(imm_part)); + } else if (ex_desc.file == BRW_IMMEDIATE_VALUE) { + /* ex_desc bits 15:12 don't exist in the instruction encoding prior + * to Gfx12, so we may have fallen back to an indirect extended + * descriptor. + */ + brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part)); + } else { + brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part)); + } + + brw_pop_insn_state(p); + ex_desc = addr; + + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + } + + send = next_insn(p, devinfo->ver >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS); + brw_set_dest(p, send, dst); + brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD)); + brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD)); + + if (desc.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_send_sel_reg32_desc(devinfo, send, 0); + brw_inst_set_send_desc(devinfo, send, desc.ud); + } else { + assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE); + assert(desc.nr == BRW_ARF_ADDRESS); + assert(desc.subnr == 0); + brw_inst_set_send_sel_reg32_desc(devinfo, send, 1); + } + + if (ex_desc.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0); + brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud); + } else { + assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE); + assert(ex_desc.nr == BRW_ARF_ADDRESS); + assert((ex_desc.subnr & 0x3) == 0); + brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1); + brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, phys_subnr(devinfo, ex_desc) >> 2); + } + + if (ex_bso) { + /* The send instruction ExBSO field does not exist with UGM on Gfx20+, + * it is assumed. 
+ * + * BSpec 56890 + */ + if (devinfo->ver < 20 || sfid != GFX12_SFID_UGM) + brw_inst_set_send_ex_bso(devinfo, send, true); + brw_inst_set_send_src1_len(devinfo, send, GET_BITS(ex_desc_imm, 10, 6)); + } + brw_inst_set_sfid(devinfo, send, sfid); + brw_inst_set_eot(devinfo, send, eot); +} + +static void +brw_send_indirect_surface_message(struct brw_codegen *p, + unsigned sfid, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned desc_imm) +{ + if (surface.file != BRW_IMMEDIATE_VALUE) { + const struct tgl_swsb swsb = brw_get_default_swsb(p); + struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + + /* Mask out invalid bits from the surface index to avoid hangs e.g. when + * some surface array is accessed out of bounds. + */ + brw_AND(p, addr, + suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)), + BRW_GET_SWZ(surface.swizzle, 0)), + brw_imm_ud(0xff)); + + brw_pop_insn_state(p); + + surface = addr; + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + } + + brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false); +} + +static bool +while_jumps_before_offset(const struct intel_device_info *devinfo, + brw_inst *insn, int while_offset, int start_offset) +{ + int scale = 16 / brw_jump_scale(devinfo); + int jip = devinfo->ver == 6 ? 
brw_inst_gfx6_jump_count(devinfo, insn) + : brw_inst_jip(devinfo, insn); + assert(jip < 0); + return while_offset + jip * scale <= start_offset; +} + + +static int +brw_find_next_block_end(struct brw_codegen *p, int start_offset) +{ + int offset; + void *store = p->store; + const struct intel_device_info *devinfo = p->devinfo; + + int depth = 0; + + for (offset = next_offset(devinfo, store, start_offset); + offset < p->next_insn_offset; + offset = next_offset(devinfo, store, offset)) { + brw_inst *insn = store + offset; + + switch (brw_inst_opcode(p->isa, insn)) { + case BRW_OPCODE_IF: + depth++; + break; + case BRW_OPCODE_ENDIF: + if (depth == 0) + return offset; + depth--; + break; + case BRW_OPCODE_WHILE: + /* If the while doesn't jump before our instruction, it's the end + * of a sibling do...while loop. Ignore it. + */ + if (!while_jumps_before_offset(devinfo, insn, offset, start_offset)) + continue; + FALLTHROUGH; + case BRW_OPCODE_ELSE: + case BRW_OPCODE_HALT: + if (depth == 0) + return offset; + break; + default: + break; + } + } + + return 0; +} + +/* There is no DO instruction on gfx6, so to find the end of the loop + * we have to see if the loop is jumping back before our start + * instruction. + */ +static int +brw_find_loop_end(struct brw_codegen *p, int start_offset) +{ + const struct intel_device_info *devinfo = p->devinfo; + int offset; + void *store = p->store; + + assert(devinfo->ver >= 6); + + /* Always start after the instruction (such as a WHILE) we're trying to fix + * up. 
+ */ + for (offset = next_offset(devinfo, store, start_offset); + offset < p->next_insn_offset; + offset = next_offset(devinfo, store, offset)) { + brw_inst *insn = store + offset; + + if (brw_inst_opcode(p->isa, insn) == BRW_OPCODE_WHILE) { + if (while_jumps_before_offset(devinfo, insn, offset, start_offset)) + return offset; + } + } + assert(!"not reached"); + return start_offset; +} + +/* After program generation, go back and update the UIP and JIP of + * BREAK, CONT, and HALT instructions to their correct locations. + */ +void +brw_set_uip_jip(struct brw_codegen *p, int start_offset) +{ + const struct intel_device_info *devinfo = p->devinfo; + int offset; + int br = brw_jump_scale(devinfo); + int scale = 16 / br; + void *store = p->store; + + if (devinfo->ver < 6) + return; + + for (offset = start_offset; offset < p->next_insn_offset; offset += 16) { + brw_inst *insn = store + offset; + assert(brw_inst_cmpt_control(devinfo, insn) == 0); + + switch (brw_inst_opcode(p->isa, insn)) { + case BRW_OPCODE_BREAK: { + int block_end_offset = brw_find_next_block_end(p, offset); + assert(block_end_offset != 0); + brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale); + /* Gfx7 UIP points to WHILE; Gfx6 points just after it */ + brw_inst_set_uip(devinfo, insn, + (brw_find_loop_end(p, offset) - offset + + (devinfo->ver == 6 ? 16 : 0)) / scale); + break; + } + + case BRW_OPCODE_CONTINUE: { + int block_end_offset = brw_find_next_block_end(p, offset); + assert(block_end_offset != 0); + brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale); + brw_inst_set_uip(devinfo, insn, + (brw_find_loop_end(p, offset) - offset) / scale); + + assert(brw_inst_uip(devinfo, insn) != 0); + assert(brw_inst_jip(devinfo, insn) != 0); + break; + } + + case BRW_OPCODE_ENDIF: { + int block_end_offset = brw_find_next_block_end(p, offset); + int32_t jump = (block_end_offset == 0) ? 
+ 1 * br : (block_end_offset - offset) / scale; + if (devinfo->ver >= 7) + brw_inst_set_jip(devinfo, insn, jump); + else + brw_inst_set_gfx6_jump_count(devinfo, insn, jump); + break; + } + + case BRW_OPCODE_HALT: { + /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19): + * + * "In case of the halt instruction not inside any conditional + * code block, the value of and should be the + * same. In case of the halt instruction inside conditional code + * block, the should be the end of the program, and the + * should be end of the most inner conditional code block." + * + * The uip will have already been set by whoever set up the + * instruction. + */ + int block_end_offset = brw_find_next_block_end(p, offset); + if (block_end_offset == 0) { + brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn)); + } else { + brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale); + } + assert(brw_inst_uip(devinfo, insn) != 0); + assert(brw_inst_jip(devinfo, insn) != 0); + break; + } + + default: + break; + } + } +} + +void brw_ff_sync(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + bool allocate, + unsigned response_length, + bool eot) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + gfx6_resolve_implied_move(p, &src0, msg_reg_nr); + + insn = next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, brw_imm_d(0)); + + if (devinfo->ver < 6) + brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr); + + brw_set_ff_sync_message(p, + insn, + allocate, + response_length, + eot); +} + +/** + * Emit the SEND instruction necessary to generate stream output data on Gfx6 + * (for transform feedback). + * + * If send_commit_msg is true, this is the last piece of stream output data + * from this thread, so send the data as a committed write. 
According to the + * Sandy Bridge PRM (volume 2 part 1, section 4.5.1): + * + * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all + * writes are complete by sending the final write as a committed write." + */ +void +brw_svb_write(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + bool send_commit_msg) +{ + const struct intel_device_info *devinfo = p->devinfo; + assert(devinfo->ver == 6); + const unsigned target_cache = GFX6_SFID_DATAPORT_RENDER_CACHE; + brw_inst *insn; + + gfx6_resolve_implied_move(p, &src0, msg_reg_nr); + + insn = next_insn(p, BRW_OPCODE_SEND); + brw_inst_set_sfid(devinfo, insn, target_cache); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_desc(p, insn, + brw_message_desc(devinfo, 1, send_commit_msg, true) | + brw_dp_write_desc(devinfo, binding_table_index, + 0, /* msg_control: ignored */ + GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE, + send_commit_msg)); /* send_commit_msg */ +} + +static unsigned +brw_surface_payload_size(unsigned num_channels, + unsigned exec_size /**< 0 for SIMD4x2 */) +{ + if (exec_size == 0) + return 1; /* SIMD4x2 */ + else if (exec_size <= 8) + return num_channels; + else + return 2 * num_channels; +} + +void +brw_untyped_atomic(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned atomic_op, + unsigned msg_length, + bool response_expected, + bool header_present) +{ + const struct intel_device_info *devinfo = p->devinfo; + const unsigned sfid = (devinfo->verx10 >= 75 ? + HSW_SFID_DATAPORT_DATA_CACHE_1 : + GFX7_SFID_DATAPORT_DATA_CACHE); + const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1; + /* SIMD4x2 untyped atomic instructions only exist on HSW+ */ + const bool has_simd4x2 = devinfo->verx10 >= 75; + const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : + has_simd4x2 ? 
0 : 8; + const unsigned response_length = + brw_surface_payload_size(response_expected, exec_size); + const unsigned desc = + brw_message_desc(devinfo, msg_length, response_length, header_present) | + brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op, + response_expected); + /* Mask out unused components -- This is especially important in Align16 + * mode on generations that don't have native support for SIMD4x2 atomics, + * because unused but enabled components will cause the dataport to perform + * additional atomic operations on the addresses that happen to be in the + * uninitialized Y, Z and W coordinates of the payload. + */ + const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X; + + brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask), + payload, surface, desc); +} + +void +brw_untyped_surface_read(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels) +{ + const struct intel_device_info *devinfo = p->devinfo; + const unsigned sfid = (devinfo->verx10 >= 75 ? + HSW_SFID_DATAPORT_DATA_CACHE_1 : + GFX7_SFID_DATAPORT_DATA_CACHE); + const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1; + const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0; + const unsigned response_length = + brw_surface_payload_size(num_channels, exec_size); + const unsigned desc = + brw_message_desc(devinfo, msg_length, response_length, false) | + brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false); + + brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc); +} + +void +brw_untyped_surface_write(struct brw_codegen *p, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels, + bool header_present) +{ + const struct intel_device_info *devinfo = p->devinfo; + const unsigned sfid = (devinfo->verx10 >= 75 ? 
+ HSW_SFID_DATAPORT_DATA_CACHE_1 : + GFX7_SFID_DATAPORT_DATA_CACHE); + const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1; + /* SIMD4x2 untyped surface write instructions only exist on HSW+ */ + const bool has_simd4x2 = devinfo->verx10 >= 75; + const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : + has_simd4x2 ? 0 : 8; + const unsigned desc = + brw_message_desc(devinfo, msg_length, 0, header_present) | + brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true); + /* Mask out unused components -- See comment in brw_untyped_atomic(). */ + const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW; + + brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask), + payload, surface, desc); +} + +static void +brw_set_memory_fence_message(struct brw_codegen *p, + struct brw_inst *insn, + enum brw_message_target sfid, + bool commit_enable, + unsigned bti) +{ + const struct intel_device_info *devinfo = p->devinfo; + + brw_set_desc(p, insn, brw_message_desc( + devinfo, 1, (commit_enable ? 1 : 0), true)); + + brw_inst_set_sfid(devinfo, insn, sfid); + + switch (sfid) { + case GFX6_SFID_DATAPORT_RENDER_CACHE: + brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_RC_MEMORY_FENCE); + break; + case GFX7_SFID_DATAPORT_DATA_CACHE: + brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_DC_MEMORY_FENCE); + break; + default: + unreachable("Not reached"); + } + + if (commit_enable) + brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5); + + assert(devinfo->ver >= 11 || bti == 0); + brw_inst_set_binding_table_index(devinfo, insn, bti); +} + +static void +gfx12_set_memory_fence_message(struct brw_codegen *p, + struct brw_inst *insn, + enum brw_message_target sfid, + uint32_t desc) +{ + const unsigned mlen = 1 * reg_unit(p->devinfo); /* g0 header */ + /* Completion signaled by write to register. No data returned. 
*/ + const unsigned rlen = 1 * reg_unit(p->devinfo); + + brw_inst_set_sfid(p->devinfo, insn, sfid); + + if (sfid == BRW_SFID_URB && p->devinfo->ver < 20) { + brw_set_desc(p, insn, brw_urb_fence_desc(p->devinfo) | + brw_message_desc(p->devinfo, mlen, rlen, true)); + } else { + enum lsc_fence_scope scope = lsc_fence_msg_desc_scope(p->devinfo, desc); + enum lsc_flush_type flush_type = lsc_fence_msg_desc_flush_type(p->devinfo, desc); + + if (sfid == GFX12_SFID_TGM) { + scope = LSC_FENCE_TILE; + flush_type = LSC_FLUSH_TYPE_EVICT; + } + + /* Wa_14012437816: + * + * "For any fence greater than local scope, always set flush type to + * at least invalidate so that fence goes on properly." + * + * "The bug is if flush_type is 'None', the scope is always downgraded + * to 'local'." + * + * Here set scope to NONE_6 instead of NONE, which has the same effect + * as NONE but avoids the downgrade to scope LOCAL. + */ + if (intel_needs_workaround(p->devinfo, 14012437816) && + scope > LSC_FENCE_LOCAL && + flush_type == LSC_FLUSH_TYPE_NONE) { + flush_type = LSC_FLUSH_TYPE_NONE_6; + } + + brw_set_desc(p, insn, lsc_fence_msg_desc(p->devinfo, scope, + flush_type, false) | + brw_message_desc(p->devinfo, mlen, rlen, false)); + } +} + +void +brw_memory_fence(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + enum opcode send_op, + enum brw_message_target sfid, + uint32_t desc, + bool commit_enable, + unsigned bti) +{ + const struct intel_device_info *devinfo = p->devinfo; + + dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW); + src = retype(vec1(src), BRW_REGISTER_TYPE_UD); + + /* Set dst as destination for dependency tracking, the MEMORY_FENCE + * message doesn't write anything back. 
+ */ + struct brw_inst *insn = next_insn(p, send_op); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); + brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1); + brw_set_dest(p, insn, dst); + brw_set_src0(p, insn, src); + + /* All DG2 hardware requires LSC for fence messages, even A-step */ + if (devinfo->has_lsc) + gfx12_set_memory_fence_message(p, insn, sfid, desc); + else + brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti); +} + +void +brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst, bool last) +{ + const struct intel_device_info *devinfo = p->devinfo; + const unsigned exec_size = 1 << brw_get_default_exec_size(p); + const unsigned qtr_control = brw_get_default_group(p) / 8; + brw_inst *inst; + + assert(devinfo->ver == 7); + + brw_push_insn_state(p); + + /* The flag register is only used on Gfx7 in align1 mode, so avoid setting + * unnecessary bits in the instruction words, get the information we need + * and reset the default flag register. This allows more instructions to be + * compacted. + */ + const unsigned flag_subreg = p->current->flag_subreg; + brw_set_default_flag_reg(p, 0, 0); + + if (brw_get_default_access_mode(p) == BRW_ALIGN_1) { + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + const struct brw_reg flag = brw_flag_subreg(flag_subreg); + + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0)); + + /* Run enough instructions returning zero with execution masking and + * a conditional modifier enabled in order to get the full execution + * mask in f1.0. We could use a single 32-wide move here if it + * weren't because of the hardware bug that causes channel enables to + * be applied incorrectly to the second half of 32-wide instructions + * on Gfx7. 
+ */ + const unsigned lower_size = MIN2(16, exec_size); + for (unsigned i = 0; i < exec_size / lower_size; i++) { + inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), + brw_imm_uw(0)); + brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE); + brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control); + brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z); + brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1); + brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2); + brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2); + } + + /* Find the first bit set in the exec_size-wide portion of the flag + * register that was updated by the last sequence of MOV + * instructions. + */ + const enum brw_reg_type type = brw_int_type(exec_size / 8, false); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + if (!last) { + inst = brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control)); + } else { + inst = brw_LZD(p, vec1(dst), byte_offset(retype(flag, type), qtr_control)); + struct brw_reg neg = vec1(dst); + neg.negate = true; + inst = brw_ADD(p, vec1(dst), neg, brw_imm_uw(31)); + } + } else { + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + /* Overwrite the destination without and with execution masking to + * find out which of the channels is active. 
+ */ + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_4); + brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X), + brw_imm_ud(1)); + + inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X), + brw_imm_ud(0)); + brw_pop_insn_state(p); + brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE); + } + + brw_pop_insn_state(p); +} + +void +brw_broadcast(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg idx) +{ + const struct intel_device_info *devinfo = p->devinfo; + const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1; + brw_inst *inst; + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4); + + assert(src.file == BRW_GENERAL_REGISTER_FILE && + src.address_mode == BRW_ADDRESS_DIRECT); + assert(!src.abs && !src.negate); + + /* Gen12.5 adds the following region restriction: + * + * "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float + * and Quad-Word data must not be used." + * + * We require the source and destination types to match so stomp to an + * unsigned integer type. + */ + assert(src.type == dst.type); + src.type = dst.type = brw_reg_type_from_bit_size(type_sz(src.type) * 8, + BRW_REGISTER_TYPE_UD); + + if ((src.vstride == 0 && (src.hstride == 0 || !align1)) || + idx.file == BRW_IMMEDIATE_VALUE) { + /* Trivial, the source is already uniform or the index is a constant. + * We will typically not get here if the optimizer is doing its job, but + * asserting would be mean. + */ + const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0; + src = align1 ? 
stride(suboffset(src, i), 0, 1, 0) : + stride(suboffset(src, 4 * i), 0, 4, 1); + + if (type_sz(src.type) > 4 && !devinfo->has_64bit_int) { + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), + subscript(src, BRW_REGISTER_TYPE_D, 0)); + brw_set_default_swsb(p, tgl_swsb_null()); + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), + subscript(src, BRW_REGISTER_TYPE_D, 1)); + } else { + brw_MOV(p, dst, src); + } + } else { + /* From the Haswell PRM section "Register Region Restrictions": + * + * "The lower bits of the AddressImmediate must not overflow to + * change the register address. The lower 5 bits of Address + * Immediate when added to lower 5 bits of address register gives + * the sub-register offset. The upper bits of Address Immediate + * when added to upper bits of address register gives the register + * address. Any overflow from sub-register offset is dropped." + * + * Fortunately, for broadcast, we never have a sub-register offset so + * this isn't an issue. + */ + assert(src.subnr == 0); + + if (align1) { + const struct brw_reg addr = + retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); + unsigned offset = src.nr * REG_SIZE + src.subnr; + /* Limit in bytes of the signed indirect addressing immediate. */ + const unsigned limit = 512; + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + + /* Take into account the component size and horizontal stride. */ + assert(src.vstride == src.hstride + src.width); + brw_SHL(p, addr, vec1(idx), + brw_imm_ud(util_logbase2(type_sz(src.type)) + + src.hstride - 1)); + + /* We can only address up to limit bytes using the indirect + * addressing immediate, account for the difference if the source + * register is above this limit. 
+ */ + if (offset >= limit) { + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit)); + offset = offset % limit; + } + + brw_pop_insn_state(p); + + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + + /* Use indirect addressing to fetch the specified component. */ + if (type_sz(src.type) > 4 && + (devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo) || + !devinfo->has_64bit_int)) { + /* From the Cherryview PRM Vol 7. "Register Region Restrictions": + * + * "When source or destination datatype is 64b or operation is + * integer DWord multiply, indirect addressing must not be + * used." + * + * To work around both of this issue, we do two integer MOVs + * insead of one 64-bit MOV. Because no double value should ever + * cross a register boundary, it's safe to use the immediate + * offset in the indirect here to handle adding 4 bytes to the + * offset and avoid the extra ADD to the register file. + */ + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), + retype(brw_vec1_indirect(addr.subnr, offset), + BRW_REGISTER_TYPE_D)); + brw_set_default_swsb(p, tgl_swsb_null()); + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), + retype(brw_vec1_indirect(addr.subnr, offset + 4), + BRW_REGISTER_TYPE_D)); + } else { + brw_MOV(p, dst, + retype(brw_vec1_indirect(addr.subnr, offset), src.type)); + } + } else { + /* In SIMD4x2 mode the index can be either zero or one, replicate it + * to all bits of a flag register, + */ + inst = brw_MOV(p, + brw_null_reg(), + stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1)); + brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE); + brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ); + brw_inst_set_flag_reg_nr(devinfo, inst, 1); + + /* and use predicated SEL to pick the right channel. 
*/ + inst = brw_SEL(p, dst, + stride(suboffset(src, 4), 4, 4, 1), + stride(src, 4, 4, 1)); + brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL); + brw_inst_set_flag_reg_nr(devinfo, inst, 1); + } + } + + brw_pop_insn_state(p); +} + + +/** + * Emit the SEND message for a barrier + */ +void +brw_barrier(struct brw_codegen *p, struct brw_reg src) +{ + const struct intel_device_info *devinfo = p->devinfo; + struct brw_inst *inst; + + assert(devinfo->ver >= 7); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + inst = next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW)); + brw_set_src0(p, inst, src); + brw_set_src1(p, inst, brw_null_reg()); + brw_set_desc(p, inst, brw_message_desc(devinfo, + 1 * reg_unit(devinfo), 0, false)); + + brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY); + brw_inst_set_gateway_subfuncid(devinfo, inst, + BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG); + + brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE); + brw_pop_insn_state(p); +} + + +/** + * Emit the wait instruction for a barrier + */ +void +brw_WAIT(struct brw_codegen *p) +{ + const struct intel_device_info *devinfo = p->devinfo; + struct brw_inst *insn; + + struct brw_reg src = brw_notification_reg(); + + insn = next_insn(p, BRW_OPCODE_WAIT); + brw_set_dest(p, insn, src); + brw_set_src0(p, insn, src); + brw_set_src1(p, insn, brw_null_reg()); + + brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); +} + +void +brw_float_controls_mode(struct brw_codegen *p, + unsigned mode, unsigned mask) +{ + assert(p->current->mask_control == BRW_MASK_DISABLE); + + /* From the Skylake PRM, Volume 7, page 760: + * "Implementation Restriction on Register Access: When the control + * register is used as an explicit source and/or destination, hardware + * does not ensure execution pipeline coherency. 
Software must set the + * thread control field to ‘switch’ for an instruction that uses + * control register as an explicit operand." + * + * On Gfx12+ this is implemented in terms of SWSB annotations instead. + */ + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + + brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0), + brw_imm_ud(~mask)); + brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1); + if (p->devinfo->ver < 12) + brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH); + + if (mode) { + brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0), + brw_imm_ud(mode)); + brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1); + if (p->devinfo->ver < 12) + brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH); + } + + if (p->devinfo->ver >= 12) + brw_SYNC(p, TGL_SYNC_NOP); +} + +void +brw_update_reloc_imm(const struct brw_isa_info *isa, + brw_inst *inst, + uint32_t value) +{ + const struct intel_device_info *devinfo = isa->devinfo; + + /* Sanity check that the instruction is a MOV of an immediate */ + assert(brw_inst_opcode(isa, inst) == BRW_OPCODE_MOV); + assert(brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE); + + /* If it was compacted, we can't safely rewrite */ + assert(brw_inst_cmpt_control(devinfo, inst) == 0); + + brw_inst_set_imm_ud(devinfo, inst, value); +} + +/* A default value for constants that will be patched at run-time. + * We pick an arbitrary value that prevents instruction compaction. 
+ */ +#define DEFAULT_PATCH_IMM 0x4a7cc037 + +void +brw_MOV_reloc_imm(struct brw_codegen *p, + struct brw_reg dst, + enum brw_reg_type src_type, + uint32_t id) +{ + assert(type_sz(src_type) == 4); + assert(type_sz(dst.type) == 4); + + brw_add_reloc(p, id, BRW_SHADER_RELOC_TYPE_MOV_IMM, + p->next_insn_offset, 0); + + brw_MOV(p, dst, retype(brw_imm_ud(DEFAULT_PATCH_IMM), src_type)); +} diff --git a/src/intel/compiler/elk/brw_eu_util.c b/src/intel/compiler/elk/brw_eu_util.c new file mode 100644 index 00000000000..9fc8ff9c7c7 --- /dev/null +++ b/src/intel/compiler/elk/brw_eu_util.c @@ -0,0 +1,119 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + + +#include "brw_eu_defines.h" +#include "brw_eu.h" + + +void brw_math_invert( struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src) +{ + gfx4_math(p, + dst, + BRW_MATH_FUNCTION_INV, + 0, + src, + BRW_MATH_PRECISION_FULL); +} + + + +void brw_copy4(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count) +{ + unsigned i; + + dst = vec4(dst); + src = vec4(src); + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta)); + brw_MOV(p, byte_offset(dst, delta+16), byte_offset(src, delta+16)); + } +} + + +void brw_copy8(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count) +{ + unsigned i; + + dst = vec8(dst); + src = vec8(src); + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta)); + } +} + + +void brw_copy_indirect_to_indirect(struct brw_codegen *p, + struct brw_indirect dst_ptr, + struct brw_indirect src_ptr, + unsigned count) +{ + unsigned i; + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, deref_4f(dst_ptr, delta), deref_4f(src_ptr, delta)); + brw_MOV(p, deref_4f(dst_ptr, delta+16), deref_4f(src_ptr, delta+16)); + } +} + + +void brw_copy_from_indirect(struct brw_codegen *p, + struct brw_reg dst, + struct brw_indirect ptr, + unsigned count) +{ + unsigned i; + + dst = vec4(dst); + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, byte_offset(dst, delta), deref_4f(ptr, delta)); + brw_MOV(p, byte_offset(dst, delta+16), deref_4f(ptr, delta+16)); + } +} diff --git a/src/intel/compiler/elk/brw_eu_validate.c b/src/intel/compiler/elk/brw_eu_validate.c new file mode 100644 index 00000000000..ec22ef4fa03 --- /dev/null +++ b/src/intel/compiler/elk/brw_eu_validate.c @@ -0,0 +1,2827 @@ +/* + * Copyright © 
2015-2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_eu_validate.c + * + * This file implements a pass that validates shader assembly. + * + * The restrictions implemented herein are intended to verify that instructions + * in shader assembly do not violate restrictions documented in the graphics + * programming reference manuals. + * + * The restrictions are difficult for humans to quickly verify due to their + * complexity and abundance. + * + * It is critical that this code is thoroughly unit tested because false + * results will lead developers astray, which is worse than having no validator + * at all. Functional changes to this file without corresponding unit tests (in + * test_eu_validate.cpp) will be rejected. + */ + +#include +#include "brw_eu.h" +#include "brw_disasm_info.h" + +/* We're going to do lots of string concatenation, so this should help. 
 */
/* A growable, length-counted error string (str may be NULL when empty). */
struct string {
   char *str;
   size_t len;
};

/* Append src to dest, reallocating dest's buffer to fit.
 * NOTE(review): the realloc result is assigned straight back to dest->str
 * with no NULL check, so an allocation failure would crash in the memcpy
 * below — presumably acceptable for a validator error path; confirm.
 */
static void
cat(struct string *dest, const struct string src)
{
   dest->str = realloc(dest->str, dest->len + src.len + 1);
   memcpy(dest->str + dest->len, src.str, src.len);
   dest->str[dest->len + src.len] = '\0';
   dest->len = dest->len + src.len;
}
#define CAT(dest, src) cat(&dest, (struct string){src, strlen(src)})

/* Counted-substring search; a NULL haystack never matches. */
static bool
contains(const struct string haystack, const struct string needle)
{
   return haystack.str && memmem(haystack.str, haystack.len,
                                 needle.str, needle.len) != NULL;
}
#define CONTAINS(haystack, needle) \
   contains(haystack, (struct string){needle, strlen(needle)})

#define error(str)   "\tERROR: " str "\n"
#define ERROR_INDENT "\t       "

/* Append an error message to the local 'error_msg'; duplicates of the same
 * message are suppressed via CONTAINS.
 */
#define ERROR(msg) ERROR_IF(true, msg)
#define ERROR_IF(cond, msg)                               \
   do {                                                   \
      if ((cond) && !CONTAINS(error_msg, error(msg))) {   \
         CAT(error_msg, error(msg));                      \
      }                                                   \
   } while(0)

/* Run one validation pass and fold its (heap-allocated) result into
 * 'error_msg', freeing the intermediate string.
 */
#define CHECK(func, args...)                              \
   do {                                                   \
      struct string __msg = func(isa, inst, ##args);      \
      if (__msg.str) {                                    \
         cat(&error_msg, __msg);                          \
         free(__msg.str);                                 \
      }                                                   \
   } while (0)

/* Decode the hardware stride encoding: 0 -> 0, n -> 2^(n-1). */
#define STRIDE(stride) (stride != 0 ? \
   1 << ((stride) - 1) : 0)
/* Decode the hardware width encoding: n -> 2^n. */
#define WIDTH(width)   (1 << (width))

/* True for any of the four send-family opcodes. */
static bool
inst_is_send(const struct brw_isa_info *isa, const brw_inst *inst)
{
   switch (brw_inst_opcode(isa, inst)) {
   case BRW_OPCODE_SEND:
   case BRW_OPCODE_SENDC:
   case BRW_OPCODE_SENDS:
   case BRW_OPCODE_SENDSC:
      return true;
   default:
      return false;
   }
}

/* On Gfx12+ every send is a split send; before that only SENDS/SENDSC are. */
static bool
inst_is_split_send(const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   if (devinfo->ver >= 12) {
      return inst_is_send(isa, inst);
   } else {
      switch (brw_inst_opcode(isa, inst)) {
      case BRW_OPCODE_SENDS:
      case BRW_OPCODE_SENDSC:
         return true;
      default:
         return false;
      }
   }
}

/* Map an unsigned integer register type to its signed counterpart; all
 * other types pass through unchanged.
 */
static unsigned
signed_type(unsigned type)
{
   switch (type) {
   case BRW_REGISTER_TYPE_UD: return BRW_REGISTER_TYPE_D;
   case BRW_REGISTER_TYPE_UW: return BRW_REGISTER_TYPE_W;
   case BRW_REGISTER_TYPE_UB: return BRW_REGISTER_TYPE_B;
   case BRW_REGISTER_TYPE_UQ: return BRW_REGISTER_TYPE_Q;
   default: return type;
   }
}

/* Destination type of an instruction; sends on Gfx12+ are treated as D. */
static enum brw_reg_type
inst_dst_type(const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   return (devinfo->ver < 12 || !inst_is_send(isa, inst)) ?
          brw_inst_dst_type(devinfo, inst) : BRW_REGISTER_TYPE_D;
}

/* A "raw" MOV: no saturate, no source modifiers, and src/dst types equal
 * up to signedness. Immediate vector types (VF/V/UV) are conservatively
 * rejected (see FIXME below).
 */
static bool
inst_is_raw_move(const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   unsigned dst_type = signed_type(inst_dst_type(isa, inst));
   unsigned src_type = signed_type(brw_inst_src0_type(devinfo, inst));

   if (brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) {
      /* FIXME: not strictly true */
      if (brw_inst_src0_type(devinfo, inst) == BRW_REGISTER_TYPE_VF ||
          brw_inst_src0_type(devinfo, inst) == BRW_REGISTER_TYPE_UV ||
          brw_inst_src0_type(devinfo, inst) == BRW_REGISTER_TYPE_V) {
         return false;
      }
   } else if (brw_inst_src0_negate(devinfo, inst) ||
              brw_inst_src0_abs(devinfo, inst)) {
      return false;
   }

   return brw_inst_opcode(isa, inst) == BRW_OPCODE_MOV &&
          brw_inst_saturate(devinfo, inst) == 0 &&
          dst_type == src_type;
}

/* Destination is the ARF null register. */
static bool
dst_is_null(const struct intel_device_info *devinfo, const brw_inst *inst)
{
   return brw_inst_dst_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
          brw_inst_dst_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
}

/* src0 is the ARF null register (direct addressing only). */
static bool
src0_is_null(const struct intel_device_info *devinfo, const brw_inst *inst)
{
   return brw_inst_src0_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT &&
          brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
          brw_inst_src0_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
}

/* src1 is the ARF null register. */
static bool
src1_is_null(const struct intel_device_info *devinfo, const brw_inst *inst)
{
   return brw_inst_src1_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
          brw_inst_src1_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
}

/* src0 is an accumulator register (high nibble of the reg nr selects the
 * ARF; the low nibble is the accumulator index).
 */
static bool
src0_is_acc(const struct intel_device_info *devinfo, const brw_inst *inst)
{
   return brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
          (brw_inst_src0_da_reg_nr(devinfo, inst) & 0xF0) == BRW_ARF_ACCUMULATOR;
}

/* src1 is an accumulator register. */
static bool
src1_is_acc(const struct intel_device_info
            *devinfo, const brw_inst *inst)
{
   return brw_inst_src1_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
          (brw_inst_src1_da_reg_nr(devinfo, inst) & 0xF0) == BRW_ARF_ACCUMULATOR;
}

/* src0 uses the scalar region <0;1,0>. */
static bool
src0_has_scalar_region(const struct intel_device_info *devinfo,
                       const brw_inst *inst)
{
   return brw_inst_src0_vstride(devinfo, inst) == BRW_VERTICAL_STRIDE_0 &&
          brw_inst_src0_width(devinfo, inst) == BRW_WIDTH_1 &&
          brw_inst_src0_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0;
}

/* src1 uses the scalar region <0;1,0>. */
static bool
src1_has_scalar_region(const struct intel_device_info *devinfo,
                       const brw_inst *inst)
{
   return brw_inst_src1_vstride(devinfo, inst) == BRW_VERTICAL_STRIDE_0 &&
          brw_inst_src1_width(devinfo, inst) == BRW_WIDTH_1 &&
          brw_inst_src1_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0;
}

/* Reject encodings with no valid meaning: bad execution sizes, bad channel
 * offsets (Gfx12+), bad register files, and bad register types.
 */
static struct string
invalid_values(const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   struct string error_msg = { .str = NULL, .len = 0 };

   switch ((enum brw_execution_size) brw_inst_exec_size(devinfo, inst)) {
   case BRW_EXECUTE_1:
   case BRW_EXECUTE_2:
   case BRW_EXECUTE_4:
   case BRW_EXECUTE_8:
   case BRW_EXECUTE_16:
   case BRW_EXECUTE_32:
      break;
   default:
      ERROR("invalid execution size");
      break;
   }

   /* With a bogus exec size the remaining checks would be meaningless. */
   if (error_msg.str)
      return error_msg;

   if (devinfo->ver >= 12) {
      unsigned group_size = 1 << brw_inst_exec_size(devinfo, inst);
      unsigned qtr_ctrl = brw_inst_qtr_control(devinfo, inst);
      unsigned nib_ctrl = brw_inst_nib_control(devinfo, inst);

      /* Channel offset in channels: each quarter is 8, each nibble is 4. */
      unsigned chan_off = (qtr_ctrl * 2 + nib_ctrl) << 2;
      ERROR_IF(chan_off % group_size != 0,
               "The execution size must be a factor of the chosen offset");
   }

   if (inst_is_send(isa, inst))
      return error_msg;

   if (num_sources == 3) {
      /* Nothing to test:
       *    No 3-src instructions on Gfx4-5
       *    No reg file bits on Gfx6-10 (align16)
       *    No invalid encodings on Gfx10-12
       *    (align1)
       */
   } else {
      if (devinfo->ver > 6) {
         ERROR_IF(brw_inst_dst_reg_file(devinfo, inst) == MRF ||
                  (num_sources > 0 &&
                   brw_inst_src0_reg_file(devinfo, inst) == MRF) ||
                  (num_sources > 1 &&
                   brw_inst_src1_reg_file(devinfo, inst) == MRF),
                  "invalid register file encoding");
      }
   }

   if (error_msg.str)
      return error_msg;

   /* Validate the type encodings of whichever operands are present. */
   if (num_sources == 3) {
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (devinfo->ver >= 10) {
            ERROR_IF(brw_inst_3src_a1_dst_type (devinfo, inst) == INVALID_REG_TYPE ||
                     brw_inst_3src_a1_src0_type(devinfo, inst) == INVALID_REG_TYPE ||
                     brw_inst_3src_a1_src1_type(devinfo, inst) == INVALID_REG_TYPE ||
                     brw_inst_3src_a1_src2_type(devinfo, inst) == INVALID_REG_TYPE,
                     "invalid register type encoding");
         } else {
            ERROR("Align1 mode not allowed on Gen < 10");
         }
      } else {
         ERROR_IF(brw_inst_3src_a16_dst_type(devinfo, inst) == INVALID_REG_TYPE ||
                  brw_inst_3src_a16_src_type(devinfo, inst) == INVALID_REG_TYPE,
                  "invalid register type encoding");
      }
   } else {
      ERROR_IF(brw_inst_dst_type (devinfo, inst) == INVALID_REG_TYPE ||
               (num_sources > 0 &&
                brw_inst_src0_type(devinfo, inst) == INVALID_REG_TYPE) ||
               (num_sources > 1 &&
                brw_inst_src1_type(devinfo, inst) == INVALID_REG_TYPE),
               "invalid register type encoding");
   }

   return error_msg;
}

/* Sources that must be non-null: src0 for anything but SYNC, src1 for
 * two-source instructions.
 */
static struct string
sources_not_null(const struct brw_isa_info *isa,
                 const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   struct string error_msg = { .str = NULL, .len = 0 };

   /* Nothing to test. 3-src instructions can only have GRF sources, and
    * there's no bit to control the file.
    */
   if (num_sources == 3)
      return (struct string){};

   /* Nothing to test. Split sends can only encode a file in sources that are
    * allowed to be NULL.
    */
   if (inst_is_split_send(isa, inst))
      return (struct string){};

   if (num_sources >= 1 && brw_inst_opcode(isa, inst) != BRW_OPCODE_SYNC)
      ERROR_IF(src0_is_null(devinfo, inst), "src0 is null");

   if (num_sources == 2)
      ERROR_IF(src1_is_null(devinfo, inst), "src1 is null");

   return error_msg;
}

/* Align16 access mode is gone from the hardware starting with Gfx11. */
static struct string
alignment_supported(const struct brw_isa_info *isa,
                    const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   struct string error_msg = { .str = NULL, .len = 0 };

   ERROR_IF(devinfo->ver >= 11 && brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16,
            "Align16 not supported");

   return error_msg;
}

/* True if the instruction reads the accumulator, either implicitly
 * (MAC/MACH/SADA2) or through an explicit acc source register.
 */
static bool
inst_uses_src_acc(const struct brw_isa_info *isa,
                  const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   /* Check instructions that use implicit accumulator sources */
   switch (brw_inst_opcode(isa, inst)) {
   case BRW_OPCODE_MAC:
   case BRW_OPCODE_MACH:
   case BRW_OPCODE_SADA2:
      return true;
   default:
      break;
   }

   /* FIXME: support 3-src instructions */
   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   assert(num_sources < 3);

   return src0_is_acc(devinfo, inst) || (num_sources > 1 && src1_is_acc(devinfo, inst));
}

/* Restrictions specific to send-family instructions: source files,
 * EOT payload register ranges, and split-send payload overlap.
 */
static struct string
send_restrictions(const struct brw_isa_info *isa,
                  const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   struct string error_msg = { .str = NULL, .len = 0 };

   if (inst_is_split_send(isa, inst)) {
      ERROR_IF(brw_inst_send_src1_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
               brw_inst_send_src1_reg_nr(devinfo, inst) != BRW_ARF_NULL,
               "src1 of split send must be a GRF or NULL");

      ERROR_IF(brw_inst_eot(devinfo, inst) &&
               brw_inst_src0_da_reg_nr(devinfo, inst) < 112,
               "send with EOT must use g112-g127");
      ERROR_IF(brw_inst_eot(devinfo, inst) &&
               brw_inst_send_src1_reg_file(devinfo, inst) == BRW_GENERAL_REGISTER_FILE &&
               brw_inst_send_src1_reg_nr(devinfo, inst) < 112,
               "send with EOT must use g112-g127");

      if (brw_inst_send_src0_reg_file(devinfo, inst) == BRW_GENERAL_REGISTER_FILE &&
          brw_inst_send_src1_reg_file(devinfo, inst) == BRW_GENERAL_REGISTER_FILE) {
         /* Assume minimums if we don't know */
         unsigned mlen = 1;
         if (!brw_inst_send_sel_reg32_desc(devinfo, inst)) {
            const uint32_t desc = brw_inst_send_desc(devinfo, inst);
            mlen = brw_message_desc_mlen(devinfo, desc) / reg_unit(devinfo);
         }

         unsigned ex_mlen = 1;
         if (!brw_inst_send_sel_reg32_ex_desc(devinfo, inst)) {
            const uint32_t ex_desc = brw_inst_sends_ex_desc(devinfo, inst);
            ex_mlen = brw_message_ex_desc_ex_mlen(devinfo, ex_desc) /
                      reg_unit(devinfo);
         }
         /* The ranges [src0, src0+mlen) and [src1, src1+ex_mlen) must be
          * disjoint.
          */
         const unsigned src0_reg_nr = brw_inst_src0_da_reg_nr(devinfo, inst);
         const unsigned src1_reg_nr = brw_inst_send_src1_reg_nr(devinfo, inst);
         ERROR_IF((src0_reg_nr <= src1_reg_nr &&
                   src1_reg_nr < src0_reg_nr + mlen) ||
                  (src1_reg_nr <= src0_reg_nr &&
                   src0_reg_nr < src1_reg_nr + ex_mlen),
                  "split send payloads must not overlap");
      }
   } else if (inst_is_send(isa, inst)) {
      ERROR_IF(brw_inst_src0_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT,
               "send must use direct addressing");

      if (devinfo->ver >= 7) {
         ERROR_IF(brw_inst_send_src0_reg_file(devinfo, inst) != BRW_GENERAL_REGISTER_FILE,
                  "send from non-GRF");
         ERROR_IF(brw_inst_eot(devinfo, inst) &&
                  brw_inst_src0_da_reg_nr(devinfo, inst) < 112,
                  "send with EOT must use g112-g127");
      }

      if (devinfo->ver >= 8) {
         ERROR_IF(!dst_is_null(devinfo, inst) &&
                  (brw_inst_dst_da_reg_nr(devinfo, inst) +
                   brw_inst_rlen(devinfo, inst) > 127) &&
                  (brw_inst_src0_da_reg_nr(devinfo, inst) +
                   brw_inst_mlen(devinfo, inst) >
                   brw_inst_dst_da_reg_nr(devinfo, inst)),
                  "r127 must not be used for return address when there is "
                  "a src and dest overlap");
      }
   }

   return error_msg;
}

/* ILLEGAL is the only opcode this validator treats as unsupported. */
static bool
is_unsupported_inst(const struct brw_isa_info *isa,
                    const brw_inst *inst)
{
   return
      brw_inst_opcode(isa, inst) == BRW_OPCODE_ILLEGAL;
}

/**
 * Returns whether a combination of two types would qualify as mixed float
 * operation mode
 */
static inline bool
types_are_mixed_float(enum brw_reg_type t0, enum brw_reg_type t1)
{
   return (t0 == BRW_REGISTER_TYPE_F && t1 == BRW_REGISTER_TYPE_HF) ||
          (t1 == BRW_REGISTER_TYPE_F && t0 == BRW_REGISTER_TYPE_HF);
}

/* Collapse a register type to the type the EU executes it as: sub-word
 * integers (and packed integer vectors) widen to W, VF behaves as F, and
 * the 64-bit integer types collapse to Q.
 */
static enum brw_reg_type
execution_type_for_type(enum brw_reg_type type)
{
   switch (type) {
   case BRW_REGISTER_TYPE_NF:
   case BRW_REGISTER_TYPE_DF:
   case BRW_REGISTER_TYPE_F:
   case BRW_REGISTER_TYPE_HF:
      return type;

   case BRW_REGISTER_TYPE_VF:
      return BRW_REGISTER_TYPE_F;

   case BRW_REGISTER_TYPE_Q:
   case BRW_REGISTER_TYPE_UQ:
      return BRW_REGISTER_TYPE_Q;

   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      return BRW_REGISTER_TYPE_D;

   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_V:
   case BRW_REGISTER_TYPE_UV:
      return BRW_REGISTER_TYPE_W;
   }
   unreachable("not reached");
}

/**
 * Returns the execution type of an instruction \p inst
 */
static enum brw_reg_type
execution_type(const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   enum brw_reg_type src0_exec_type, src1_exec_type;

   /* Execution data type is independent of destination data type, except in
    * mixed F/HF instructions.
    */
   enum brw_reg_type dst_exec_type = inst_dst_type(isa, inst);

   src0_exec_type = execution_type_for_type(brw_inst_src0_type(devinfo, inst));
   if (num_sources == 1) {
      /* A lone HF source defers to the destination's execution type. */
      if (src0_exec_type == BRW_REGISTER_TYPE_HF)
         return dst_exec_type;
      return src0_exec_type;
   }

   src1_exec_type = execution_type_for_type(brw_inst_src1_type(devinfo, inst));
   if (types_are_mixed_float(src0_exec_type, src1_exec_type) ||
       types_are_mixed_float(src0_exec_type, dst_exec_type) ||
       types_are_mixed_float(src1_exec_type, dst_exec_type)) {
      return BRW_REGISTER_TYPE_F;
   }

   if (src0_exec_type == src1_exec_type)
      return src0_exec_type;

   if (src0_exec_type == BRW_REGISTER_TYPE_NF ||
       src1_exec_type == BRW_REGISTER_TYPE_NF)
      return BRW_REGISTER_TYPE_NF;

   /* Mixed operand types where one is float is float on Gen < 6
    * (and not allowed on later platforms)
    */
   if (devinfo->ver < 6 &&
       (src0_exec_type == BRW_REGISTER_TYPE_F ||
        src1_exec_type == BRW_REGISTER_TYPE_F))
      return BRW_REGISTER_TYPE_F;

   if (src0_exec_type == BRW_REGISTER_TYPE_Q ||
       src1_exec_type == BRW_REGISTER_TYPE_Q)
      return BRW_REGISTER_TYPE_Q;

   if (src0_exec_type == BRW_REGISTER_TYPE_D ||
       src1_exec_type == BRW_REGISTER_TYPE_D)
      return BRW_REGISTER_TYPE_D;

   if (src0_exec_type == BRW_REGISTER_TYPE_W ||
       src1_exec_type == BRW_REGISTER_TYPE_W)
      return BRW_REGISTER_TYPE_W;

   if (src0_exec_type == BRW_REGISTER_TYPE_DF ||
       src1_exec_type == BRW_REGISTER_TYPE_DF)
      return BRW_REGISTER_TYPE_DF;

   unreachable("not reached");
}

/**
 * Returns whether a region is packed
 *
 * A region is packed if its elements are adjacent in memory, with no
 * intervening space, no overlap, and no replicated values.
 */
static bool
is_packed(unsigned vstride, unsigned width, unsigned hstride)
{
   if (vstride == width) {
      if (vstride == 1) {
         /* Width-1 regions are packed only as the scalar <1;1,0>. */
         return hstride == 0;
      } else {
         return hstride == 1;
      }
   }

   return false;
}

/**
 * Returns whether a region is linear
 *
 * A region is linear if its elements do not overlap and are not replicated.
 * Unlike a packed region, intervening space (i.e. strided values) is allowed.
 */
static bool
is_linear(unsigned vstride, unsigned width, unsigned hstride)
{
   return vstride == width * hstride ||
          (hstride == 0 && width == 1);
}

/**
 * Returns whether an instruction is an explicit or implicit conversion
 * to/from half-float.
 */
static bool
is_half_float_conversion(const struct brw_isa_info *isa,
                         const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst);

   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);

   if (dst_type != src0_type &&
       (dst_type == BRW_REGISTER_TYPE_HF || src0_type == BRW_REGISTER_TYPE_HF)) {
      return true;
   } else if (num_sources > 1) {
      enum brw_reg_type src1_type = brw_inst_src1_type(devinfo, inst);
      return dst_type != src1_type &&
             (dst_type == BRW_REGISTER_TYPE_HF ||
              src1_type == BRW_REGISTER_TYPE_HF);
   }

   return false;
}

/*
 * Returns whether an instruction is using mixed float operation mode
 */
static bool
is_mixed_float(const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   /* Mixed F/HF mode only exists on Gfx8+. */
   if (devinfo->ver < 8)
      return false;

   if (inst_is_send(isa, inst))
      return false;

   unsigned opcode = brw_inst_opcode(isa, inst);
   const struct opcode_desc *desc = brw_opcode_desc(isa, opcode);
   if (desc->ndst == 0)
      return false;

   /* FIXME: support 3-src instructions */
   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   assert(num_sources < 3);

   enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst);
   enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);

   if (num_sources == 1)
      return types_are_mixed_float(src0_type, dst_type);

   enum brw_reg_type src1_type = brw_inst_src1_type(devinfo, inst);

   /* Mixed mode if any operand pair mixes F and HF. */
   return types_are_mixed_float(src0_type, src1_type) ||
          types_are_mixed_float(src0_type, dst_type) ||
          types_are_mixed_float(src1_type, dst_type);
}

/**
 * Returns whether an instruction is an explicit or implicit conversion
 * to/from byte.
 */
static bool
is_byte_conversion(const struct brw_isa_info *isa,
                   const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst);

   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);

   if (dst_type != src0_type &&
       (type_sz(dst_type) == 1 || type_sz(src0_type) == 1)) {
      return true;
   } else if (num_sources > 1) {
      enum brw_reg_type src1_type = brw_inst_src1_type(devinfo, inst);
      return dst_type != src1_type &&
             (type_sz(dst_type) == 1 || type_sz(src1_type) == 1);
   }

   return false;
}

/**
 * Checks restrictions listed in "General Restrictions Based on Operand Types"
 * in the "Register Region Restrictions" section.
 */
static struct string
general_restrictions_based_on_operand_types(const struct brw_isa_info *isa,
                                            const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   const struct opcode_desc *desc =
      brw_opcode_desc(isa, brw_inst_opcode(isa, inst));
   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
   struct string error_msg = { .str = NULL, .len = 0 };

   if (inst_is_send(isa, inst))
      return error_msg;

   if (devinfo->ver >= 11) {
      /* A register type of B or UB for DPAS actually means 4 bytes packed into
       * a D or UD, so it is allowed.
       */
      if (num_sources == 3 && brw_inst_opcode(isa, inst) != BRW_OPCODE_DPAS) {
         ERROR_IF(brw_reg_type_to_size(brw_inst_3src_a1_src1_type(devinfo, inst)) == 1 ||
                  brw_reg_type_to_size(brw_inst_3src_a1_src2_type(devinfo, inst)) == 1,
                  "Byte data type is not supported for src1/2 register regioning. This includes "
                  "byte broadcast as well.");
      }
      if (num_sources == 2) {
         ERROR_IF(brw_reg_type_to_size(brw_inst_src1_type(devinfo, inst)) == 1,
                  "Byte data type is not supported for src1 register regioning. This includes "
                  "byte broadcast as well.");
      }
   }

   enum brw_reg_type dst_type;

   if (num_sources == 3) {
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1)
         dst_type = brw_inst_3src_a1_dst_type(devinfo, inst);
      else
         dst_type = brw_inst_3src_a16_dst_type(devinfo, inst);
   } else {
      dst_type = inst_dst_type(isa, inst);
   }

   ERROR_IF(dst_type == BRW_REGISTER_TYPE_DF &&
            !devinfo->has_64bit_float,
            "64-bit float destination, but platform does not support it");

   ERROR_IF((dst_type == BRW_REGISTER_TYPE_Q ||
             dst_type == BRW_REGISTER_TYPE_UQ) &&
            !devinfo->has_64bit_int,
            "64-bit int destination, but platform does not support it");

   for (unsigned s = 0; s < num_sources; s++) {
      enum brw_reg_type src_type;
      if (num_sources == 3) {
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            switch (s) {
            case 0: src_type = brw_inst_3src_a1_src0_type(devinfo, inst); break;
            case 1: src_type = brw_inst_3src_a1_src1_type(devinfo, inst); break;
            case 2: src_type = brw_inst_3src_a1_src2_type(devinfo, inst); break;
            default: unreachable("invalid src");
            }
         } else {
            src_type = brw_inst_3src_a16_src_type(devinfo, inst);
         }
      } else {
         switch (s) {
         case 0: src_type = brw_inst_src0_type(devinfo, inst); break;
         case 1: src_type = brw_inst_src1_type(devinfo, inst); break;
         default: unreachable("invalid src");
         }
      }

      ERROR_IF(src_type == BRW_REGISTER_TYPE_DF &&
               !devinfo->has_64bit_float,
               "64-bit float source, but platform does not support it");

      ERROR_IF((src_type == BRW_REGISTER_TYPE_Q ||
                src_type == BRW_REGISTER_TYPE_UQ) &&
               !devinfo->has_64bit_int,
               "64-bit int source, but platform does not support it");
   }

   /* The stride/alignment rules below only apply to 1- and 2-src
    * instructions with a real destination and exec size > 1.
    */
   if (num_sources == 3)
      return error_msg;

   if (exec_size == 1)
      return error_msg;

   if (desc->ndst == 0)
      return error_msg;

   /* The PRMs say:
    *
    *    Where n is the largest element size in bytes for any source or
    *    destination operand type, ExecSize * n must be <= 64.
    *
    * But we do not attempt to enforce it, because it is implied by other
    * rules:
    *
    *    - that the destination stride must match the execution data type
    *    - sources may not span more than two adjacent GRF registers
    *    - destination may not span more than two adjacent GRF registers
    *
    * In fact, checking it would weaken testing of the other rules.
    */

   unsigned dst_stride = STRIDE(brw_inst_dst_hstride(devinfo, inst));
   bool dst_type_is_byte =
      inst_dst_type(isa, inst) == BRW_REGISTER_TYPE_B ||
      inst_dst_type(isa, inst) == BRW_REGISTER_TYPE_UB;

   if (dst_type_is_byte) {
      if (is_packed(exec_size * dst_stride, exec_size, dst_stride)) {
         if (!inst_is_raw_move(isa, inst))
            ERROR("Only raw MOV supports a packed-byte destination");
         return error_msg;
      }
   }

   unsigned exec_type = execution_type(isa, inst);
   unsigned exec_type_size = brw_reg_type_to_size(exec_type);
   unsigned dst_type_size = brw_reg_type_to_size(dst_type);

   /* On IVB/BYT, region parameters and execution size for DF are in terms of
    * 32-bit elements, so they are doubled. For evaluating the validity of an
    * instruction, we halve them.
    */
   if (devinfo->verx10 == 70 &&
       exec_type_size == 8 && dst_type_size == 4)
      dst_type_size = 8;

   if (is_byte_conversion(isa, inst)) {
      /* From the BDW+ PRM, Volume 2a, Command Reference, Instructions - MOV:
       *
       *    "There is no direct conversion from B/UB to DF or DF to B/UB.
       *     There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB."
       *
       * Even if these restrictions are listed for the MOV instruction, we
       * validate this more generally, since there is the possibility
       * of implicit conversions from other instructions.
       */
      enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
      enum brw_reg_type src1_type = num_sources > 1 ?
                                    brw_inst_src1_type(devinfo, inst) : 0;

      ERROR_IF(type_sz(dst_type) == 1 &&
               (type_sz(src0_type) == 8 ||
                (num_sources > 1 && type_sz(src1_type) == 8)),
               "There are no direct conversions between 64-bit types and B/UB");

      ERROR_IF(type_sz(dst_type) == 8 &&
               (type_sz(src0_type) == 1 ||
                (num_sources > 1 && type_sz(src1_type) == 1)),
               "There are no direct conversions between 64-bit types and B/UB");
   }

   if (is_half_float_conversion(isa, inst)) {
      /**
       * A helper to validate used in the validation of the following restriction
       * from the BDW+ PRM, Volume 2a, Command Reference, Instructions - MOV:
       *
       *    "There is no direct conversion from HF to DF or DF to HF.
       *     There is no direct conversion from HF to Q/UQ or Q/UQ to HF."
       *
       * Even if these restrictions are listed for the MOV instruction, we
       * validate this more generally, since there is the possibility
       * of implicit conversions from other instructions, such us implicit
       * conversion from integer to HF with the ADD instruction in SKL+.
       */
      enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
      enum brw_reg_type src1_type = num_sources > 1 ?
                                    brw_inst_src1_type(devinfo, inst) : 0;
      ERROR_IF(dst_type == BRW_REGISTER_TYPE_HF &&
               (type_sz(src0_type) == 8 ||
                (num_sources > 1 && type_sz(src1_type) == 8)),
               "There are no direct conversions between 64-bit types and HF");

      ERROR_IF(type_sz(dst_type) == 8 &&
               (src0_type == BRW_REGISTER_TYPE_HF ||
                (num_sources > 1 && src1_type == BRW_REGISTER_TYPE_HF)),
               "There are no direct conversions between 64-bit types and HF");

      /* From the BDW+ PRM:
       *
       *   "Conversion between Integer and HF (Half Float) must be
       *    DWord-aligned and strided by a DWord on the destination."
       *
       * Also, the above restrictions seems to be expanded on CHV and SKL+ by:
       *
       *   "There is a relaxed alignment rule for word destinations. When
       *    the destination type is word (UW, W, HF), destination data types
       *    can be aligned to either the lowest word or the second lowest
       *    word of the execution channel. This means the destination data
       *    words can be either all in the even word locations or all in the
       *    odd word locations."
       *
       * We do not implement the second rule as is though, since empirical
       * testing shows inconsistencies:
       *   - It suggests that packed 16-bit is not allowed, which is not true.
       *   - It suggests that conversions from Q/DF to W (which need to be
       *     64-bit aligned on the destination) are not possible, which is
       *     not true.
       *
       * So from this rule we only validate the implication that conversions
       * from F to HF need to be DWord strided (except in Align1 mixed
       * float mode where packed fp16 destination is allowed so long as the
       * destination is oword-aligned).
       *
       * Finally, we only validate this for Align1 because Align16 always
       * requires packed destinations, so these restrictions can't possibly
       * apply to Align16 mode.
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if ((dst_type == BRW_REGISTER_TYPE_HF &&
              (brw_reg_type_is_integer(src0_type) ||
               (num_sources > 1 && brw_reg_type_is_integer(src1_type)))) ||
             (brw_reg_type_is_integer(dst_type) &&
              (src0_type == BRW_REGISTER_TYPE_HF ||
               (num_sources > 1 && src1_type == BRW_REGISTER_TYPE_HF)))) {
            ERROR_IF(dst_stride * dst_type_size != 4,
                     "Conversions between integer and half-float must be "
                     "strided by a DWord on the destination");

            unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
            ERROR_IF(subreg % 4 != 0,
                     "Conversions between integer and half-float must be "
                     "aligned to a DWord on the destination");
         } else if ((devinfo->platform == INTEL_PLATFORM_CHV ||
                     devinfo->ver >= 9) &&
                    dst_type == BRW_REGISTER_TYPE_HF) {
            unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
            ERROR_IF(dst_stride != 2 &&
                     !(is_mixed_float(isa, inst) &&
                       dst_stride == 1 && subreg % 16 == 0),
                     "Conversions to HF must have either all words in even "
                     "word locations or all words in odd word locations or "
                     "be mixed-float with Oword-aligned packed destination");
         }
      }
   }

   /* There are special regioning rules for mixed-float mode in CHV and SKL that
    * override the general rule for the ratio of sizes of the destination type
    * and the execution type. We will add validation for those in a later patch.
    */
   bool validate_dst_size_and_exec_size_ratio =
      !is_mixed_float(isa, inst) ||
      !(devinfo->platform == INTEL_PLATFORM_CHV || devinfo->ver >= 9);

   if (validate_dst_size_and_exec_size_ratio &&
       exec_type_size > dst_type_size) {
      if (!(dst_type_is_byte && inst_is_raw_move(isa, inst))) {
         ERROR_IF(dst_stride * dst_type_size != exec_type_size,
                  "Destination stride must be equal to the ratio of the sizes "
                  "of the execution data type to the destination type");
      }

      unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1 &&
          brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
         /* The i965 PRM says:
          *
          *    Implementation Restriction: The relaxed alignment rule for byte
          *    destination (#10.5) is not supported.
          */
         if (devinfo->verx10 >= 45 && dst_type_is_byte) {
            ERROR_IF(subreg % exec_type_size != 0 &&
                     subreg % exec_type_size != 1,
                     "Destination subreg must be aligned to the size of the "
                     "execution data type (or to the next lowest byte for byte "
                     "destinations)");
         } else {
            ERROR_IF(subreg % exec_type_size != 0,
                     "Destination subreg must be aligned to the size of the "
                     "execution data type");
         }
      }
   }

   return error_msg;
}

/**
 * Checks restrictions listed in "General Restrictions on Regioning Parameters"
 * in the "Register Region Restrictions" section.
 */
static struct string
general_restrictions_on_region_parameters(const struct brw_isa_info *isa,
                                          const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   const struct opcode_desc *desc =
      brw_opcode_desc(isa, brw_inst_opcode(isa, inst));
   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
   struct string error_msg = { .str = NULL, .len = 0 };

   if (num_sources == 3)
      return (struct string){};

   /* Split sends don't have the bits in the instruction to encode regions so
    * there's nothing to check.
    */
   if (inst_is_split_send(isa, inst))
      return (struct string){};

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16) {
      if (desc->ndst != 0 && !dst_is_null(devinfo, inst))
         ERROR_IF(brw_inst_dst_hstride(devinfo, inst) != BRW_HORIZONTAL_STRIDE_1,
                  "Destination Horizontal Stride must be 1");

      if (num_sources >= 1) {
         if (devinfo->verx10 >= 75) {
            ERROR_IF(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_2 &&
                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
                     "In Align16 mode, only VertStride of 0, 2, or 4 is allowed");
         } else {
            ERROR_IF(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
                     "In Align16 mode, only VertStride of 0 or 4 is allowed");
         }
      }

      if (num_sources == 2) {
         if (devinfo->verx10 >= 75) {
            ERROR_IF(brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_2 &&
                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
                     "In Align16 mode, only VertStride of 0, 2, or 4 is allowed");
         } else {
            ERROR_IF(brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
                     "In Align16 mode, only VertStride of 0 or 4 is allowed");
         }
      }

      return error_msg;
   }

   /* Align1 mode: decode each source's region and check the PRM rules. */
   for (unsigned i = 0; i < num_sources; i++) {
      unsigned vstride, width, hstride, element_size, subreg;
      enum brw_reg_type type;

#define DO_SRC(n)                                                              \
      if (brw_inst_src ## n ## _reg_file(devinfo, inst) ==                     \
          BRW_IMMEDIATE_VALUE)                                                 \
         continue;                                                             \
                                                                               \
      vstride = STRIDE(brw_inst_src ## n ## _vstride(devinfo, inst));          \
      width = WIDTH(brw_inst_src ## n ## _width(devinfo, inst));               \
      hstride = STRIDE(brw_inst_src ## n ## _hstride(devinfo, inst));          \
      type = brw_inst_src ## n ## _type(devinfo, inst);                        \
      element_size = brw_reg_type_to_size(type);                               \
      subreg = brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst)

      if (i == 0) {
         DO_SRC(0);
      } else {
         DO_SRC(1);
      }
#undef DO_SRC

      /* On IVB/BYT, region parameters and execution size for DF are in terms of
       * 32-bit elements, so they are doubled. For evaluating the validity of an
       * instruction, we halve them.
       */
      if (devinfo->verx10 == 70 &&
          element_size == 8)
         element_size = 4;

      /* ExecSize must be greater than or equal to Width. */
      ERROR_IF(exec_size < width, "ExecSize must be greater than or equal "
               "to Width");

      /* If ExecSize = Width and HorzStride ≠ 0,
       * VertStride must be set to Width * HorzStride.
       */
      if (exec_size == width && hstride != 0) {
         ERROR_IF(vstride != width * hstride,
                  "If ExecSize = Width and HorzStride ≠ 0, "
                  "VertStride must be set to Width * HorzStride");
      }

      /* If Width = 1, HorzStride must be 0 regardless of the values of
       * ExecSize and VertStride.
       */
      if (width == 1) {
         ERROR_IF(hstride != 0,
                  "If Width = 1, HorzStride must be 0 regardless "
                  "of the values of ExecSize and VertStride");
      }

      /* If ExecSize = Width = 1, both VertStride and HorzStride must be 0. */
      if (exec_size == 1 && width == 1) {
         ERROR_IF(vstride != 0 || hstride != 0,
                  "If ExecSize = Width = 1, both VertStride "
                  "and HorzStride must be 0");
      }

      /* If VertStride = HorzStride = 0, Width must be 1 regardless of the
       * value of ExecSize.
       */
      if (vstride == 0 && hstride == 0) {
         ERROR_IF(width != 1,
                  "If VertStride = HorzStride = 0, Width must be "
                  "1 regardless of the value of ExecSize");
      }

      /* VertStride must be used to cross GRF register boundaries. This rule
       * implies that elements within a 'Width' cannot cross GRF boundaries.
       * The loop builds a per-row byte-access mask within a 64-byte window
       * and flags any row that touches both 32-byte halves.
       */
      const uint64_t mask = (1ULL << element_size) - 1;
      unsigned rowbase = subreg;

      for (int y = 0; y < exec_size / width; y++) {
         uint64_t access_mask = 0;
         unsigned offset = rowbase;

         for (int x = 0; x < width; x++) {
            access_mask |= mask << (offset % 64);
            offset += hstride * element_size;
         }

         rowbase += vstride * element_size;

         if ((uint32_t)access_mask != 0 && (access_mask >> 32) != 0) {
            ERROR("VertStride must be used to cross GRF register boundaries");
            break;
         }
      }
   }

   /* Dst.HorzStride must not be 0. */
   if (desc->ndst != 0 && !dst_is_null(devinfo, inst)) {
      ERROR_IF(brw_inst_dst_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0,
               "Destination Horizontal Stride must not be 0");
   }

   return error_msg;
}

/* Checks the SKL PRM's "Special Restrictions for Handling Mixed Mode
 * Float Operations".
 */
static struct string
special_restrictions_for_mixed_float_mode(const struct brw_isa_info *isa,
                                          const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   struct string error_msg = { .str = NULL, .len = 0 };

   const unsigned opcode = brw_inst_opcode(isa, inst);
   const unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   if (num_sources >= 3)
      return error_msg;

   if (!is_mixed_float(isa, inst))
      return error_msg;

   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
   bool is_align16 = brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16;

   enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
   enum brw_reg_type src1_type = num_sources > 1 ?
                                 brw_inst_src1_type(devinfo, inst) : 0;
   enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst);

   unsigned dst_stride = STRIDE(brw_inst_dst_hstride(devinfo, inst));
   bool dst_is_packed = is_packed(exec_size * dst_stride, exec_size, dst_stride);

   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
    * Float Operations:
    *
    *    "Indirect addressing on source is not supported when source and
    *     destination data types are mixed float."
    */
   ERROR_IF(brw_inst_src0_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT ||
            (num_sources > 1 &&
             brw_inst_src1_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT),
            "Indirect addressing on source is not supported when source and "
            "destination data types are mixed float");

   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
    * Float Operations:
    *
    *    "No SIMD16 in mixed mode when destination is f32. Instruction
    *     execution size must be no more than 8."
+ */ + ERROR_IF(exec_size > 8 && dst_type == BRW_REGISTER_TYPE_F, + "Mixed float mode with 32-bit float destination is limited " + "to SIMD8"); + + if (is_align16) { + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "In Align16 mode, when half float and float data types are mixed + * between source operands OR between source and destination operands, + * the register content are assumed to be packed." + * + * Since Align16 doesn't have a concept of horizontal stride (or width), + * it means that vertical stride must always be 4, since 0 and 2 would + * lead to replicated data, and any other value is disallowed in Align16. + */ + ERROR_IF(brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4, + "Align16 mixed float mode assumes packed data (vstride must be 4"); + + ERROR_IF(num_sources >= 2 && + brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4, + "Align16 mixed float mode assumes packed data (vstride must be 4"); + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "For Align16 mixed mode, both input and output packed f16 data + * must be oword aligned, no oword crossing in packed f16." + * + * The previous rule requires that Align16 operands are always packed, + * and since there is only one bit for Align16 subnr, which represents + * offsets 0B and 16B, this rule is always enforced and we don't need to + * validate it. + */ + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "No SIMD16 in mixed mode when destination is packed f16 for both + * Align1 and Align16." + * + * And: + * + * "In Align16 mode, when half float and float data types are mixed + * between source operands OR between source and destination operands, + * the register content are assumed to be packed." + * + * Which implies that SIMD16 is not available in Align16. 
This is further + * confirmed by: + * + * "For Align16 mixed mode, both input and output packed f16 data + * must be oword aligned, no oword crossing in packed f16" + * + * Since oword-aligned packed f16 data would cross oword boundaries when + * the execution size is larger than 8. + */ + ERROR_IF(exec_size > 8, "Align16 mixed float mode is limited to SIMD8"); + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "No accumulator read access for Align16 mixed float." + */ + ERROR_IF(inst_uses_src_acc(isa, inst), + "No accumulator read access for Align16 mixed float"); + } else { + assert(!is_align16); + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "No SIMD16 in mixed mode when destination is packed f16 for both + * Align1 and Align16." + */ + ERROR_IF(exec_size > 8 && dst_is_packed && + dst_type == BRW_REGISTER_TYPE_HF, + "Align1 mixed float mode is limited to SIMD8 when destination " + "is packed half-float"); + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "Math operations for mixed mode: + * - In Align1, f16 inputs need to be strided" + */ + if (opcode == BRW_OPCODE_MATH) { + if (src0_type == BRW_REGISTER_TYPE_HF) { + ERROR_IF(STRIDE(brw_inst_src0_hstride(devinfo, inst)) <= 1, + "Align1 mixed mode math needs strided half-float inputs"); + } + + if (num_sources >= 2 && src1_type == BRW_REGISTER_TYPE_HF) { + ERROR_IF(STRIDE(brw_inst_src1_hstride(devinfo, inst)) <= 1, + "Align1 mixed mode math needs strided half-float inputs"); + } + } + + if (dst_type == BRW_REGISTER_TYPE_HF && dst_stride == 1) { + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "In Align1, destination stride can be smaller than execution + * type. When destination is stride of 1, 16 bit packed data is + * updated on the destination. 
However, output packed f16 data + * must be oword aligned, no oword crossing in packed f16." + * + * The requirement of not crossing oword boundaries for 16-bit oword + * aligned data means that execution size is limited to 8. + */ + unsigned subreg; + if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) + subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst); + else + subreg = brw_inst_dst_ia_subreg_nr(devinfo, inst); + ERROR_IF(subreg % 16 != 0, + "Align1 mixed mode packed half-float output must be " + "oword aligned"); + ERROR_IF(exec_size > 8, + "Align1 mixed mode packed half-float output must not " + "cross oword boundaries (max exec size is 8)"); + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "When source is float or half float from accumulator register and + * destination is half float with a stride of 1, the source must + * register aligned. i.e., source must have offset zero." + * + * Align16 mixed float mode doesn't allow accumulator access on sources, + * so we only need to check this for Align1. + */ + if (src0_is_acc(devinfo, inst) && + (src0_type == BRW_REGISTER_TYPE_F || + src0_type == BRW_REGISTER_TYPE_HF)) { + ERROR_IF(brw_inst_src0_da1_subreg_nr(devinfo, inst) != 0, + "Mixed float mode requires register-aligned accumulator " + "source reads when destination is packed half-float"); + + } + + if (num_sources > 1 && + src1_is_acc(devinfo, inst) && + (src1_type == BRW_REGISTER_TYPE_F || + src1_type == BRW_REGISTER_TYPE_HF)) { + ERROR_IF(brw_inst_src1_da1_subreg_nr(devinfo, inst) != 0, + "Mixed float mode requires register-aligned accumulator " + "source reads when destination is packed half-float"); + } + } + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "No swizzle is allowed when an accumulator is used as an implicit + * source or an explicit source in an instruction. i.e. 
when + * destination is half float with an implicit accumulator source, + * destination stride needs to be 2." + * + * FIXME: it is not quite clear what the first sentence actually means + * or its link to the implication described after it, so we only + * validate the explicit implication, which is clearly described. + */ + if (dst_type == BRW_REGISTER_TYPE_HF && + inst_uses_src_acc(isa, inst)) { + ERROR_IF(dst_stride != 2, + "Mixed float mode with implicit/explicit accumulator " + "source and half-float destination requires a stride " + "of 2 on the destination"); + } + } + + return error_msg; +} + +/** + * Creates an \p access_mask for an \p exec_size, \p element_size, and a region + * + * An \p access_mask is a 32-element array of uint64_t, where each uint64_t is + * a bitmask of bytes accessed by the region. + * + * For instance the access mask of the source gX.1<4,2,2>F in an exec_size = 4 + * instruction would be + * + * access_mask[0] = 0x00000000000000F0 + * access_mask[1] = 0x000000000000F000 + * access_mask[2] = 0x0000000000F00000 + * access_mask[3] = 0x00000000F0000000 + * access_mask[4-31] = 0 + * + * because the first execution channel accesses bytes 7-4 and the second + * execution channel accesses bytes 15-12, etc. 
+ */ +static void +align1_access_mask(uint64_t access_mask[static 32], + unsigned exec_size, unsigned element_size, unsigned subreg, + unsigned vstride, unsigned width, unsigned hstride) +{ + const uint64_t mask = (1ULL << element_size) - 1; + unsigned rowbase = subreg; + unsigned element = 0; + + for (int y = 0; y < exec_size / width; y++) { + unsigned offset = rowbase; + + for (int x = 0; x < width; x++) { + access_mask[element++] = mask << (offset % 64); + offset += hstride * element_size; + } + + rowbase += vstride * element_size; + } + + assert(element == 0 || element == exec_size); +} + +/** + * Returns the number of registers accessed according to the \p access_mask + */ +static int +registers_read(const uint64_t access_mask[static 32]) +{ + int regs_read = 0; + + for (unsigned i = 0; i < 32; i++) { + if (access_mask[i] > 0xFFFFFFFF) { + return 2; + } else if (access_mask[i]) { + regs_read = 1; + } + } + + return regs_read; +} + +/** + * Checks restrictions listed in "Region Alignment Rules" in the "Register + * Region Restrictions" section. 
 */
static struct string
region_alignment_rules(const struct brw_isa_info *isa,
                       const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   const struct opcode_desc *desc =
      brw_opcode_desc(isa, brw_inst_opcode(isa, inst));
   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
   uint64_t dst_access_mask[32], src0_access_mask[32], src1_access_mask[32];
   struct string error_msg = { .str = NULL, .len = 0 };

   /* These rules only apply to Align1, non-send, non-3src instructions. */
   if (num_sources == 3)
      return (struct string){};

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16)
      return (struct string){};

   if (inst_is_send(isa, inst))
      return (struct string){};

   memset(dst_access_mask, 0, sizeof(dst_access_mask));
   memset(src0_access_mask, 0, sizeof(src0_access_mask));
   memset(src1_access_mask, 0, sizeof(src1_access_mask));

   for (unsigned i = 0; i < num_sources; i++) {
      unsigned vstride, width, hstride, element_size, subreg;
      enum brw_reg_type type;

      /* In Direct Addressing mode, a source cannot span more than 2 adjacent
       * GRF registers.
       */

/* DO_SRC(n): load src<n>'s region parameters and build its byte access mask;
 * 'continue' skips indirect and immediate sources, which are not checked.
 */
#define DO_SRC(n)                                                       \
      if (brw_inst_src ## n ## _address_mode(devinfo, inst) !=          \
          BRW_ADDRESS_DIRECT)                                           \
         continue;                                                      \
                                                                        \
      if (brw_inst_src ## n ## _reg_file(devinfo, inst) ==              \
          BRW_IMMEDIATE_VALUE)                                          \
         continue;                                                      \
                                                                        \
      vstride = STRIDE(brw_inst_src ## n ## _vstride(devinfo, inst));   \
      width = WIDTH(brw_inst_src ## n ## _width(devinfo, inst));        \
      hstride = STRIDE(brw_inst_src ## n ## _hstride(devinfo, inst));   \
      type = brw_inst_src ## n ## _type(devinfo, inst);                 \
      element_size = brw_reg_type_to_size(type);                        \
      subreg = brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst);      \
      align1_access_mask(src ## n ## _access_mask,                      \
                         exec_size, element_size, subreg,               \
                         vstride, width, hstride)

      if (i == 0) {
         DO_SRC(0);
      } else {
         DO_SRC(1);
      }
#undef DO_SRC

      /* Byte offset of the last element of the region; it must stay within
       * the two-register window.
       */
      unsigned num_vstride = exec_size / width;
      unsigned num_hstride = width;
      unsigned vstride_elements = (num_vstride - 1) * vstride;
      unsigned hstride_elements = (num_hstride - 1) * hstride;
      unsigned offset = (vstride_elements + hstride_elements) * element_size +
                        subreg;
      ERROR_IF(offset >= 64 * reg_unit(devinfo),
               "A source cannot span more than 2 adjacent GRF registers");
   }

   if (desc->ndst == 0 || dst_is_null(devinfo, inst))
      return error_msg;

   unsigned stride = STRIDE(brw_inst_dst_hstride(devinfo, inst));
   enum brw_reg_type dst_type = inst_dst_type(isa, inst);
   unsigned element_size = brw_reg_type_to_size(dst_type);
   unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
   unsigned offset = ((exec_size - 1) * stride * element_size) + subreg;
   ERROR_IF(offset >= 64 * reg_unit(devinfo),
            "A destination cannot span more than 2 adjacent GRF registers");

   /* Bail out before building access masks if a span is already known to be
    * out of bounds.
    */
   if (error_msg.str)
      return error_msg;

   /* On IVB/BYT, region parameters and execution size for DF are in terms of
    * 32-bit elements, so they are doubled. For evaluating the validity of an
    * instruction, we halve them.
    */
   if (devinfo->verx10 == 70 &&
       element_size == 8)
      element_size = 4;

   /* Synthesize a <exec_size*stride; exec_size; stride> region for the
    * destination (or a scalar region when exec_size == 1).
    */
   align1_access_mask(dst_access_mask, exec_size, element_size, subreg,
                      exec_size == 1 ? 0 : exec_size * stride,
                      exec_size == 1 ? 1 : exec_size,
                      exec_size == 1 ? 0 : stride);

   unsigned dst_regs = registers_read(dst_access_mask);
   unsigned src0_regs = registers_read(src0_access_mask);
   unsigned src1_regs = registers_read(src1_access_mask);

   /* The SNB, IVB, HSW, BDW, and CHV PRMs say:
    *
    *    When an instruction has a source region spanning two registers and a
    *    destination region contained in one register, the number of elements
    *    must be the same between two sources and one of the following must be
    *    true:
    *
    *       1. The destination region is entirely contained in the lower OWord
    *          of a register.
    *       2. The destination region is entirely contained in the upper OWord
    *          of a register.
    *       3. The destination elements are evenly split between the two OWords
    *          of a register.
    */
   if (devinfo->ver <= 8) {
      if (dst_regs == 1 && (src0_regs == 2 || src1_regs == 2)) {
         unsigned upper_oword_writes = 0, lower_oword_writes = 0;

         for (unsigned i = 0; i < exec_size; i++) {
            /* > 0x0000FFFF means a byte beyond the first OWord (16 bytes). */
            if (dst_access_mask[i] > 0x0000FFFF) {
               upper_oword_writes++;
            } else {
               assert(dst_access_mask[i] != 0);
               lower_oword_writes++;
            }
         }

         ERROR_IF(lower_oword_writes != 0 &&
                  upper_oword_writes != 0 &&
                  upper_oword_writes != lower_oword_writes,
                  "Writes must be to only one OWord or "
                  "evenly split between OWords");
      }
   }

   /* The IVB and HSW PRMs say:
    *
    *    When an instruction has a source region that spans two registers and
    *    the destination spans two registers, the destination elements must be
    *    evenly split between the two registers [...]
    *
    * The SNB PRM contains similar wording (but written in a much more
    * confusing manner).
    *
    * The BDW PRM says:
    *
    *    When destination spans two registers, the source may be one or two
    *    registers. The destination elements must be evenly split between the
    *    two registers.
    *
    * The SKL PRM says:
    *
    *    When destination of MATH instruction spans two registers, the
    *    destination elements must be evenly split between the two registers.
    *
    * It is not known whether this restriction applies to KBL other Gens after
    * SKL.
    */
   if (devinfo->ver <= 8 ||
       brw_inst_opcode(isa, inst) == BRW_OPCODE_MATH) {

      /* Nothing explicitly states that on Gen < 8 elements must be evenly
       * split between two destination registers in the two exceptional
       * source-region-spans-one-register cases, but since Broadwell requires
       * evenly split writes regardless of source region, we assume that it was
       * an oversight and require it.
       */
      if (dst_regs == 2) {
         unsigned upper_reg_writes = 0, lower_reg_writes = 0;

         for (unsigned i = 0; i < exec_size; i++) {
            /* > 0xFFFFFFFF means a byte in the second 32-byte register. */
            if (dst_access_mask[i] > 0xFFFFFFFF) {
               upper_reg_writes++;
            } else {
               assert(dst_access_mask[i] != 0);
               lower_reg_writes++;
            }
         }

         ERROR_IF(upper_reg_writes != lower_reg_writes,
                  "Writes must be evenly split between the two "
                  "destination registers");
      }
   }

   /* The IVB and HSW PRMs say:
    *
    *    When an instruction has a source region that spans two registers and
    *    the destination spans two registers, the destination elements must be
    *    evenly split between the two registers and each destination register
    *    must be entirely derived from one source register.
    *
    *    Note: In such cases, the regioning parameters must ensure that the
    *    offset from the two source registers is the same.
    *
    * The SNB PRM contains similar wording (but written in a much more
    * confusing manner).
    *
    * There are effectively three rules stated here:
    *
    *    For an instruction with a source and a destination spanning two
    *    registers,
    *
    *       (1) destination elements must be evenly split between the two
    *           registers
    *       (2) all destination elements in a register must be derived
    *           from one source register
    *       (3) the offset (i.e. the starting location in each of the two
    *           registers spanned by a region) must be the same in the two
    *           registers spanned by a region
    *
    * It is impossible to violate rule (1) without violating (2) or (3), so we
    * do not attempt to validate it.
    */
   if (devinfo->ver <= 7 && dst_regs == 2) {
      for (unsigned i = 0; i < num_sources; i++) {
/* DO_SRC(n): for a two-register src<n>, verify rules (2) and (3) above;
 * offset_1 is the byte offset of the first element read from the second
 * register, recovered from the access mask with __builtin_ctzll.
 */
#define DO_SRC(n)                                                             \
         if (src ## n ## _regs <= 1)                                          \
            continue;                                                         \
                                                                              \
         for (unsigned i = 0; i < exec_size; i++) {                           \
            if ((dst_access_mask[i] > 0xFFFFFFFF) !=                          \
                (src ## n ## _access_mask[i] > 0xFFFFFFFF)) {                 \
               ERROR("Each destination register must be entirely derived "    \
                     "from one source register");                             \
               break;                                                         \
            }                                                                 \
         }                                                                    \
                                                                              \
         unsigned offset_0 =                                                  \
            brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst);               \
         unsigned offset_1 = offset_0;                                        \
                                                                              \
         for (unsigned i = 0; i < exec_size; i++) {                           \
            if (src ## n ## _access_mask[i] > 0xFFFFFFFF) {                   \
               offset_1 = __builtin_ctzll(src ## n ## _access_mask[i]) - 32;  \
               break;                                                         \
            }                                                                 \
         }                                                                    \
                                                                              \
         ERROR_IF(num_sources == 2 && offset_0 != offset_1,                   \
                  "The offset from the two source registers "                 \
                  "must be the same")

         if (i == 0) {
            DO_SRC(0);
         } else {
            DO_SRC(1);
         }
#undef DO_SRC
      }
   }

   /* The IVB and HSW PRMs say:
    *
    *    When destination spans two registers, the source MUST span two
    *    registers. The exception to the above rule:
    *        1. When source is scalar, the source registers are not
    *           incremented.
    *        2. When source is packed integer Word and destination is packed
    *           integer DWord, the source register is not incremented by the
    *           source sub register is incremented.
    *
    * The SNB PRM does not contain this rule, but the internal documentation
    * indicates that it applies to SNB as well. We assume that the rule applies
    * to Gen <= 5 although their PRMs do not state it.
    *
    * While the documentation explicitly says in exception (2) that the
    * destination must be an integer DWord, the hardware allows at least a
    * float destination type as well. We emit such instructions from
    *
    *    fs_visitor::emit_interpolation_setup_gfx6
    *    fs_visitor::emit_fragcoord_interpolation
    *
    * and have for years with no ill effects.
    *
    * Additionally the simulator source code indicates that the real condition
    * is that the size of the destination type is 4 bytes.
    *
    * HSW PRMs also add a note to the second exception:
    *    "When lower 8 channels are disabled, the sub register of source1
    *     operand is not incremented. If the lower 8 channels are expected
    *     to be disabled, say by predication, the instruction must be split
    *     into pair of simd8 operations."
    *
    * We can't reliably know if the channels won't be disabled due to,
    * for example, IMASK. So, play it safe and disallow packed-word exception
    * for src1.
    */
   if (devinfo->ver <= 7 && dst_regs == 2) {
      enum brw_reg_type dst_type = inst_dst_type(isa, inst);
      bool dst_is_packed_dword =
         is_packed(exec_size * stride, exec_size, stride) &&
         brw_reg_type_to_size(dst_type) == 4;

      for (unsigned i = 0; i < num_sources; i++) {
/* DO_SRC(n): enforce the two-register-source rule, allowing the scalar and
 * (src0-only, per the HSW note above) packed-word-to-packed-dword exceptions.
 */
#define DO_SRC(n)                                                             \
         unsigned vstride, width, hstride;                                    \
         vstride = STRIDE(brw_inst_src ## n ## _vstride(devinfo, inst));      \
         width = WIDTH(brw_inst_src ## n ## _width(devinfo, inst));           \
         hstride = STRIDE(brw_inst_src ## n ## _hstride(devinfo, inst));      \
         bool src ## n ## _is_packed_word =                                   \
            n != 1 && is_packed(vstride, width, hstride) &&                   \
            (brw_inst_src ## n ## _type(devinfo, inst) == BRW_REGISTER_TYPE_W || \
             brw_inst_src ## n ## _type(devinfo, inst) == BRW_REGISTER_TYPE_UW); \
                                                                              \
         ERROR_IF(src ## n ## _regs == 1 &&                                   \
                  !src ## n ## _has_scalar_region(devinfo, inst) &&           \
                  !(dst_is_packed_dword && src ## n ## _is_packed_word),      \
                  "When the destination spans two registers, the source must " \
                  "span two registers\n" ERROR_INDENT "(exceptions for scalar " \
                  "sources, and packed-word to packed-dword expansion for src0)")

         if (i == 0) {
            DO_SRC(0);
         } else {
            DO_SRC(1);
         }
#undef DO_SRC
      }
   }

   return error_msg;
}

/* Validates the restrictions on immediate vector operands (V/UV/VF): the
 * immediate is always the last source of the instruction.
 */
static struct string
vector_immediate_restrictions(const struct brw_isa_info *isa,
                              const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   struct string error_msg = { .str = NULL, .len = 0 };

   if (num_sources == 3 || num_sources == 0 ||
       (devinfo->ver >= 12 && inst_is_send(isa, inst)))
      return (struct string){};

   /* An immediate can only ever be the last source. */
   unsigned file = num_sources == 1 ?
                   brw_inst_src0_reg_file(devinfo, inst) :
                   brw_inst_src1_reg_file(devinfo, inst);
   if (file != BRW_IMMEDIATE_VALUE)
      return (struct string){};

   enum brw_reg_type dst_type = inst_dst_type(isa, inst);
   unsigned dst_type_size = brw_reg_type_to_size(dst_type);
   unsigned dst_subreg = brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1 ?
                         brw_inst_dst_da1_subreg_nr(devinfo, inst) : 0;
   unsigned dst_stride = STRIDE(brw_inst_dst_hstride(devinfo, inst));
   enum brw_reg_type type = num_sources == 1 ?
                            brw_inst_src0_type(devinfo, inst) :
                            brw_inst_src1_type(devinfo, inst);

   /* The PRMs say:
    *
    *    When an immediate vector is used in an instruction, the destination
    *    must be 128-bit aligned with destination horizontal stride equivalent
    *    to a word for an immediate integer vector (v) and equivalent to a
    *    DWord for an immediate float vector (vf).
    *
    * The text has not been updated for the addition of the immediate unsigned
    * integer vector type (uv) on SNB, but presumably the same restriction
    * applies.
    */
   switch (type) {
   case BRW_REGISTER_TYPE_V:
   case BRW_REGISTER_TYPE_UV:
   case BRW_REGISTER_TYPE_VF:
      ERROR_IF(dst_subreg % (128 / 8) != 0,
               "Destination must be 128-bit aligned in order to use immediate "
               "vector types");

      if (type == BRW_REGISTER_TYPE_VF) {
         ERROR_IF(dst_type_size * dst_stride != 4,
                  "Destination must have stride equivalent to dword in order "
                  "to use the VF type");
      } else {
         ERROR_IF(dst_type_size * dst_stride != 2,
                  "Destination must have stride equivalent to word in order "
                  "to use the V or UV type");
      }
      break;
   default:
      break;
   }

   return error_msg;
}

/* Validates hardware restrictions that apply when either operand is 64-bit
 * ("double precision" here covers DF/Q/UQ data as well as integer DWord
 * multiplies, which use the 64-bit datapath).
 */
static struct string
special_requirements_for_handling_double_precision_data_types(
                                       const struct brw_isa_info *isa,
                                       const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   struct string error_msg = { .str = NULL, .len = 0 };

   if (num_sources == 3 || num_sources == 0)
      return (struct string){};

   /* Split sends don't have types so there's no doubles there. */
   if (inst_is_split_send(isa, inst))
      return (struct string){};

   enum brw_reg_type exec_type = execution_type(isa, inst);
   unsigned exec_type_size = brw_reg_type_to_size(exec_type);

   enum brw_reg_file dst_file = brw_inst_dst_reg_file(devinfo, inst);
   enum brw_reg_type dst_type = inst_dst_type(isa, inst);
   unsigned dst_type_size = brw_reg_type_to_size(dst_type);
   unsigned dst_hstride = STRIDE(brw_inst_dst_hstride(devinfo, inst));
   unsigned dst_reg = brw_inst_dst_da_reg_nr(devinfo, inst);
   unsigned dst_subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
   unsigned dst_address_mode = brw_inst_dst_address_mode(devinfo, inst);

   /* DWord x DWord integer MUL uses the 64-bit pipeline on Gfx8+. */
   bool is_integer_dword_multiply =
      devinfo->ver >= 8 &&
      brw_inst_opcode(isa, inst) == BRW_OPCODE_MUL &&
      (brw_inst_src0_type(devinfo, inst) == BRW_REGISTER_TYPE_D ||
       brw_inst_src0_type(devinfo, inst) == BRW_REGISTER_TYPE_UD) &&
      (brw_inst_src1_type(devinfo, inst) == BRW_REGISTER_TYPE_D ||
       brw_inst_src1_type(devinfo, inst) == BRW_REGISTER_TYPE_UD);

   const bool is_double_precision =
      dst_type_size == 8 || exec_type_size == 8 || is_integer_dword_multiply;

   for (unsigned i = 0; i < num_sources; i++) {
      unsigned vstride, width, hstride, type_size, reg, subreg, address_mode;
      bool is_scalar_region;
      enum brw_reg_file file;
      enum brw_reg_type type;

/* DO_SRC(n): load the operand description of src<n> into the locals above;
 * 'continue' skips immediate sources.
 */
#define DO_SRC(n)                                                              \
      if (brw_inst_src ## n ## _reg_file(devinfo, inst) ==                     \
          BRW_IMMEDIATE_VALUE)                                                 \
         continue;                                                             \
                                                                               \
      is_scalar_region = src ## n ## _has_scalar_region(devinfo, inst);        \
      vstride = STRIDE(brw_inst_src ## n ## _vstride(devinfo, inst));          \
      width = WIDTH(brw_inst_src ## n ## _width(devinfo, inst));               \
      hstride = STRIDE(brw_inst_src ## n ## _hstride(devinfo, inst));          \
      file = brw_inst_src ## n ## _reg_file(devinfo, inst);                    \
      type = brw_inst_src ## n ## _type(devinfo, inst);                        \
      type_size = brw_reg_type_to_size(type);                                  \
      reg = brw_inst_src ## n ## _da_reg_nr(devinfo, inst);                    \
      subreg = brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst);             \
      address_mode = brw_inst_src ## n ## _address_mode(devinfo, inst)

      if (i == 0) {
         DO_SRC(0);
      } else {
         DO_SRC(1);
      }
#undef DO_SRC

      const unsigned src_stride = (hstride ? hstride : vstride) * type_size;
      const unsigned dst_stride = dst_hstride * dst_type_size;

      /* The PRMs say that for CHV, BXT:
       *
       *    When source or destination datatype is 64b or operation is integer
       *    DWord multiply, regioning in Align1 must follow these rules:
       *
       *    1. Source and Destination horizontal stride must be aligned to the
       *       same qword.
       *    2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
       *    3. Source and Destination offset must be the same, except the case
       *       of scalar source.
       *
       * We assume that the restriction applies to GLK as well.
       */
      if (is_double_precision &&
          brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1 &&
          (devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo))) {
         ERROR_IF(!is_scalar_region &&
                  (src_stride % 8 != 0 ||
                   dst_stride % 8 != 0 ||
                   src_stride != dst_stride),
                  "Source and destination horizontal stride must equal and a "
                  "multiple of a qword when the execution type is 64-bit");

         ERROR_IF(vstride != width * hstride,
                  "Vstride must be Width * Hstride when the execution type is "
                  "64-bit");

         ERROR_IF(!is_scalar_region && dst_subreg != subreg,
                  "Source and destination offset must be the same when the "
                  "execution type is 64-bit");
      }

      /* The PRMs say that for CHV, BXT:
       *
       *    When source or destination datatype is 64b or operation is integer
       *    DWord multiply, indirect addressing must not be used.
       *
       * We assume that the restriction applies to GLK as well.
       */
      if (is_double_precision &&
          (devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo))) {
         ERROR_IF(BRW_ADDRESS_REGISTER_INDIRECT_REGISTER == address_mode ||
                  BRW_ADDRESS_REGISTER_INDIRECT_REGISTER == dst_address_mode,
                  "Indirect addressing is not allowed when the execution type "
                  "is 64-bit");
      }

      /* The PRMs say that for CHV, BXT:
       *
       *    ARF registers must never be used with 64b datatype or when
       *    operation is integer DWord multiply.
       *
       * We assume that the restriction applies to GLK as well.
       *
       * We assume that the restriction does not apply to the null register.
       */
      if (is_double_precision &&
          (devinfo->platform == INTEL_PLATFORM_CHV ||
           intel_device_info_is_9lp(devinfo))) {
         /* MAC and AccWrEn imply an implicit accumulator (ARF) operand. */
         ERROR_IF(brw_inst_opcode(isa, inst) == BRW_OPCODE_MAC ||
                  brw_inst_acc_wr_control(devinfo, inst) ||
                  (BRW_ARCHITECTURE_REGISTER_FILE == file &&
                   reg != BRW_ARF_NULL) ||
                  (BRW_ARCHITECTURE_REGISTER_FILE == dst_file &&
                   dst_reg != BRW_ARF_NULL),
                  "Architecture registers cannot be used when the execution "
                  "type is 64-bit");
      }

      /* From the hardware spec section "Register Region Restrictions":
       *
       * There are two rules:
       *
       * "In case of all floating point data types used in destination:" and
       *
       * "In case where source or destination datatype is 64b or operation is
       *  integer DWord multiply:"
       *
       * both of which list the same restrictions:
       *
       *  "1. Register Regioning patterns where register data bit location
       *      of the LSB of the channels are changed between source and
       *      destination are not supported on Src0 and Src1 except for
       *      broadcast of a scalar.
       *
       *   2. Explicit ARF registers except null and accumulator must not be
       *      used."
       */
      if (devinfo->verx10 >= 125 &&
          (brw_reg_type_is_floating_point(dst_type) ||
           is_double_precision)) {
         ERROR_IF(!is_scalar_region &&
                  BRW_ADDRESS_REGISTER_INDIRECT_REGISTER != address_mode &&
                  (!is_linear(vstride, width, hstride) ||
                   src_stride != dst_stride ||
                   subreg != dst_subreg),
                  "Register Regioning patterns where register data bit "
                  "location of the LSB of the channels are changed between "
                  "source and destination are not supported except for "
                  "broadcast of a scalar.");

         ERROR_IF((address_mode == BRW_ADDRESS_DIRECT && file == BRW_ARCHITECTURE_REGISTER_FILE &&
                   reg != BRW_ARF_NULL && !(reg >= BRW_ARF_ACCUMULATOR && reg < BRW_ARF_FLAG)) ||
                  (dst_file == BRW_ARCHITECTURE_REGISTER_FILE &&
                   dst_reg != BRW_ARF_NULL && dst_reg != BRW_ARF_ACCUMULATOR),
                  "Explicit ARF registers except null and accumulator must not "
                  "be used.");
      }

      /* From the hardware spec section "Register Region Restrictions":
       *
       * "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float and
       *  Quad-Word data must not be used."
       */
      if (devinfo->verx10 >= 125 &&
          (brw_reg_type_is_floating_point(type) || type_sz(type) == 8)) {
         ERROR_IF(address_mode == BRW_ADDRESS_REGISTER_INDIRECT_REGISTER &&
                  vstride == BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL,
                  "Vx1 and VxH indirect addressing for Float, Half-Float, "
                  "Double-Float and Quad-Word data must not be used");
      }
   }

   /* The PRMs say that for BDW, SKL:
    *
    *    If Align16 is required for an operation with QW destination and non-QW
    *    source datatypes, the execution size cannot exceed 2.
    *
    * We assume that the restriction applies to all Gfx8+ parts.
    */
   if (is_double_precision && devinfo->ver >= 8) {
      enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
      enum brw_reg_type src1_type =
         num_sources > 1 ? brw_inst_src1_type(devinfo, inst) : src0_type;
      unsigned src0_type_size = brw_reg_type_to_size(src0_type);
      unsigned src1_type_size = brw_reg_type_to_size(src1_type);

      ERROR_IF(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16 &&
               dst_type_size == 8 &&
               (src0_type_size != 8 || src1_type_size != 8) &&
               brw_inst_exec_size(devinfo, inst) > BRW_EXECUTE_2,
               "In Align16 exec size cannot exceed 2 with a QWord destination "
               "and a non-QWord source");
   }

   /* The PRMs say that for CHV, BXT:
    *
    *    When source or destination datatype is 64b or operation is integer
    *    DWord multiply, DepCtrl must not be used.
    *
    * We assume that the restriction applies to GLK as well.
    */
   if (is_double_precision &&
       (devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo))) {
      ERROR_IF(brw_inst_no_dd_check(devinfo, inst) ||
               brw_inst_no_dd_clear(devinfo, inst),
               "DepCtrl is not allowed when the execution type is 64-bit");
   }

   return error_msg;
}

static struct string
instruction_restrictions(const struct brw_isa_info *isa,
                         const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   struct string error_msg = { .str = NULL, .len = 0 };

   /* From Wa_1604601757:
    *
    * "When multiplying a DW and any lower precision integer, source modifier
    *  is not supported."
+ */ + if (devinfo->ver >= 12 && + brw_inst_opcode(isa, inst) == BRW_OPCODE_MUL) { + enum brw_reg_type exec_type = execution_type(isa, inst); + const bool src0_valid = type_sz(brw_inst_src0_type(devinfo, inst)) == 4 || + brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE || + !(brw_inst_src0_negate(devinfo, inst) || + brw_inst_src0_abs(devinfo, inst)); + const bool src1_valid = type_sz(brw_inst_src1_type(devinfo, inst)) == 4 || + brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE || + !(brw_inst_src1_negate(devinfo, inst) || + brw_inst_src1_abs(devinfo, inst)); + + ERROR_IF(!brw_reg_type_is_floating_point(exec_type) && + type_sz(exec_type) == 4 && !(src0_valid && src1_valid), + "When multiplying a DW and any lower precision integer, source " + "modifier is not supported."); + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_CMP || + brw_inst_opcode(isa, inst) == BRW_OPCODE_CMPN) { + if (devinfo->ver <= 7) { + /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit + * ISA) says: + * + * Accumulator cannot be destination, implicit or explicit. The + * destination must be a general register or the null register. + * + * Page 77 of the Haswell PRM Volume 2b contains the same text. The + * 965G PRMs contain similar text. + * + * Page 864 (page 880 of the PDF) of the Broadwell PRM Volume 7 says: + * + * For the cmp and cmpn instructions, remove the accumulator + * restrictions. + */ + ERROR_IF(brw_inst_dst_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE && + brw_inst_dst_da_reg_nr(devinfo, inst) != BRW_ARF_NULL, + "Accumulator cannot be destination, implicit or explicit."); + } + + /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA) + * says: + * + * If the destination is the null register, the {Switch} instruction + * option must be used. + * + * Page 77 of the Haswell PRM Volume 2b contains the same text. 
+ */ + if (devinfo->ver == 7) { + ERROR_IF(dst_is_null(devinfo, inst) && + brw_inst_thread_control(devinfo, inst) != BRW_THREAD_SWITCH, + "If the destination is the null register, the {Switch} " + "instruction option must be used."); + } + + ERROR_IF(brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE, + "CMP (or CMPN) must have a condition."); + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_SEL) { + if (devinfo->ver < 6) { + ERROR_IF(brw_inst_cond_modifier(devinfo, inst) != BRW_CONDITIONAL_NONE, + "SEL must not have a condition modifier"); + ERROR_IF(brw_inst_pred_control(devinfo, inst) == BRW_PREDICATE_NONE, + "SEL must be predicated"); + } else { + ERROR_IF((brw_inst_cond_modifier(devinfo, inst) != BRW_CONDITIONAL_NONE) == + (brw_inst_pred_control(devinfo, inst) != BRW_PREDICATE_NONE), + "SEL must either be predicated or have a condition modifiers"); + } + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_MUL) { + const enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst); + const enum brw_reg_type src1_type = brw_inst_src1_type(devinfo, inst); + const enum brw_reg_type dst_type = inst_dst_type(isa, inst); + + if (devinfo->ver == 6) { + /* Page 223 of the Sandybridge PRM volume 4 part 2 says: + * + * [DevSNB]: When multiple (sic) a DW and a W, the W has to be on + * src0, and the DW has to be on src1. + * + * This text appears only in the Sandybridge PRMw. + */ + ERROR_IF(brw_reg_type_is_integer(src0_type) && + type_sz(src0_type) == 4 && type_sz(src1_type) < 4, + "When multiplying a DW and any lower precision integer, the " + "DW operand must be src1."); + } else if (devinfo->ver >= 7) { + /* Page 966 (page 982 of the PDF) of Broadwell PRM volume 2a says: + * + * When multiplying a DW and any lower precision integer, the DW + * operand must on src0. + * + * Ivy Bridge, Haswell, Skylake, and Ice Lake PRMs contain the same + * text. 
+ */ + ERROR_IF(brw_reg_type_is_integer(src1_type) && + type_sz(src0_type) < 4 && type_sz(src1_type) == 4, + "When multiplying a DW and any lower precision integer, the " + "DW operand must be src0."); + } + + if (devinfo->ver <= 7) { + /* Section 14.2.28 of Intel 965 Express Chipset PRM volume 4 says: + * + * Source operands cannot be an accumulator register. + * + * Iron Lake, Sandybridge, and Ivy Bridge PRMs have the same text. + * Haswell does not. Given that later PRMs have different + * restrictions on accumulator sources (see below), it seems most + * likely that Haswell shares the Ivy Bridge restriction. + */ + ERROR_IF(src0_is_acc(devinfo, inst) || src1_is_acc(devinfo, inst), + "Source operands cannot be an accumulator register."); + } else { + /* Page 971 (page 987 of the PDF), section "Accumulator + * Restrictions," of the Broadwell PRM volume 7 says: + * + * Integer source operands cannot be accumulators. + * + * The Skylake and Ice Lake PRMs contain the same text. + */ + ERROR_IF((src0_is_acc(devinfo, inst) && + brw_reg_type_is_integer(src0_type)) || + (src1_is_acc(devinfo, inst) && + brw_reg_type_is_integer(src1_type)), + "Integer source operands cannot be accumulators."); + } + + if (devinfo->ver <= 6) { + /* Page 223 of the Sandybridge PRM volume 4 part 2 says: + * + * Dword integer source is not allowed for this instruction in + * float execution mode. In other words, if one source is of type + * float (:f, :vf), the other source cannot be of type dword + * integer (:ud or :d). + * + * G965 and Iron Lake PRMs have similar text. Later GPUs do not + * allow mixed source types at all, but that restriction should be + * handled elsewhere. 
+ */ + ERROR_IF(execution_type(isa, inst) == BRW_REGISTER_TYPE_F && + (src0_type == BRW_REGISTER_TYPE_UD || + src0_type == BRW_REGISTER_TYPE_D || + src1_type == BRW_REGISTER_TYPE_UD || + src1_type == BRW_REGISTER_TYPE_D), + "Dword integer source is not allowed for this instruction in" + "float execution mode."); + } + + if (devinfo->ver <= 7) { + /* Page 118 of the Haswell PRM volume 2b says: + * + * When operating on integers with at least one of the source + * being a DWord type (signed or unsigned), the destination cannot + * be floating-point (implementation note: the data converter only + * looks at the low 34 bits of the result). + * + * G965, Iron Lake, Sandybridge, and Ivy Bridge have similar text. + * Later GPUs do not allow mixed source and destination types at all, + * but that restriction should be handled elsewhere. + */ + ERROR_IF(dst_type == BRW_REGISTER_TYPE_F && + (src0_type == BRW_REGISTER_TYPE_UD || + src0_type == BRW_REGISTER_TYPE_D || + src1_type == BRW_REGISTER_TYPE_UD || + src1_type == BRW_REGISTER_TYPE_D), + "Float destination type not allowed with DWord source type."); + } + + if (devinfo->ver == 8) { + /* Page 966 (page 982 of the PDF) of the Broadwell PRM volume 2a + * says: + * + * When multiplying DW x DW, the dst cannot be accumulator. + * + * This text also appears in the Cherry Trail / Braswell PRM, but it + * does not appear in any other PRM. + */ + ERROR_IF((src0_type == BRW_REGISTER_TYPE_UD || + src0_type == BRW_REGISTER_TYPE_D) && + (src1_type == BRW_REGISTER_TYPE_UD || + src1_type == BRW_REGISTER_TYPE_D) && + brw_inst_dst_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE && + brw_inst_dst_da_reg_nr(devinfo, inst) != BRW_ARF_NULL, + "When multiplying DW x DW, the dst cannot be accumulator."); + } + + /* Page 935 (page 951 of the PDF) of the Ice Lake PRM volume 2a says: + * + * When multiplying integer data types, if one of the sources is a + * DW, the resulting full precision data is stored in the + * accumulator. 
However, if the destination data type is either W or + * DW, the low bits of the result are written to the destination + * register and the remaining high bits are discarded. This results + * in undefined Overflow and Sign flags. Therefore, conditional + * modifiers and saturation (.sat) cannot be used in this case. + * + * Similar text appears in every version of the PRM. + * + * The wording of the last sentence is not very clear. It could either + * be interpreted as "conditional modifiers combined with saturation + * cannot be used" or "neither conditional modifiers nor saturation can + * be used." I have interpreted it as the latter primarily because that + * is the more restrictive interpretation. + */ + ERROR_IF((src0_type == BRW_REGISTER_TYPE_UD || + src0_type == BRW_REGISTER_TYPE_D || + src1_type == BRW_REGISTER_TYPE_UD || + src1_type == BRW_REGISTER_TYPE_D) && + (dst_type == BRW_REGISTER_TYPE_UD || + dst_type == BRW_REGISTER_TYPE_D || + dst_type == BRW_REGISTER_TYPE_UW || + dst_type == BRW_REGISTER_TYPE_W) && + (brw_inst_saturate(devinfo, inst) != 0 || + brw_inst_cond_modifier(devinfo, inst) != BRW_CONDITIONAL_NONE), + "Neither Saturate nor conditional modifier allowed with DW " + "integer multiply."); + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_MATH) { + unsigned math_function = brw_inst_math_function(devinfo, inst); + switch (math_function) { + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT: + case BRW_MATH_FUNCTION_INT_DIV_REMAINDER: { + /* Page 442 of the Broadwell PRM Volume 2a "Extended Math Function" says: + * INT DIV function does not support source modifiers. + * Bspec 6647 extends it back to Ivy Bridge. 
+ */ + bool src0_valid = !brw_inst_src0_negate(devinfo, inst) && + !brw_inst_src0_abs(devinfo, inst); + bool src1_valid = !brw_inst_src1_negate(devinfo, inst) && + !brw_inst_src1_abs(devinfo, inst); + ERROR_IF(!src0_valid || !src1_valid, + "INT DIV function does not support source modifiers."); + break; + } + default: + break; + } + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_DP4A) { + /* Page 396 (page 412 of the PDF) of the DG1 PRM volume 2a says: + * + * Only one of src0 or src1 operand may be an the (sic) accumulator + * register (acc#). + */ + ERROR_IF(src0_is_acc(devinfo, inst) && src1_is_acc(devinfo, inst), + "Only one of src0 or src1 operand may be an accumulator " + "register (acc#)."); + + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_ADD3) { + const enum brw_reg_type dst_type = inst_dst_type(isa, inst); + + ERROR_IF(dst_type != BRW_REGISTER_TYPE_D && + dst_type != BRW_REGISTER_TYPE_UD && + dst_type != BRW_REGISTER_TYPE_W && + dst_type != BRW_REGISTER_TYPE_UW, + "Destination must be integer D, UD, W, or UW type."); + + for (unsigned i = 0; i < 3; i++) { + enum brw_reg_type src_type; + + switch (i) { + case 0: src_type = brw_inst_3src_a1_src0_type(devinfo, inst); break; + case 1: src_type = brw_inst_3src_a1_src1_type(devinfo, inst); break; + case 2: src_type = brw_inst_3src_a1_src2_type(devinfo, inst); break; + default: unreachable("invalid src"); + } + + ERROR_IF(src_type != BRW_REGISTER_TYPE_D && + src_type != BRW_REGISTER_TYPE_UD && + src_type != BRW_REGISTER_TYPE_W && + src_type != BRW_REGISTER_TYPE_UW, + "Source must be integer D, UD, W, or UW type."); + + if (i == 0) { + if (brw_inst_3src_a1_src0_is_imm(devinfo, inst)) { + ERROR_IF(src_type != BRW_REGISTER_TYPE_W && + src_type != BRW_REGISTER_TYPE_UW, + "Immediate source must be integer W or UW type."); + } + } else if (i == 2) { + if (brw_inst_3src_a1_src2_is_imm(devinfo, inst)) { + ERROR_IF(src_type != BRW_REGISTER_TYPE_W && + src_type != BRW_REGISTER_TYPE_UW, + "Immediate source must 
be integer W or UW type."); + } + } + } + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_OR || + brw_inst_opcode(isa, inst) == BRW_OPCODE_AND || + brw_inst_opcode(isa, inst) == BRW_OPCODE_XOR || + brw_inst_opcode(isa, inst) == BRW_OPCODE_NOT) { + if (devinfo->ver >= 8) { + /* While the behavior of the negate source modifier is defined as + * logical not, the behavior of abs source modifier is not + * defined. Disallow it to be safe. + */ + ERROR_IF(brw_inst_src0_abs(devinfo, inst), + "Behavior of abs source modifier in logic ops is undefined."); + ERROR_IF(brw_inst_opcode(isa, inst) != BRW_OPCODE_NOT && + brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE && + brw_inst_src1_abs(devinfo, inst), + "Behavior of abs source modifier in logic ops is undefined."); + + /* Page 479 (page 495 of the PDF) of the Broadwell PRM volume 2a says: + * + * Source modifier is not allowed if source is an accumulator. + * + * The same text also appears for OR, NOT, and XOR instructions. + */ + ERROR_IF((brw_inst_src0_abs(devinfo, inst) || + brw_inst_src0_negate(devinfo, inst)) && + src0_is_acc(devinfo, inst), + "Source modifier is not allowed if source is an accumulator."); + ERROR_IF(brw_num_sources_from_inst(isa, inst) > 1 && + (brw_inst_src1_abs(devinfo, inst) || + brw_inst_src1_negate(devinfo, inst)) && + src1_is_acc(devinfo, inst), + "Source modifier is not allowed if source is an accumulator."); + } + + /* Page 479 (page 495 of the PDF) of the Broadwell PRM volume 2a says: + * + * This operation does not produce sign or overflow conditions. Only + * the .e/.z or .ne/.nz conditional modifiers should be used. + * + * The same text also appears for OR, NOT, and XOR instructions. + * + * Per the comment around nir_op_imod in brw_fs_nir.cpp, we have + * determined this to not be true. The only conditions that seem + * absolutely sketchy are O, R, and U. Some OpenGL shaders from Doom + * 2016 have been observed to generate and.g and operate correctly. 
+ */ + const enum brw_conditional_mod cmod = + brw_inst_cond_modifier(devinfo, inst); + ERROR_IF(cmod == BRW_CONDITIONAL_O || + cmod == BRW_CONDITIONAL_R || + cmod == BRW_CONDITIONAL_U, + "O, R, and U conditional modifiers should not be used."); + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_BFI2) { + ERROR_IF(brw_inst_cond_modifier(devinfo, inst) != BRW_CONDITIONAL_NONE, + "BFI2 cannot have conditional modifier"); + + ERROR_IF(brw_inst_saturate(devinfo, inst), + "BFI2 cannot have saturate modifier"); + + enum brw_reg_type dst_type; + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) + dst_type = brw_inst_3src_a1_dst_type(devinfo, inst); + else + dst_type = brw_inst_3src_a16_dst_type(devinfo, inst); + + ERROR_IF(dst_type != BRW_REGISTER_TYPE_D && + dst_type != BRW_REGISTER_TYPE_UD, + "BFI2 destination type must be D or UD"); + + for (unsigned s = 0; s < 3; s++) { + enum brw_reg_type src_type; + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + switch (s) { + case 0: src_type = brw_inst_3src_a1_src0_type(devinfo, inst); break; + case 1: src_type = brw_inst_3src_a1_src1_type(devinfo, inst); break; + case 2: src_type = brw_inst_3src_a1_src2_type(devinfo, inst); break; + default: unreachable("invalid src"); + } + } else { + src_type = brw_inst_3src_a16_src_type(devinfo, inst); + } + + ERROR_IF(src_type != dst_type, + "BFI2 source type must match destination type"); + } + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_CSEL) { + ERROR_IF(brw_inst_pred_control(devinfo, inst) != BRW_PREDICATE_NONE, + "CSEL cannot be predicated"); + + /* CSEL is CMP and SEL fused into one. The condition modifier, which + * does not actually modify the flags, controls the built-in comparison. 
+ */ + ERROR_IF(brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE, + "CSEL must have a condition."); + + enum brw_reg_type dst_type; + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) + dst_type = brw_inst_3src_a1_dst_type(devinfo, inst); + else + dst_type = brw_inst_3src_a16_dst_type(devinfo, inst); + + if (devinfo->ver < 8) { + ERROR_IF(devinfo->ver < 8, "CSEL not supported before Gfx8"); + } else if (devinfo->ver <= 9) { + ERROR_IF(dst_type != BRW_REGISTER_TYPE_F, + "CSEL destination type must be F"); + } else { + ERROR_IF(dst_type != BRW_REGISTER_TYPE_F && + dst_type != BRW_REGISTER_TYPE_HF && + dst_type != BRW_REGISTER_TYPE_D && + dst_type != BRW_REGISTER_TYPE_W, + "CSEL destination type must be F, HF, D, or W"); + } + + for (unsigned s = 0; s < 3; s++) { + enum brw_reg_type src_type; + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + switch (s) { + case 0: src_type = brw_inst_3src_a1_src0_type(devinfo, inst); break; + case 1: src_type = brw_inst_3src_a1_src1_type(devinfo, inst); break; + case 2: src_type = brw_inst_3src_a1_src2_type(devinfo, inst); break; + default: unreachable("invalid src"); + } + } else { + src_type = brw_inst_3src_a16_src_type(devinfo, inst); + } + + ERROR_IF(src_type != dst_type, + "CSEL source type must match destination type"); + } + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_DPAS) { + ERROR_IF(brw_inst_dpas_3src_sdepth(devinfo, inst) != BRW_SYSTOLIC_DEPTH_8, + "Systolic depth must be 8."); + + const unsigned sdepth = 8; + + const enum brw_reg_type dst_type = + brw_inst_dpas_3src_dst_type(devinfo, inst); + const enum brw_reg_type src0_type = + brw_inst_dpas_3src_src0_type(devinfo, inst); + const enum brw_reg_type src1_type = + brw_inst_dpas_3src_src1_type(devinfo, inst); + const enum brw_reg_type src2_type = + brw_inst_dpas_3src_src2_type(devinfo, inst); + + const enum gfx12_sub_byte_precision src1_sub_byte = + brw_inst_dpas_3src_src1_subbyte(devinfo, inst); + + if (src1_type != 
BRW_REGISTER_TYPE_B && src1_type != BRW_REGISTER_TYPE_UB) { + ERROR_IF(src1_sub_byte != BRW_SUB_BYTE_PRECISION_NONE, + "Sub-byte precision must be None for source type larger than Byte."); + } else { + ERROR_IF(src1_sub_byte != BRW_SUB_BYTE_PRECISION_NONE && + src1_sub_byte != BRW_SUB_BYTE_PRECISION_4BIT && + src1_sub_byte != BRW_SUB_BYTE_PRECISION_2BIT, + "Invalid sub-byte precision."); + } + + const enum gfx12_sub_byte_precision src2_sub_byte = + brw_inst_dpas_3src_src2_subbyte(devinfo, inst); + + if (src2_type != BRW_REGISTER_TYPE_B && src2_type != BRW_REGISTER_TYPE_UB) { + ERROR_IF(src2_sub_byte != BRW_SUB_BYTE_PRECISION_NONE, + "Sub-byte precision must be None."); + } else { + ERROR_IF(src2_sub_byte != BRW_SUB_BYTE_PRECISION_NONE && + src2_sub_byte != BRW_SUB_BYTE_PRECISION_4BIT && + src2_sub_byte != BRW_SUB_BYTE_PRECISION_2BIT, + "Invalid sub-byte precision."); + } + + const unsigned src1_bits_per_element = + (8 * brw_reg_type_to_size(src1_type)) >> + brw_inst_dpas_3src_src1_subbyte(devinfo, inst); + + const unsigned src2_bits_per_element = + (8 * brw_reg_type_to_size(src2_type)) >> + brw_inst_dpas_3src_src2_subbyte(devinfo, inst); + + /* The MAX2(1, ...) is just to prevent possible division by 0 later. */ + const unsigned ops_per_chan = + MAX2(1, 32 / MAX2(src1_bits_per_element, src2_bits_per_element)); + + ERROR_IF(brw_inst_exec_size(devinfo, inst) != BRW_EXECUTE_8, + "DPAS execution size must be 8."); + + const unsigned exec_size = 8; + + const unsigned dst_subnr = brw_inst_dpas_3src_dst_subreg_nr(devinfo, inst); + const unsigned src0_subnr = brw_inst_dpas_3src_src0_subreg_nr(devinfo, inst); + const unsigned src1_subnr = brw_inst_dpas_3src_src1_subreg_nr(devinfo, inst); + const unsigned src2_subnr = brw_inst_dpas_3src_src2_subreg_nr(devinfo, inst); + + /* Until HF is supported as dst type, this is effectively subnr == 0. 
*/ + ERROR_IF(dst_subnr % exec_size != 0, + "Destination subregister offset must be a multiple of ExecSize."); + + /* Until HF is supported as src0 type, this is effectively subnr == 0. */ + ERROR_IF(src0_subnr % exec_size != 0, + "Src0 subregister offset must be a multiple of ExecSize."); + + ERROR_IF(src1_subnr != 0, + "Src1 subregister offsets must be 0."); + + /* In nearly all cases, this effectively requires that src2.subnr be + * 0. It is only when src1 is 8 bits and src2 is 2 or 4 bits that the + * ops_per_chan value can allow non-zero src2.subnr. + */ + ERROR_IF(src2_subnr % (sdepth * ops_per_chan) != 0, + "Src2 subregister offset must be a multiple of SystolicDepth " + "times OPS_PER_CHAN."); + + ERROR_IF(dst_subnr * type_sz(dst_type) >= REG_SIZE, + "Destination subregister specifies next register."); + + ERROR_IF(src0_subnr * type_sz(src0_type) >= REG_SIZE, + "Src0 subregister specifies next register."); + + ERROR_IF((src1_subnr * type_sz(src1_type) * src1_bits_per_element) / 8 >= REG_SIZE, + "Src1 subregister specifies next register."); + + ERROR_IF((src2_subnr * type_sz(src2_type) * src2_bits_per_element) / 8 >= REG_SIZE, + "Src2 subregister specifies next register."); + + if (brw_inst_3src_atomic_control(devinfo, inst)) { + /* FINISHME: When we start emitting DPAS with Atomic set, figure out + * a way to validate it. Also add a test in test_eu_validate.cpp. 
+ */ + ERROR_IF(true, + "When instruction option Atomic is used it must be follwed by a " + "DPAS instruction."); + } + + if (brw_inst_dpas_3src_exec_type(devinfo, inst) == + BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT) { + ERROR_IF(dst_type != BRW_REGISTER_TYPE_F, + "DPAS destination type must be F."); + ERROR_IF(src0_type != BRW_REGISTER_TYPE_F, + "DPAS src0 type must be F."); + ERROR_IF(src1_type != BRW_REGISTER_TYPE_HF, + "DPAS src1 type must be HF."); + ERROR_IF(src2_type != BRW_REGISTER_TYPE_HF, + "DPAS src2 type must be HF."); + } else { + ERROR_IF(dst_type != BRW_REGISTER_TYPE_D && + dst_type != BRW_REGISTER_TYPE_UD, + "DPAS destination type must be D or UD."); + ERROR_IF(src0_type != BRW_REGISTER_TYPE_D && + src0_type != BRW_REGISTER_TYPE_UD, + "DPAS src0 type must be D or UD."); + ERROR_IF(src1_type != BRW_REGISTER_TYPE_B && + src1_type != BRW_REGISTER_TYPE_UB, + "DPAS src1 base type must be B or UB."); + ERROR_IF(src2_type != BRW_REGISTER_TYPE_B && + src2_type != BRW_REGISTER_TYPE_UB, + "DPAS src2 base type must be B or UB."); + + if (brw_reg_type_is_unsigned_integer(dst_type)) { + ERROR_IF(!brw_reg_type_is_unsigned_integer(src0_type) || + !brw_reg_type_is_unsigned_integer(src1_type) || + !brw_reg_type_is_unsigned_integer(src2_type), + "If any source datatype is signed, destination datatype " + "must be signed."); + } + } + + /* FINISHME: Additional restrictions mentioned in the Bspec that are not + * yet enforced here: + * + * - General Accumulator registers access is not supported. This is + * currently enforced in brw_dpas_three_src (brw_eu_emit.c). + * + * - Given any combination of datatypes in the sources of a DPAS + * instructions, the boundaries of a register should not be crossed. 
+ */ + } + + return error_msg; +} + +static struct string +send_descriptor_restrictions(const struct brw_isa_info *isa, + const brw_inst *inst) +{ + const struct intel_device_info *devinfo = isa->devinfo; + struct string error_msg = { .str = NULL, .len = 0 }; + + if (inst_is_split_send(isa, inst)) { + /* We can only validate immediate descriptors */ + if (brw_inst_send_sel_reg32_desc(devinfo, inst)) + return error_msg; + } else if (inst_is_send(isa, inst)) { + /* We can only validate immediate descriptors */ + if (brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE) + return error_msg; + } else { + return error_msg; + } + + const uint32_t desc = brw_inst_send_desc(devinfo, inst); + + switch (brw_inst_sfid(devinfo, inst)) { + case BRW_SFID_URB: + if (devinfo->ver < 20) + break; + FALLTHROUGH; + case GFX12_SFID_TGM: + case GFX12_SFID_SLM: + case GFX12_SFID_UGM: + ERROR_IF(!devinfo->has_lsc, "Platform does not support LSC"); + + ERROR_IF(lsc_opcode_has_transpose(lsc_msg_desc_opcode(devinfo, desc)) && + lsc_msg_desc_transpose(devinfo, desc) && + brw_inst_exec_size(devinfo, inst) != BRW_EXECUTE_1, + "Transposed vectors are restricted to Exec_Mask = 1."); + break; + + default: + break; + } + + if (brw_inst_sfid(devinfo, inst) == BRW_SFID_URB && devinfo->ver < 20) { + /* Gfx4 doesn't have a "header present" bit in the SEND message. */ + ERROR_IF(devinfo->ver > 4 && !brw_inst_header_present(devinfo, inst), + "Header must be present for all URB messages."); + + switch (brw_inst_urb_opcode(devinfo, inst)) { + case BRW_URB_OPCODE_WRITE_HWORD: + break; + + /* case FF_SYNC: */ + case BRW_URB_OPCODE_WRITE_OWORD: + /* Gfx5 / Gfx6 FF_SYNC message and Gfx7+ URB_WRITE_OWORD have the + * same opcode value. 
+ */ + if (devinfo->ver == 5 || devinfo->ver == 6) { + ERROR_IF(brw_inst_urb_global_offset(devinfo, inst) != 0, + "FF_SYNC global offset must be zero."); + ERROR_IF(brw_inst_urb_swizzle_control(devinfo, inst) != 0, + "FF_SYNC swizzle control must be zero."); + ERROR_IF(brw_inst_urb_used(devinfo, inst) != 0, + "FF_SYNC used must be zero."); + ERROR_IF(brw_inst_urb_complete(devinfo, inst) != 0, + "FF_SYNC complete must be zero."); + + /* Volume 4 part 2 of the Sandybridge PRM (page 28) says: + * + * A message response (writeback) length of 1 GRF will be + * indicated on the ‘send’ instruction if the thread requires + * response data and/or synchronization. + */ + ERROR_IF((unsigned)brw_inst_rlen(devinfo, inst) > 1, + "FF_SYNC read length must be 0 or 1."); + } else { + ERROR_IF(devinfo->ver < 7, + "URB OWORD write messages only valid on gfx >= 7"); + } + break; + + case BRW_URB_OPCODE_READ_HWORD: + case BRW_URB_OPCODE_READ_OWORD: + ERROR_IF(devinfo->ver < 7, + "URB read messages only valid on gfx >= 7"); + break; + + case GFX7_URB_OPCODE_ATOMIC_MOV: + case GFX7_URB_OPCODE_ATOMIC_INC: + ERROR_IF(devinfo->ver < 7, + "URB atomic move and increment messages only valid on gfx >= 7"); + break; + + case GFX8_URB_OPCODE_ATOMIC_ADD: + /* The Haswell PRM lists this opcode as valid on page 317. 
*/ + ERROR_IF(devinfo->verx10 < 75, + "URB atomic add message only valid on gfx >= 7.5"); + break; + + case GFX8_URB_OPCODE_SIMD8_READ: + ERROR_IF(brw_inst_rlen(devinfo, inst) == 0, + "URB SIMD8 read message must read some data."); + FALLTHROUGH; + + case GFX8_URB_OPCODE_SIMD8_WRITE: + ERROR_IF(devinfo->ver < 8, + "URB SIMD8 messages only valid on gfx >= 8"); + break; + + case GFX125_URB_OPCODE_FENCE: + ERROR_IF(devinfo->verx10 < 125, + "URB fence message only valid on gfx >= 12.5"); + break; + + default: + ERROR_IF(true, "Invalid URB message"); + break; + } + } + + return error_msg; +} + +bool +brw_validate_instruction(const struct brw_isa_info *isa, + const brw_inst *inst, int offset, + unsigned inst_size, + struct disasm_info *disasm) +{ + struct string error_msg = { .str = NULL, .len = 0 }; + + if (is_unsupported_inst(isa, inst)) { + ERROR("Instruction not supported on this Gen"); + } else { + CHECK(invalid_values); + + if (error_msg.str == NULL) { + CHECK(sources_not_null); + CHECK(send_restrictions); + CHECK(alignment_supported); + CHECK(general_restrictions_based_on_operand_types); + CHECK(general_restrictions_on_region_parameters); + CHECK(special_restrictions_for_mixed_float_mode); + CHECK(region_alignment_rules); + CHECK(vector_immediate_restrictions); + CHECK(special_requirements_for_handling_double_precision_data_types); + CHECK(instruction_restrictions); + CHECK(send_descriptor_restrictions); + } + } + + if (error_msg.str && disasm) { + disasm_insert_error(disasm, offset, inst_size, error_msg.str); + } + free(error_msg.str); + + return error_msg.len == 0; +} + +bool +brw_validate_instructions(const struct brw_isa_info *isa, + const void *assembly, int start_offset, int end_offset, + struct disasm_info *disasm) +{ + const struct intel_device_info *devinfo = isa->devinfo; + bool valid = true; + + for (int src_offset = start_offset; src_offset < end_offset;) { + const brw_inst *inst = assembly + src_offset; + bool is_compact = 
brw_inst_cmpt_control(devinfo, inst); + unsigned inst_size = is_compact ? sizeof(brw_compact_inst) + : sizeof(brw_inst); + brw_inst uncompacted; + + if (is_compact) { + brw_compact_inst *compacted = (void *)inst; + brw_uncompact_instruction(isa, &uncompacted, compacted); + inst = &uncompacted; + } + + bool v = brw_validate_instruction(isa, inst, src_offset, + inst_size, disasm); + valid = valid && v; + + src_offset += inst_size; + } + + return valid; +} diff --git a/src/intel/compiler/elk/brw_fs.cpp b/src/intel/compiler/elk/brw_fs.cpp new file mode 100644 index 00000000000..2a9cee96c5e --- /dev/null +++ b/src/intel/compiler/elk/brw_fs.cpp @@ -0,0 +1,8561 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +/** @file brw_fs.cpp + * + * This file drives the GLSL IR -> LIR translation, contains the + * optimizations on the LIR, and drives the generation of native code + * from the LIR. + */ + +#include "brw_eu.h" +#include "brw_fs.h" +#include "brw_fs_builder.h" +#include "brw_fs_live_variables.h" +#include "brw_nir.h" +#include "brw_vec4_gs_visitor.h" +#include "brw_cfg.h" +#include "brw_dead_control_flow.h" +#include "brw_private.h" +#include "intel_nir.h" +#include "shader_enums.h" +#include "dev/intel_debug.h" +#include "dev/intel_wa.h" +#include "compiler/glsl_types.h" +#include "compiler/nir/nir_builder.h" +#include "util/u_math.h" + +#include + +using namespace brw; + +static unsigned get_lowered_simd_width(const fs_visitor *shader, + const fs_inst *inst); + +void +fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg *src, unsigned sources) +{ + memset((void*)this, 0, sizeof(*this)); + + this->src = new fs_reg[MAX2(sources, 3)]; + for (unsigned i = 0; i < sources; i++) + this->src[i] = src[i]; + + this->opcode = opcode; + this->dst = dst; + this->sources = sources; + this->exec_size = exec_size; + this->base_mrf = -1; + + assert(dst.file != IMM && dst.file != UNIFORM); + + assert(this->exec_size != 0); + + this->conditional_mod = BRW_CONDITIONAL_NONE; + + /* This will be the case for almost all instructions. 
*/ + switch (dst.file) { + case VGRF: + case ARF: + case FIXED_GRF: + case MRF: + case ATTR: + this->size_written = dst.component_size(exec_size); + break; + case BAD_FILE: + this->size_written = 0; + break; + case IMM: + case UNIFORM: + unreachable("Invalid destination register file"); + } + + this->writes_accumulator = false; +} + +fs_inst::fs_inst() +{ + init(BRW_OPCODE_NOP, 8, dst, NULL, 0); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size) +{ + init(opcode, exec_size, reg_undef, NULL, 0); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst) +{ + init(opcode, exec_size, dst, NULL, 0); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0) +{ + const fs_reg src[1] = { src0 }; + init(opcode, exec_size, dst, src, 1); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0, const fs_reg &src1) +{ + const fs_reg src[2] = { src0, src1 }; + init(opcode, exec_size, dst, src, 2); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0, const fs_reg &src1, const fs_reg &src2) +{ + const fs_reg src[3] = { src0, src1, src2 }; + init(opcode, exec_size, dst, src, 3); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst, + const fs_reg src[], unsigned sources) +{ + init(opcode, exec_width, dst, src, sources); +} + +fs_inst::fs_inst(const fs_inst &that) +{ + memcpy((void*)this, &that, sizeof(that)); + + this->src = new fs_reg[MAX2(that.sources, 3)]; + + for (unsigned i = 0; i < that.sources; i++) + this->src[i] = that.src[i]; +} + +fs_inst::~fs_inst() +{ + delete[] this->src; +} + +void +fs_inst::resize_sources(uint8_t num_sources) +{ + if (this->sources != num_sources) { + fs_reg *src = new fs_reg[MAX2(num_sources, 3)]; + + for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i) + src[i] = this->src[i]; + + delete[] this->src; + this->src = src; + this->sources = 
num_sources; + } +} + +void +fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, + const fs_reg &dst, + const fs_reg &surface, + const fs_reg &surface_handle, + const fs_reg &varying_offset, + uint32_t const_offset, + uint8_t alignment, + unsigned components) +{ + assert(components <= 4); + + /* We have our constant surface use a pitch of 4 bytes, so our index can + * be any component of a vector, and then we load 4 contiguous + * components starting from that. TODO: Support loading fewer than 4. + */ + fs_reg total_offset = vgrf(glsl_uint_type()); + bld.ADD(total_offset, varying_offset, brw_imm_ud(const_offset)); + + /* The pull load message will load a vec4 (16 bytes). If we are loading + * a double this means we are only loading 2 elements worth of data. + * We also want to use a 32-bit data type for the dst of the load operation + * so other parts of the driver don't get confused about the size of the + * result. + */ + fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4); + + fs_reg srcs[PULL_VARYING_CONSTANT_SRCS]; + srcs[PULL_VARYING_CONSTANT_SRC_SURFACE] = surface; + srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle; + srcs[PULL_VARYING_CONSTANT_SRC_OFFSET] = total_offset; + srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT] = brw_imm_ud(alignment); + + fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL, + vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS); + inst->size_written = 4 * vec4_result.component_size(inst->exec_size); + + shuffle_from_32bit_read(bld, dst, vec4_result, 0, components); +} + +/** + * A helper for MOV generation for fixing up broken hardware SEND dependency + * handling. + */ +void +fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf) +{ + /* The caller always wants uncompressed to emit the minimal extra + * dependencies, and to avoid having to deal with aligning its regs to 2. 
+ */ + const fs_builder ubld = bld.annotate("send dependency resolve") + .quarter(0); + + ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F)); +} + +bool +fs_inst::is_send_from_grf() const +{ + switch (opcode) { + case SHADER_OPCODE_SEND: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + case SHADER_OPCODE_INTERLOCK: + case SHADER_OPCODE_MEMORY_FENCE: + case SHADER_OPCODE_BARRIER: + return true; + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + return src[1].file == VGRF; + case FS_OPCODE_FB_WRITE: + case FS_OPCODE_FB_READ: + return src[0].file == VGRF; + default: + return false; + } +} + +bool +fs_inst::is_control_source(unsigned arg) const +{ + switch (opcode) { + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: + return arg == 0; + + case SHADER_OPCODE_BROADCAST: + case SHADER_OPCODE_SHUFFLE: + case SHADER_OPCODE_QUAD_SWIZZLE: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + return arg == 1; + + case SHADER_OPCODE_MOV_INDIRECT: + case SHADER_OPCODE_CLUSTER_BROADCAST: + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_LZ: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_UMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXL_LZ: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_SAMPLEINFO: + return arg == 1 || arg == 2; + + case SHADER_OPCODE_SEND: + return arg == 0 || arg == 1; + + default: + return false; + } +} + +bool +fs_inst::is_payload(unsigned arg) const +{ + switch (opcode) { + case FS_OPCODE_FB_WRITE: + case FS_OPCODE_FB_READ: + case VEC4_OPCODE_UNTYPED_ATOMIC: + case VEC4_OPCODE_UNTYPED_SURFACE_READ: + 
case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case SHADER_OPCODE_INTERLOCK: + case SHADER_OPCODE_MEMORY_FENCE: + case SHADER_OPCODE_BARRIER: + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_LZ: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_UMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXL_LZ: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_SAMPLEINFO: + return arg == 0; + + case SHADER_OPCODE_SEND: + return arg == 2 || arg == 3; + + default: + return false; + } +} + +/** + * Returns true if this instruction's sources and destinations cannot + * safely be the same register. + * + * In most cases, a register can be written over safely by the same + * instruction that is its last use. For a single instruction, the + * sources are dereferenced before writing of the destination starts + * (naturally). + * + * However, there are a few cases where this can be problematic: + * + * - Virtual opcodes that translate to multiple instructions in the + * code generator: if src == dst and one instruction writes the + * destination before a later instruction reads the source, then + * src will have been clobbered. + * + * - SIMD16 compressed instructions with certain regioning (see below). + * + * The register allocator uses this information to set up conflicts between + * GRF sources and the destination. 
+ */ +bool +fs_inst::has_source_and_destination_hazard() const +{ + switch (opcode) { + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + /* Multiple partial writes to the destination */ + return true; + case SHADER_OPCODE_SHUFFLE: + /* This instruction returns an arbitrary channel from the source and + * gets split into smaller instructions in the generator. It's possible + * that one of the instructions will read from a channel corresponding + * to an earlier instruction. + */ + case SHADER_OPCODE_SEL_EXEC: + /* This is implemented as + * + * mov(16) g4<1>D 0D { align1 WE_all 1H }; + * mov(16) g4<1>D g5<8,8,1>D { align1 1H } + * + * Because the source is only read in the second instruction, the first + * may stomp all over it. + */ + return true; + case SHADER_OPCODE_QUAD_SWIZZLE: + switch (src[1].ud) { + case BRW_SWIZZLE_XXXX: + case BRW_SWIZZLE_YYYY: + case BRW_SWIZZLE_ZZZZ: + case BRW_SWIZZLE_WWWW: + case BRW_SWIZZLE_XXZZ: + case BRW_SWIZZLE_YYWW: + case BRW_SWIZZLE_XYXY: + case BRW_SWIZZLE_ZWZW: + /* These can be implemented as a single Align1 region on all + * platforms, so there's never a hazard between source and + * destination. C.f. fs_generator::generate_quad_swizzle(). + */ + return false; + default: + return !is_uniform(src[0]); + } + case BRW_OPCODE_DPAS: + /* This is overly conservative. The actual hazard is more complicated to + * describe. When the repeat count is N, the single instruction behaves + * like N instructions with a repeat count of one, but the destination + * and source registers are incremented (in somewhat complex ways) for + * each instruction. + * + * This means the source and destination register is actually a range of + * registers. The hazard exists of an earlier iteration would write a + * register that should be read by a later iteration. + * + * There may be some advantage to properly modeling this, but for now, + * be overly conservative. 
+ */ + return rcount > 1; + default: + /* The SIMD16 compressed instruction + * + * add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F + * + * is actually decoded in hardware as: + * + * add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F + * add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F + * + * Which is safe. However, if we have uniform accesses + * happening, we get into trouble: + * + * add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F + * add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F + * + * Now our destination for the first instruction overwrote the + * second instruction's src0, and we get garbage for those 8 + * pixels. There's a similar issue for the pre-gfx6 + * pixel_x/pixel_y, which are registers of 16-bit values and thus + * would get stomped by the first decode as well. + */ + if (exec_size == 16) { + for (int i = 0; i < sources; i++) { + if (src[i].file == VGRF && (src[i].stride == 0 || + src[i].type == BRW_REGISTER_TYPE_UW || + src[i].type == BRW_REGISTER_TYPE_W || + src[i].type == BRW_REGISTER_TYPE_UB || + src[i].type == BRW_REGISTER_TYPE_B)) { + return true; + } + } + } + return false; + } +} + +bool +fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const +{ + if (devinfo->ver == 6 && is_math()) + return false; + + if (is_send_from_grf()) + return false; + + /* From Wa_1604601757: + * + * "When multiplying a DW and any lower precision integer, source modifier + * is not supported." + */ + if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL || + opcode == BRW_OPCODE_MAD)) { + const brw_reg_type exec_type = get_exec_type(this); + const unsigned min_type_sz = opcode == BRW_OPCODE_MAD ? 
+ MIN2(type_sz(src[1].type), type_sz(src[2].type)) : + MIN2(type_sz(src[0].type), type_sz(src[1].type)); + + if (brw_reg_type_is_integer(exec_type) && + type_sz(exec_type) >= 4 && + type_sz(exec_type) != min_type_sz) + return false; + } + + if (!backend_instruction::can_do_source_mods()) + return false; + + return true; +} + +bool +fs_inst::can_do_cmod() +{ + if (!backend_instruction::can_do_cmod()) + return false; + + /* The accumulator result appears to get used for the conditional modifier + * generation. When negating a UD value, there is a 33rd bit generated for + * the sign in the accumulator value, so now you can't check, for example, + * equality with a 32-bit value. See piglit fs-op-neg-uvec4. + */ + for (unsigned i = 0; i < sources; i++) { + if (brw_reg_type_is_unsigned_integer(src[i].type) && src[i].negate) + return false; + } + + return true; +} + +bool +fs_inst::can_change_types() const +{ + return dst.type == src[0].type && + !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR && + (opcode == BRW_OPCODE_MOV || + (opcode == BRW_OPCODE_SEL && + dst.type == src[1].type && + predicate != BRW_PREDICATE_NONE && + !src[1].abs && !src[1].negate && src[1].file != ATTR)); +} + +void +fs_reg::init() +{ + memset((void*)this, 0, sizeof(*this)); + type = BRW_REGISTER_TYPE_UD; + stride = 1; +} + +/** Generic unset register constructor. 
*/ +fs_reg::fs_reg() +{ + init(); + this->file = BAD_FILE; +} + +fs_reg::fs_reg(struct ::brw_reg reg) : + backend_reg(reg) +{ + this->offset = 0; + this->stride = 1; + if (this->file == IMM && + (this->type != BRW_REGISTER_TYPE_V && + this->type != BRW_REGISTER_TYPE_UV && + this->type != BRW_REGISTER_TYPE_VF)) { + this->stride = 0; + } +} + +bool +fs_reg::equals(const fs_reg &r) const +{ + return (this->backend_reg::equals(r) && + stride == r.stride); +} + +bool +fs_reg::negative_equals(const fs_reg &r) const +{ + return (this->backend_reg::negative_equals(r) && + stride == r.stride); +} + +bool +fs_reg::is_contiguous() const +{ + switch (file) { + case ARF: + case FIXED_GRF: + return hstride == BRW_HORIZONTAL_STRIDE_1 && + vstride == width + hstride; + case MRF: + case VGRF: + case ATTR: + return stride == 1; + case UNIFORM: + case IMM: + case BAD_FILE: + return true; + } + + unreachable("Invalid register file"); +} + +unsigned +fs_reg::component_size(unsigned width) const +{ + if (file == ARF || file == FIXED_GRF) { + const unsigned w = MIN2(width, 1u << this->width); + const unsigned h = width >> this->width; + const unsigned vs = vstride ? 1 << (vstride - 1) : 0; + const unsigned hs = hstride ? 1 << (hstride - 1) : 0; + assert(w > 0); + return ((MAX2(1, h) - 1) * vs + (w - 1) * hs + 1) * type_sz(type); + } else { + return MAX2(width * stride, 1) * type_sz(type); + } +} + +void +fs_visitor::vfail(const char *format, va_list va) +{ + char *msg; + + if (failed) + return; + + failed = true; + + msg = ralloc_vasprintf(mem_ctx, format, va); + msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n", + dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg); + + this->fail_msg = msg; + + if (unlikely(debug_enabled)) { + fprintf(stderr, "%s", msg); + } +} + +void +fs_visitor::fail(const char *format, ...) 
+{ + va_list va; + + va_start(va, format); + vfail(format, va); + va_end(va); +} + +/** + * Mark this program as impossible to compile with dispatch width greater + * than n. + * + * During the SIMD8 compile (which happens first), we can detect and flag + * things that are unsupported in SIMD16+ mode, so the compiler can skip the + * SIMD16+ compile altogether. + * + * During a compile of dispatch width greater than n (if one happens anyway), + * this just calls fail(). + */ +void +fs_visitor::limit_dispatch_width(unsigned n, const char *msg) +{ + if (dispatch_width > n) { + fail("%s", msg); + } else { + max_dispatch_width = MIN2(max_dispatch_width, n); + brw_shader_perf_log(compiler, log_data, + "Shader dispatch width limited to SIMD%d: %s\n", + n, msg); + } +} + +/** + * Returns true if the instruction has a flag that means it won't + * update an entire destination register. + * + * For example, dead code elimination and live variable analysis want to know + * when a write to a variable screens off any preceding values that were in + * it. + */ +bool +fs_inst::is_partial_write() const +{ + if (this->predicate && !this->predicate_trivial && + this->opcode != BRW_OPCODE_SEL) + return true; + + if (this->dst.offset % REG_SIZE != 0) + return true; + + /* SEND instructions always write whole registers */ + if (this->opcode == SHADER_OPCODE_SEND) + return false; + + /* Special case UNDEF since a lot of places in the backend do things like this : + * + * fs_builder ubld = bld.exec_all().group(1, 0); + * fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); + * ubld.UNDEF(tmp); <- partial write, even if the whole register is concerned + */ + if (this->opcode == SHADER_OPCODE_UNDEF) { + assert(this->dst.is_contiguous()); + return this->size_written < 32; + } + + return this->exec_size * type_sz(this->dst.type) < 32 || + !this->dst.is_contiguous(); +} + +unsigned +fs_inst::components_read(unsigned i) const +{ + /* Return zero if the source is not present. 
*/ + if (src[i].file == BAD_FILE) + return 0; + + switch (opcode) { + case FS_OPCODE_LINTERP: + if (i == 0) + return 2; + else + return 1; + + case FS_OPCODE_PIXEL_X: + case FS_OPCODE_PIXEL_Y: + assert(i < 2); + if (i == 0) + return 2; + else + return 1; + + case FS_OPCODE_FB_WRITE_LOGICAL: + assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM); + /* First/second FB write color. */ + if (i < 2) + return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud; + else + return 1; + + case SHADER_OPCODE_TEX_LOGICAL: + case SHADER_OPCODE_TXD_LOGICAL: + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXL_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: + case SHADER_OPCODE_TXF_UMS_LOGICAL: + case SHADER_OPCODE_TXF_MCS_LOGICAL: + case SHADER_OPCODE_LOD_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: + assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM && + src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM && + src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM); + /* Texture coordinates. */ + if (i == TEX_LOGICAL_SRC_COORDINATE) + return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud; + /* Texture derivatives. */ + else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) && + opcode == SHADER_OPCODE_TXD_LOGICAL) + return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud; + /* Texture offset. 
*/ + else if (i == TEX_LOGICAL_SRC_TG4_OFFSET) + return 2; + /* MCS */ + else if (i == TEX_LOGICAL_SRC_MCS) { + if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL) + return 2; + else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) + return 4; + else + return 1; + } else + return 1; + + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM); + /* Surface coordinates. */ + if (i == SURFACE_LOGICAL_SRC_ADDRESS) + return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud; + /* Surface operation source (ignored for reads). */ + else if (i == SURFACE_LOGICAL_SRC_DATA) + return 0; + else + return 1; + + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && + src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); + /* Surface coordinates. */ + if (i == SURFACE_LOGICAL_SRC_ADDRESS) + return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud; + /* Surface operation source. 
*/ + else if (i == SURFACE_LOGICAL_SRC_DATA) + return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud; + else + return 1; + + case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: + case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: + case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: + assert(src[A64_LOGICAL_ARG].file == IMM); + return 1; + + case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: + assert(src[A64_LOGICAL_ARG].file == IMM); + if (i == A64_LOGICAL_SRC) { /* data to write */ + const unsigned comps = src[A64_LOGICAL_ARG].ud / exec_size; + assert(comps > 0); + return comps; + } else { + return 1; + } + + case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: + assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); + return 1; + + case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL: + assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); + if (i == SURFACE_LOGICAL_SRC_DATA) { + const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size; + assert(comps > 0); + return comps; + } else { + return 1; + } + + case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: + assert(src[A64_LOGICAL_ARG].file == IMM); + return i == A64_LOGICAL_SRC ? src[A64_LOGICAL_ARG].ud : 1; + + case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: + assert(src[A64_LOGICAL_ARG].file == IMM); + return i == A64_LOGICAL_SRC ? + lsc_op_num_data_values(src[A64_LOGICAL_ARG].ud) : 1; + + case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: + /* Scattered logical opcodes use the following params: + * src[0] Surface coordinates + * src[1] Surface operation source (ignored for reads) + * src[2] Surface + * src[3] IMM with always 1 dimension. + * src[4] IMM with arg bitsize for scattered read/write 8, 16, 32 + */ + assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && + src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); + return i == SURFACE_LOGICAL_SRC_DATA ? 
0 : 1; + + case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: + assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && + src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); + return 1; + + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: { + assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && + src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); + const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud; + /* Surface coordinates. */ + if (i == SURFACE_LOGICAL_SRC_ADDRESS) + return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud; + /* Surface operation source. */ + else if (i == SURFACE_LOGICAL_SRC_DATA) + return lsc_op_num_data_values(op); + else + return 1; + } + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + return (i == 0 ? 2 : 1); + + case SHADER_OPCODE_URB_WRITE_LOGICAL: + assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM); + + if (i == URB_LOGICAL_SRC_DATA) + return src[URB_LOGICAL_SRC_COMPONENTS].ud; + else + return 1; + + case BRW_OPCODE_DPAS: + unreachable("Do not use components_read() for DPAS."); + + default: + return 1; + } +} + +unsigned +fs_inst::size_read(int arg) const +{ + switch (opcode) { + case SHADER_OPCODE_SEND: + if (arg == 2) { + return mlen * REG_SIZE; + } else if (arg == 3) { + return ex_mlen * REG_SIZE; + } + break; + + case FS_OPCODE_FB_WRITE: + case FS_OPCODE_REP_FB_WRITE: + if (arg == 0) { + if (base_mrf >= 0) + return src[0].file == BAD_FILE ? 
0 : 2 * REG_SIZE; + else + return mlen * REG_SIZE; + } + break; + + case FS_OPCODE_FB_READ: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + if (arg == 0) + return mlen * REG_SIZE; + break; + + case FS_OPCODE_SET_SAMPLE_ID: + if (arg == 1) + return 1; + break; + + case FS_OPCODE_LINTERP: + if (arg == 1) + return 16; + break; + + case SHADER_OPCODE_LOAD_PAYLOAD: + if (arg < this->header_size) + return retype(src[arg], BRW_REGISTER_TYPE_UD).component_size(8); + break; + + case CS_OPCODE_CS_TERMINATE: + case SHADER_OPCODE_BARRIER: + return REG_SIZE; + + case SHADER_OPCODE_MOV_INDIRECT: + if (arg == 0) { + assert(src[2].file == IMM); + return src[2].ud; + } + break; + + case BRW_OPCODE_DPAS: + switch (arg) { + case 0: + if (src[0].type == BRW_REGISTER_TYPE_HF) { + return rcount * REG_SIZE / 2; + } else { + return rcount * REG_SIZE; + } + case 1: + return sdepth * REG_SIZE; + case 2: + /* This is simpler than the formula described in the Bspec, but it + * covers all of the cases that we support on DG2. 
+ */ + return rcount * REG_SIZE; + default: + unreachable("Invalid source number."); + } + break; + + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_LZ: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_UMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXL_LZ: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_SAMPLEINFO: + if (arg == 0 && src[0].file == VGRF) + return mlen * REG_SIZE; + break; + + default: + break; + } + + switch (src[arg].file) { + case UNIFORM: + case IMM: + return components_read(arg) * type_sz(src[arg].type); + case BAD_FILE: + case ARF: + case FIXED_GRF: + case VGRF: + case ATTR: + return components_read(arg) * src[arg].component_size(exec_size); + case MRF: + unreachable("MRF registers are not allowed as sources"); + } + return 0; +} + +namespace { + unsigned + predicate_width(const intel_device_info *devinfo, brw_predicate predicate) + { + if (devinfo->ver >= 20) { + return 1; + } else { + switch (predicate) { + case BRW_PREDICATE_NONE: return 1; + case BRW_PREDICATE_NORMAL: return 1; + case BRW_PREDICATE_ALIGN1_ANY2H: return 2; + case BRW_PREDICATE_ALIGN1_ALL2H: return 2; + case BRW_PREDICATE_ALIGN1_ANY4H: return 4; + case BRW_PREDICATE_ALIGN1_ALL4H: return 4; + case BRW_PREDICATE_ALIGN1_ANY8H: return 8; + case BRW_PREDICATE_ALIGN1_ALL8H: return 8; + case BRW_PREDICATE_ALIGN1_ANY16H: return 16; + case BRW_PREDICATE_ALIGN1_ALL16H: return 16; + case BRW_PREDICATE_ALIGN1_ANY32H: return 32; + case BRW_PREDICATE_ALIGN1_ALL32H: return 32; + default: unreachable("Unsupported predicate"); + } + } + } + + /* Return the subset of flag registers that an instruction could + * potentially read or write based on the execution controls and flag + * subregister number of the instruction. 
+ */ + unsigned + flag_mask(const fs_inst *inst, unsigned width) + { + assert(util_is_power_of_two_nonzero(width)); + const unsigned start = (inst->flag_subreg * 16 + inst->group) & + ~(width - 1); + const unsigned end = start + ALIGN(inst->exec_size, width); + return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1); + } + + unsigned + bit_mask(unsigned n) + { + return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1); + } + + unsigned + flag_mask(const fs_reg &r, unsigned sz) + { + if (r.file == ARF) { + const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr; + const unsigned end = start + sz; + return bit_mask(end) & ~bit_mask(start); + } else { + return 0; + } + } +} + +unsigned +fs_inst::flags_read(const intel_device_info *devinfo) const +{ + if (devinfo->ver < 20 && (predicate == BRW_PREDICATE_ALIGN1_ANYV || + predicate == BRW_PREDICATE_ALIGN1_ALLV)) { + /* The vertical predication modes combine corresponding bits from + * f0.0 and f1.0 on Gfx7+, and f0.0 and f0.1 on older hardware. + */ + const unsigned shift = devinfo->ver >= 7 ? 4 : 2; + return flag_mask(this, 1) << shift | flag_mask(this, 1); + } else if (predicate) { + return flag_mask(this, predicate_width(devinfo, predicate)); + } else { + unsigned mask = 0; + for (int i = 0; i < sources; i++) { + mask |= flag_mask(src[i], size_read(i)); + } + return mask; + } +} + +unsigned +fs_inst::flags_written(const intel_device_info *devinfo) const +{ + /* On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented + * using a separate cmpn and sel instruction. This lowering occurs in + * fs_vistor::lower_minmax which is called very, very late. 
+ */ + if ((conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) && + opcode != BRW_OPCODE_CSEL && + opcode != BRW_OPCODE_IF && + opcode != BRW_OPCODE_WHILE)) || + opcode == FS_OPCODE_FB_WRITE) { + return flag_mask(this, 1); + } else if (opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL || + opcode == SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL || + opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) { + return flag_mask(this, 32); + } else { + return flag_mask(dst, size_written); + } +} + +/** + * Returns how many MRFs an FS opcode will write over. + * + * Note that this is not the 0 or 1 implied writes in an actual gen + * instruction -- the FS opcodes often generate MOVs in addition. + */ +unsigned +fs_inst::implied_mrf_writes() const +{ + if (mlen == 0) + return 0; + + if (base_mrf == -1) + return 0; + + switch (opcode) { + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return 1 * exec_size / 8; + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + return 2 * exec_size / 8; + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_SAMPLEINFO: + return 1; + case FS_OPCODE_FB_WRITE: + case FS_OPCODE_REP_FB_WRITE: + return src[0].file == BAD_FILE ? 
0 : 2; + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case SHADER_OPCODE_GFX4_SCRATCH_READ: + return 1; + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: + return mlen; + case SHADER_OPCODE_GFX4_SCRATCH_WRITE: + return mlen; + default: + unreachable("not reached"); + } +} + +bool +fs_inst::has_sampler_residency() const +{ + switch (opcode) { + case SHADER_OPCODE_TEX_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TXL_LOGICAL: + case SHADER_OPCODE_TXD_LOGICAL: + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM); + return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0; + default: + return false; + } +} + +fs_reg +fs_visitor::vgrf(const glsl_type *const type) +{ + int reg_width = dispatch_width / 8; + return fs_reg(VGRF, + alloc.allocate(glsl_count_dword_slots(type, false) * reg_width), + brw_type_for_base_type(type)); +} + +fs_reg::fs_reg(enum brw_reg_file file, unsigned nr) +{ + init(); + this->file = file; + this->nr = nr; + this->type = BRW_REGISTER_TYPE_F; + this->stride = (file == UNIFORM ? 0 : 1); +} + +fs_reg::fs_reg(enum brw_reg_file file, unsigned nr, enum brw_reg_type type) +{ + init(); + this->file = file; + this->nr = nr; + this->type = type; + this->stride = (file == UNIFORM ? 0 : 1); +} + +/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch. + * This brings in those uniform definitions + */ +void +fs_visitor::import_uniforms(fs_visitor *v) +{ + this->push_constant_loc = v->push_constant_loc; + this->uniforms = v->uniforms; +} + +enum brw_barycentric_mode +brw_barycentric_mode(nir_intrinsic_instr *intr) +{ + const glsl_interp_mode mode = + (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr); + + /* Barycentric modes don't make sense for flat inputs. 
*/ + assert(mode != INTERP_MODE_FLAT); + + unsigned bary; + switch (intr->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_at_offset: + bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL; + break; + case nir_intrinsic_load_barycentric_centroid: + bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID; + break; + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE; + break; + default: + unreachable("invalid intrinsic"); + } + + if (mode == INTERP_MODE_NOPERSPECTIVE) + bary += 3; + + return (enum brw_barycentric_mode) bary; +} + +/** + * Turn one of the two CENTROID barycentric modes into PIXEL mode. + */ +static enum brw_barycentric_mode +centroid_to_pixel(enum brw_barycentric_mode bary) +{ + assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID || + bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID); + return (enum brw_barycentric_mode) ((unsigned) bary - 1); +} + +/** + * Walk backwards from the end of the program looking for a URB write that + * isn't in control flow, and mark it with EOT. + * + * Return true if successful or false if a separate EOT write is needed. + */ +bool +fs_visitor::mark_last_urb_write_with_eot() +{ + foreach_in_list_reverse(fs_inst, prev, &this->instructions) { + if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) { + prev->eot = true; + + /* Delete now dead instructions. 
*/ + foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) { + if (dead == prev) + break; + dead->remove(); + } + return true; + } else if (prev->is_control_flow() || prev->has_side_effects()) { + break; + } + } + + return false; +} + +void +fs_visitor::emit_gs_thread_end() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); + + if (gs_compile->control_data_header_size_bits > 0) { + emit_gs_control_data_bits(this->final_gs_vertex_count); + } + + const fs_builder abld = fs_builder(this).at_end().annotate("thread end"); + fs_inst *inst; + + if (gs_prog_data->static_vertex_count != -1) { + /* Try and tag the last URB write with EOT instead of emitting a whole + * separate write just to finish the thread. + */ + if (mark_last_urb_write_with_eot()) + return; + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles; + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(0); + inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, + srcs, ARRAY_SIZE(srcs)); + } else { + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles; + srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count; + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1); + inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, + srcs, ARRAY_SIZE(srcs)); + } + inst->eot = true; + inst->offset = 0; +} + +void +fs_visitor::assign_curb_setup() +{ + unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8); + + unsigned ubo_push_length = 0; + unsigned ubo_push_start[4]; + for (int i = 0; i < 4; i++) { + ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length); + ubo_push_length += stage_prog_data->ubo_ranges[i].length; + } + + prog_data->curb_read_length = uniform_push_length + ubo_push_length; + + uint64_t used = 0; + bool is_compute = gl_shader_stage_is_compute(stage); + + if (is_compute && 
brw_cs_prog_data(prog_data)->uses_inline_data) { + /* With COMPUTE_WALKER, we can push up to one register worth of data via + * the inline data parameter in the COMPUTE_WALKER command itself. + * + * TODO: Support inline data and push at the same time. + */ + assert(devinfo->verx10 >= 125); + assert(uniform_push_length <= reg_unit(devinfo)); + } else if (is_compute && devinfo->verx10 >= 125) { + assert(devinfo->has_lsc); + fs_builder ubld = fs_builder(this, 1).exec_all().at( + cfg->first_block(), cfg->first_block()->start()); + + /* The base offset for our push data is passed in as R0.0[31:6]. We have + * to mask off the bottom 6 bits. + */ + fs_reg base_addr = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.AND(base_addr, + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(31, 6))); + + /* On Gfx12-HP we load constants at the start of the program using A32 + * stateless messages. + */ + for (unsigned i = 0; i < uniform_push_length;) { + /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). 
*/ + unsigned num_regs = MIN2(uniform_push_length - i, 8); + assert(num_regs > 0); + num_regs = 1 << util_logbase2(num_regs); + + fs_reg addr = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.ADD(addr, base_addr, brw_imm_ud(i * REG_SIZE)); + + fs_reg srcs[4] = { + brw_imm_ud(0), /* desc */ + brw_imm_ud(0), /* ex_desc */ + addr, /* payload */ + fs_reg(), /* payload2 */ + }; + + fs_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0), + BRW_REGISTER_TYPE_UD); + fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4); + + send->sfid = GFX12_SFID_UGM; + send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, + 1 /* exec_size */, + LSC_ADDR_SURFTYPE_FLAT, + LSC_ADDR_SIZE_A32, + 1 /* num_coordinates */, + LSC_DATA_SIZE_D32, + num_regs * 8 /* num_channels */, + true /* transpose */, + LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS), + true /* has_dest */); + send->header_size = 0; + send->mlen = lsc_msg_desc_src0_len(devinfo, send->desc); + send->size_written = + lsc_msg_desc_dest_len(devinfo, send->desc) * REG_SIZE; + send->send_is_volatile = true; + + i += num_regs; + } + + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + } + + /* Map the offsets in the UNIFORM file to fixed HW regs. */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + for (unsigned int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == UNIFORM) { + int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4; + int constant_nr; + if (inst->src[i].nr >= UBO_START) { + /* constant_nr is in 32-bit units, the rest are in bytes */ + constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] + + inst->src[i].offset / 4; + } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) { + constant_nr = push_constant_loc[uniform_nr]; + } else { + /* Section 5.11 of the OpenGL 4.1 spec says: + * "Out-of-bounds reads return undefined values, which include + * values from other variables of the active program or zero." + * Just return the first push constant. 
+ */ + constant_nr = 0; + } + + assert(constant_nr / 8 < 64); + used |= BITFIELD64_BIT(constant_nr / 8); + + struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs + + constant_nr / 8, + constant_nr % 8); + brw_reg.abs = inst->src[i].abs; + brw_reg.negate = inst->src[i].negate; + + assert(inst->src[i].stride == 0); + inst->src[i] = byte_offset( + retype(brw_reg, inst->src[i].type), + inst->src[i].offset % 4); + } + } + } + + uint64_t want_zero = used & stage_prog_data->zero_push_reg; + if (want_zero) { + fs_builder ubld = fs_builder(this, 8).exec_all().at( + cfg->first_block(), cfg->first_block()->start()); + + /* push_reg_mask_param is in 32-bit units */ + unsigned mask_param = stage_prog_data->push_reg_mask_param; + struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8, + mask_param % 8); + + fs_reg b32; + for (unsigned i = 0; i < 64; i++) { + if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) { + fs_reg shifted = ubld.vgrf(BRW_REGISTER_TYPE_W, 2); + ubld.SHL(horiz_offset(shifted, 8), + byte_offset(retype(mask, BRW_REGISTER_TYPE_W), i / 8), + brw_imm_v(0x01234567)); + ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8)); + + fs_builder ubld16 = ubld.group(16, 0); + b32 = ubld16.vgrf(BRW_REGISTER_TYPE_D); + ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15)); + } + + if (want_zero & BITFIELD64_BIT(i)) { + assert(i < prog_data->curb_read_length); + struct brw_reg push_reg = + retype(brw_vec8_grf(payload().num_regs + i, 0), + BRW_REGISTER_TYPE_D); + + ubld.AND(push_reg, push_reg, component(b32, i % 16)); + } + } + + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + } + + /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */ + this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length; +} + +/* + * Build up an array of indices into the urb_setup array that + * references the active entries of the urb_setup array. 
 * Used to accelerate walking the active entries of the urb_setup array
 * on each upload.
 */
void
brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
{
   /* TODO(mesh): Review usage of this in the context of Mesh, we may want to
    * skip per-primitive attributes here.
    */

   /* Make sure uint8_t is sufficient */
   STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
   uint8_t index = 0;
   /* Collect, in slot order, every varying slot with a valid urb_setup
    * entry (>= 0 means the FS reads it).
    */
   for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
      if (wm_prog_data->urb_setup[attr] >= 0) {
         wm_prog_data->urb_setup_attribs[index++] = attr;
      }
   }
   wm_prog_data->urb_setup_attribs_count = index;
}

/**
 * Figure out where each incoming setup attribute lands in the URB for the
 * fragment shader, filling prog_data->urb_setup / urb_setup_channel.
 *
 * Three layout strategies are used depending on the pipeline: mesh input
 * (possibly with an MUE map), Gfx6+ SBE-based setup, and the legacy
 * pre-Gfx6 SF path (continued below).
 */
static void
calculate_urb_setup(const struct intel_device_info *devinfo,
                    const struct brw_wm_prog_key *key,
                    struct brw_wm_prog_data *prog_data,
                    const nir_shader *nir,
                    const struct brw_mue_map *mue_map)
{
   /* -1 marks a slot as unused; channels default to 0. */
   memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
   memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));

   int urb_next = 0; /* in vec4s */

   const uint64_t inputs_read =
      nir->info.inputs_read & ~nir->info.per_primitive_inputs;

   /* Figure out where each of the incoming setup attributes lands. */
   if (key->mesh_input != BRW_NEVER) {
      /* Per-Primitive Attributes are laid out by Hardware before the regular
       * attributes, so order them like this to make easy later to map setup
       * into real HW registers.
       */
      if (nir->info.per_primitive_inputs) {
         uint64_t per_prim_inputs_read =
            nir->info.inputs_read & nir->info.per_primitive_inputs;

         /* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
          * are always at the beginning, because they come from MUE
          * Primitive Header, not Per-Primitive Attributes.
+ */ + const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT | + VARYING_BIT_LAYER | + VARYING_BIT_PRIMITIVE_SHADING_RATE; + + if (mue_map) { + unsigned per_prim_start_dw = mue_map->per_primitive_start_dw; + unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw; + + bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0; + + if (reads_header || mue_map->user_data_in_primitive_header) { + /* Primitive Shading Rate, Layer and Viewport live in the same + * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport + * is dword 2). + */ + if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE) + prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0; + + if (per_prim_inputs_read & VARYING_BIT_LAYER) + prog_data->urb_setup[VARYING_SLOT_LAYER] = 0; + + if (per_prim_inputs_read & VARYING_BIT_VIEWPORT) + prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0; + + per_prim_inputs_read &= ~primitive_header_bits; + } else { + /* If fs doesn't need primitive header, then it won't be made + * available through SBE_MESH, so we have to skip them when + * calculating offset from start of per-prim data. + */ + per_prim_start_dw += mue_map->per_primitive_header_size_dw; + per_prim_size_dw -= mue_map->per_primitive_header_size_dw; + } + + u_foreach_bit64(i, per_prim_inputs_read) { + int start = mue_map->start_dw[i]; + + assert(start >= 0); + assert(mue_map->len_dw[i] > 0); + + assert(unsigned(start) >= per_prim_start_dw); + unsigned pos_dw = unsigned(start) - per_prim_start_dw; + + prog_data->urb_setup[i] = urb_next + pos_dw / 4; + prog_data->urb_setup_channel[i] = pos_dw % 4; + } + + urb_next = per_prim_size_dw / 4; + } else { + /* With no MUE map, we never read the primitive header, and + * per-primitive attributes won't be packed either, so just lay + * them in varying order. 
+ */ + per_prim_inputs_read &= ~primitive_header_bits; + + for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { + if (per_prim_inputs_read & BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; + } + } + + /* The actual setup attributes later must be aligned to a full GRF. */ + urb_next = ALIGN(urb_next, 2); + } + + prog_data->num_per_primitive_inputs = urb_next; + } + + const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 | + VARYING_BIT_CLIP_DIST1; + + uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK; + + if (inputs_read & clip_dist_bits) { + assert(!mue_map || mue_map->per_vertex_header_size_dw > 8); + unique_fs_attrs &= ~clip_dist_bits; + } + + if (mue_map) { + unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw; + unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw; + + /* Per-Vertex header is available to fragment shader only if there's + * user data there. + */ + if (!mue_map->user_data_in_vertex_header) { + per_vertex_start_dw += 8; + per_vertex_size_dw -= 8; + } + + /* In Mesh, CLIP_DIST slots are always at the beginning, because + * they come from MUE Vertex Header, not Per-Vertex Attributes. + */ + if (inputs_read & clip_dist_bits) { + prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next; + prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1; + } else if (mue_map && mue_map->per_vertex_header_size_dw > 8) { + /* Clip distances are in MUE, but we are not reading them in FS. */ + per_vertex_start_dw += 8; + per_vertex_size_dw -= 8; + } + + /* Per-Vertex attributes are laid out ordered. Because we always link + * Mesh and Fragment shaders, the which slots are written and read by + * each of them will match. 
*/ + u_foreach_bit64(i, unique_fs_attrs) { + int start = mue_map->start_dw[i]; + + assert(start >= 0); + assert(mue_map->len_dw[i] > 0); + + assert(unsigned(start) >= per_vertex_start_dw); + unsigned pos_dw = unsigned(start) - per_vertex_start_dw; + + prog_data->urb_setup[i] = urb_next + pos_dw / 4; + prog_data->urb_setup_channel[i] = pos_dw % 4; + } + + urb_next += per_vertex_size_dw / 4; + } else { + /* If we don't have an MUE map, just lay down the inputs the FS reads + * in varying order, as we do for the legacy pipeline. + */ + if (inputs_read & clip_dist_bits) { + prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++; + prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++; + } + + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + if (unique_fs_attrs & BITFIELD64_BIT(i)) + prog_data->urb_setup[i] = urb_next++; + } + } + } else if (devinfo->ver >= 6) { + assert(!nir->info.per_primitive_inputs); + + uint64_t vue_header_bits = + VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT; + + uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK; + + /* VUE header fields all live in the same URB slot, so we pass them + * as a single FS input attribute. We want to only count them once. + */ + if (inputs_read & vue_header_bits) { + unique_fs_attrs &= ~vue_header_bits; + unique_fs_attrs |= VARYING_BIT_PSIZ; + } + + if (util_bitcount64(unique_fs_attrs) <= 16) { + /* The SF/SBE pipeline stage can do arbitrary rearrangement of the + * first 16 varying inputs, so we can put them wherever we want. + * Just put them in order. + * + * This is useful because it means that (a) inputs not used by the + * fragment shader won't take up valuable register space, and (b) we + * won't have to recompile the fragment shader if it gets paired with + * a different vertex (or geometry) shader. + * + * VUE header fields share the same FS input attribute. 
+ */ + if (inputs_read & vue_header_bits) { + if (inputs_read & VARYING_BIT_PSIZ) + prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next; + if (inputs_read & VARYING_BIT_LAYER) + prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next; + if (inputs_read & VARYING_BIT_VIEWPORT) + prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next; + + urb_next++; + } + + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits & + BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; + } + } + } else { + /* We have enough input varyings that the SF/SBE pipeline stage can't + * arbitrarily rearrange them to suit our whim; we have to put them + * in an order that matches the output of the previous pipeline stage + * (geometry or vertex shader). + */ + + /* Re-compute the VUE map here in the case that the one coming from + * geometry has more than one position slot (used for Primitive + * Replication). + */ + struct intel_vue_map prev_stage_vue_map; + brw_compute_vue_map(devinfo, &prev_stage_vue_map, + key->input_slots_valid, + nir->info.separate_shader, 1); + + int first_slot = + brw_compute_first_urb_slot_required(inputs_read, + &prev_stage_vue_map); + + assert(prev_stage_vue_map.num_slots <= first_slot + 32); + for (int slot = first_slot; slot < prev_stage_vue_map.num_slots; + slot++) { + int varying = prev_stage_vue_map.slot_to_varying[slot]; + if (varying != BRW_VARYING_SLOT_PAD && + (inputs_read & BRW_FS_VARYING_INPUT_MASK & + BITFIELD64_BIT(varying))) { + prog_data->urb_setup[varying] = slot - first_slot; + } + } + urb_next = prev_stage_vue_map.num_slots - first_slot; + } + } else { + /* FINISHME: The sf doesn't map VS->FS inputs for us very well. 
*/ + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + /* Point size is packed into the header, not as a general attribute */ + if (i == VARYING_SLOT_PSIZ) + continue; + + if (key->input_slots_valid & BITFIELD64_BIT(i)) { + /* The back color slot is skipped when the front color is + * also written to. In addition, some slots can be + * written in the vertex shader and not read in the + * fragment shader. So the register number must always be + * incremented, mapped or not. + */ + if (_mesa_varying_slot_in_fs((gl_varying_slot) i)) + prog_data->urb_setup[i] = urb_next; + urb_next++; + } + } + + /* + * It's a FS only attribute, and we did interpolation for this attribute + * in SF thread. So, count it here, too. + * + * See compile_sf_prog() for more info. + */ + if (inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC)) + prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++; + } + + prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs; + prog_data->inputs = inputs_read; + + brw_compute_urb_setup_index(prog_data); +} + +void +fs_visitor::assign_urb_setup() +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + + int urb_start = payload().num_regs + prog_data->base.curb_read_length; + + /* Offset all the urb_setup[] index by the actual position of the + * setup regs, now that the location of the constants has been chosen. + */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == ATTR) { + /* ATTR fs_reg::nr in the FS is in units of logical scalar + * inputs each of which consumes 16B on Gfx4-Gfx12. 
In + * single polygon mode this leads to the following layout + * of the vertex setup plane parameters in the ATTR + * register file: + * + * fs_reg::nr Input Comp0 Comp1 Comp2 Comp3 + * 0 Attr0.x a1-a0 a2-a0 N/A a0 + * 1 Attr0.y a1-a0 a2-a0 N/A a0 + * 2 Attr0.z a1-a0 a2-a0 N/A a0 + * 3 Attr0.w a1-a0 a2-a0 N/A a0 + * 4 Attr1.x a1-a0 a2-a0 N/A a0 + * ... + * + * In multipolygon mode that no longer works since + * different channels may be processing polygons with + * different plane parameters, so each parameter above is + * represented as a dispatch_width-wide vector: + * + * fs_reg::nr fs_reg::offset Input Comp0 ... CompN + * 0 0 Attr0.x a1[0]-a0[0] ... a1[N]-a0[N] + * 0 4 * dispatch_width Attr0.x a2[0]-a0[0] ... a2[N]-a0[N] + * 0 8 * dispatch_width Attr0.x N/A ... N/A + * 0 12 * dispatch_width Attr0.x a0[0] ... a0[N] + * 1 0 Attr0.y a1[0]-a0[0] ... a1[N]-a0[N] + * ... + * + * Note that many of the components on a single row above + * are likely to be replicated multiple times (if, say, a + * single SIMD thread is only processing 2 different + * polygons), so plane parameters aren't actually stored + * in GRF memory with that layout to avoid wasting space. + * Instead we compose ATTR register regions with a 2D + * region that walks through the parameters of each + * polygon with the correct stride, reading the parameter + * corresponding to each channel directly from the PS + * thread payload. + * + * The latter layout corresponds to a param_width equal to + * dispatch_width, while the former (scalar parameter) + * layout has a param_width of 1. + * + * Gfx20+ represent plane parameters in a format similar + * to the above, except the parameters are packed in 12B + * and ordered like "a0, a1-a0, a2-a0" instead of the + * above vec4 representation with a missing component. + */ + const unsigned param_width = (max_polygons > 1 ? dispatch_width : 1); + + /* Size of a single scalar component of a plane parameter + * in bytes. 
+ */ + const unsigned chan_sz = 4; + struct brw_reg reg; + assert(max_polygons > 0); + + /* Calculate the base register on the thread payload of + * either the block of vertex setup data or the block of + * per-primitive constant data depending on whether we're + * accessing a primitive or vertex input. Also calculate + * the index of the input within that block. + */ + const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs; + const unsigned base = urb_start + + (per_prim ? 0 : + ALIGN(prog_data->num_per_primitive_inputs / 2, + reg_unit(devinfo)) * max_polygons); + const unsigned idx = per_prim ? inst->src[i].nr : + inst->src[i].nr - prog_data->num_per_primitive_inputs; + + /* Translate the offset within the param_width-wide + * representation described above into an offset and a + * grf, which contains the plane parameters for the first + * polygon processed by the thread. + */ + if (devinfo->ver >= 20 && !per_prim) { + /* Gfx20+ is able to pack 5 logical input components + * per 64B register for vertex setup data. + */ + const unsigned grf = base + idx / 5 * 2 * max_polygons; + assert(inst->src[i].offset / param_width < 12); + const unsigned delta = idx % 5 * 12 + + inst->src[i].offset / (param_width * chan_sz) * chan_sz + + inst->src[i].offset % chan_sz; + reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), + delta); + } else { + /* Earlier platforms and per-primitive block pack 2 logical + * input components per 32B register. 
+ */ + const unsigned grf = base + idx / 2 * max_polygons; + assert(inst->src[i].offset / param_width < REG_SIZE / 2); + const unsigned delta = (idx % 2) * (REG_SIZE / 2) + + inst->src[i].offset / (param_width * chan_sz) * chan_sz + + inst->src[i].offset % chan_sz; + reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), + delta); + } + + if (max_polygons > 1) { + assert(devinfo->ver >= 12); + /* Misaligned channel strides that would lead to + * cross-channel access in the representation above are + * disallowed. + */ + assert(inst->src[i].stride * type_sz(inst->src[i].type) == chan_sz); + + /* Number of channels processing the same polygon. */ + const unsigned poly_width = dispatch_width / max_polygons; + assert(dispatch_width % max_polygons == 0); + + /* Accessing a subset of channels of a parameter vector + * starting from "chan" is necessary to handle + * SIMD-lowered instructions though. + */ + const unsigned chan = inst->src[i].offset % + (param_width * chan_sz) / chan_sz; + assert(chan < dispatch_width); + assert(chan % poly_width == 0); + const unsigned reg_size = reg_unit(devinfo) * REG_SIZE; + reg = byte_offset(reg, chan / poly_width * reg_size); + + if (inst->exec_size > poly_width) { + /* Accessing the parameters for multiple polygons. + * Corresponding parameters for different polygons + * are stored a GRF apart on the thread payload, so + * use that as vertical stride. + */ + const unsigned vstride = reg_size / type_sz(inst->src[i].type); + assert(vstride <= 32); + assert(chan % poly_width == 0); + reg = stride(reg, vstride, poly_width, 0); + } else { + /* Accessing one parameter for a single polygon -- + * Translate to a scalar region. + */ + assert(chan % poly_width + inst->exec_size <= poly_width); + reg = stride(reg, 0, 1, 0); + } + + } else { + const unsigned width = inst->src[i].stride == 0 ? 
+ 1 : MIN2(inst->exec_size, 8); + reg = stride(reg, width * inst->src[i].stride, + width, inst->src[i].stride); + } + + reg.abs = inst->src[i].abs; + reg.negate = inst->src[i].negate; + inst->src[i] = reg; + } + } + } + + /* Each attribute is 4 setup channels, each of which is half a reg, + * but they may be replicated multiple times for multipolygon + * dispatch. + */ + this->first_non_payload_grf += prog_data->num_varying_inputs * 2 * max_polygons; + + /* Unlike regular attributes, per-primitive attributes have all 4 channels + * in the same slot, so each GRF can store two slots. + */ + assert(prog_data->num_per_primitive_inputs % 2 == 0); + this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2 * max_polygons; +} + +void +fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst) +{ + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == ATTR) { + assert(inst->src[i].nr == 0); + int grf = payload().num_regs + + prog_data->curb_read_length + + inst->src[i].offset / REG_SIZE; + + /* As explained at brw_reg_from_fs_reg, From the Haswell PRM: + * + * VertStride must be used to cross GRF register boundaries. This + * rule implies that elements within a 'Width' cannot cross GRF + * boundaries. + * + * So, for registers that are large enough, we have to split the exec + * size in two and trust the compression state to sort it out. + */ + unsigned total_size = inst->exec_size * + inst->src[i].stride * + type_sz(inst->src[i].type); + + assert(total_size <= 2 * REG_SIZE); + const unsigned exec_size = + (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2; + + unsigned width = inst->src[i].stride == 0 ? 
         1 : exec_size;
      struct brw_reg reg =
         stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                            inst->src[i].offset % REG_SIZE),
                exec_size * inst->src[i].stride,
                width, inst->src[i].stride);
      reg.abs = inst->src[i].abs;
      reg.negate = inst->src[i].negate;

      inst->src[i] = reg;
    }
  }
}

/**
 * Map VS ATTR file registers to the fixed payload GRFs and reserve the
 * attribute space after the payload (4 GRFs per attribute slot).
 */
void
fs_visitor::assign_vs_urb_setup()
{
   struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);

   assert(stage == MESA_SHADER_VERTEX);

   /* Each attribute is 4 regs. */
   this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;

   assert(vs_prog_data->base.urb_read_length <= 15);

   /* Rewrite all ATTR file references to the hw grf that they land in. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      convert_attr_sources_to_hw_regs(inst);
   }
}

/**
 * Map TCS ATTR file registers to hardware GRFs; no extra attribute space is
 * reserved here beyond the payload.
 */
void
fs_visitor::assign_tcs_urb_setup()
{
   assert(stage == MESA_SHADER_TESS_CTRL);

   /* Rewrite all ATTR file references to HW_REGs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      convert_attr_sources_to_hw_regs(inst);
   }
}

/**
 * Map TES ATTR file registers to hardware GRFs and account for the pushed
 * URB data (8 GRFs per urb_read_length unit).
 */
void
fs_visitor::assign_tes_urb_setup()
{
   assert(stage == MESA_SHADER_TESS_EVAL);

   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);

   first_non_payload_grf += 8 * vue_prog_data->urb_read_length;

   /* Rewrite all ATTR file references to HW_REGs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      convert_attr_sources_to_hw_regs(inst);
   }
}

/**
 * Map GS ATTR file registers to hardware GRFs; pushed vertex data is
 * replicated per incoming vertex, hence the vertices_in factor.
 */
void
fs_visitor::assign_gs_urb_setup()
{
   assert(stage == MESA_SHADER_GEOMETRY);

   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);

   first_non_payload_grf +=
      8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      /* Rewrite all ATTR file references to GRFs. */
      convert_attr_sources_to_hw_regs(inst);
   }
}


/**
 * Split large virtual GRFs into separate components if we can.
+ * + * This pass aggressively splits VGRFs into as small a chunks as possible, + * down to single registers if it can. If no VGRFs can be split, we return + * false so this pass can safely be used inside an optimization loop. We + * want to split, because virtual GRFs are what we register allocate and + * spill (due to contiguousness requirements for some instructions), and + * they're what we naturally generate in the codegen process, but most + * virtual GRFs don't actually need to be contiguous sets of GRFs. If we + * split, we'll end up with reduced live intervals and better dead code + * elimination and coalescing. + */ +bool +fs_visitor::split_virtual_grfs() +{ + /* Compact the register file so we eliminate dead vgrfs. This + * only defines split points for live registers, so if we have + * too large dead registers they will hit assertions later. + */ + compact_virtual_grfs(); + + unsigned num_vars = this->alloc.count; + + /* Count the total number of registers */ + unsigned reg_count = 0; + unsigned vgrf_to_reg[num_vars]; + for (unsigned i = 0; i < num_vars; i++) { + vgrf_to_reg[i] = reg_count; + reg_count += alloc.sizes[i]; + } + + /* An array of "split points". For each register slot, this indicates + * if this slot can be separated from the previous slot. Every time an + * instruction uses multiple elements of a register (as a source or + * destination), we mark the used slots as inseparable. Then we go + * through and split the registers into the smallest pieces we can. 
+ */ + bool *split_points = new bool[reg_count]; + memset(split_points, 0, reg_count * sizeof(*split_points)); + + /* Mark all used registers as fully splittable */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->dst.file == VGRF) { + unsigned reg = vgrf_to_reg[inst->dst.nr]; + for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++) + split_points[reg + j] = true; + } + + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + unsigned reg = vgrf_to_reg[inst->src[i].nr]; + for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++) + split_points[reg + j] = true; + } + } + } + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + /* We fix up undef instructions later */ + if (inst->opcode == SHADER_OPCODE_UNDEF) { + assert(inst->dst.file == VGRF); + continue; + } + + if (inst->dst.file == VGRF) { + unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE; + for (unsigned j = 1; j < regs_written(inst); j++) + split_points[reg + j] = false; + } + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE; + for (unsigned j = 1; j < regs_read(inst, i); j++) + split_points[reg + j] = false; + } + } + } + + /* Bitset of which registers have been split */ + bool *vgrf_has_split = new bool[num_vars]; + memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split)); + + unsigned *new_virtual_grf = new unsigned[reg_count]; + unsigned *new_reg_offset = new unsigned[reg_count]; + + unsigned reg = 0; + bool has_splits = false; + for (unsigned i = 0; i < num_vars; i++) { + /* The first one should always be 0 as a quick sanity check. 
*/ + assert(split_points[reg] == false); + + /* j = 0 case */ + new_reg_offset[reg] = 0; + reg++; + unsigned offset = 1; + + /* j > 0 case */ + for (unsigned j = 1; j < alloc.sizes[i]; j++) { + /* If this is a split point, reset the offset to 0 and allocate a + * new virtual GRF for the previous offset many registers + */ + if (split_points[reg]) { + has_splits = true; + vgrf_has_split[i] = true; + assert(offset <= MAX_VGRF_SIZE(devinfo)); + unsigned grf = alloc.allocate(offset); + for (unsigned k = reg - offset; k < reg; k++) + new_virtual_grf[k] = grf; + offset = 0; + } + new_reg_offset[reg] = offset; + offset++; + reg++; + } + + /* The last one gets the original register number */ + assert(offset <= MAX_VGRF_SIZE(devinfo)); + alloc.sizes[i] = offset; + for (unsigned k = reg - offset; k < reg; k++) + new_virtual_grf[k] = i; + } + assert(reg == reg_count); + + bool progress; + if (!has_splits) { + progress = false; + goto cleanup; + } + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + if (inst->opcode == SHADER_OPCODE_UNDEF) { + assert(inst->dst.file == VGRF); + if (vgrf_has_split[inst->dst.nr]) { + const fs_builder ibld(this, block, inst); + assert(inst->size_written % REG_SIZE == 0); + unsigned reg_offset = inst->dst.offset / REG_SIZE; + unsigned size_written = 0; + while (size_written < inst->size_written) { + reg = vgrf_to_reg[inst->dst.nr] + reg_offset + size_written / REG_SIZE; + fs_inst *undef = + ibld.UNDEF( + byte_offset(fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type), + new_reg_offset[reg] * REG_SIZE)); + undef->size_written = + MIN2(inst->size_written - size_written, undef->size_written); + assert(undef->size_written % REG_SIZE == 0); + size_written += undef->size_written; + } + inst->remove(block); + } else { + reg = vgrf_to_reg[inst->dst.nr]; + assert(new_reg_offset[reg] == 0); + assert(new_virtual_grf[reg] == inst->dst.nr); + } + continue; + } + + if (inst->dst.file == VGRF) { + reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / 
REG_SIZE; + if (vgrf_has_split[inst->dst.nr]) { + inst->dst.nr = new_virtual_grf[reg]; + inst->dst.offset = new_reg_offset[reg] * REG_SIZE + + inst->dst.offset % REG_SIZE; + assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); + } else { + assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE); + assert(new_virtual_grf[reg] == inst->dst.nr); + } + } + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file != VGRF) + continue; + + reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE; + if (vgrf_has_split[inst->src[i].nr]) { + inst->src[i].nr = new_virtual_grf[reg]; + inst->src[i].offset = new_reg_offset[reg] * REG_SIZE + + inst->src[i].offset % REG_SIZE; + assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); + } else { + assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE); + assert(new_virtual_grf[reg] == inst->src[i].nr); + } + } + } + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES); + + progress = true; + +cleanup: + delete[] split_points; + delete[] vgrf_has_split; + delete[] new_virtual_grf; + delete[] new_reg_offset; + + return progress; +} + +/** + * Remove unused virtual GRFs and compact the vgrf_* arrays. + * + * During code generation, we create tons of temporary variables, many of + * which get immediately killed and are never used again. Yet, in later + * optimization and analysis passes, such as compute_live_intervals, we need + * to loop over all the virtual GRFs. Compacting them can save a lot of + * overhead. + */ +bool +fs_visitor::compact_virtual_grfs() +{ + bool progress = false; + int *remap_table = new int[this->alloc.count]; + memset(remap_table, -1, this->alloc.count * sizeof(int)); + + /* Mark which virtual GRFs are used. 
    */
   foreach_block_and_inst(block, const fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF)
         remap_table[inst->dst.nr] = 0;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            remap_table[inst->src[i].nr] = 0;
      }
   }

   /* Compact the GRF arrays: assign each live VGRF the next free index and
    * record the mapping in remap_table (-1 stays "dead").
    */
   int new_index = 0;
   for (unsigned i = 0; i < this->alloc.count; i++) {
      if (remap_table[i] == -1) {
         /* We just found an unused register.  This means that we are
          * actually going to compact something.
          */
         progress = true;
      } else {
         remap_table[i] = new_index;
         alloc.sizes[new_index] = alloc.sizes[i];
         invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
         ++new_index;
      }
   }

   this->alloc.count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF)
         inst->dst.nr = remap_table[inst->dst.nr];

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            inst->src[i].nr = remap_table[inst->src[i].nr];
      }
   }

   /* Patch all the references to delta_xy, since they're used in register
    * allocation.  If they're unused, switch them to BAD_FILE so we don't
    * think some random VGRF is delta_xy.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
      if (delta_xy[i].file == VGRF) {
         if (remap_table[delta_xy[i].nr] != -1) {
            delta_xy[i].nr = remap_table[delta_xy[i].nr];
         } else {
            delta_xy[i].file = BAD_FILE;
         }
      }
   }

   delete[] remap_table;

   return progress;
}

/**
 * Return the index of the subgroup-ID push constant, or -1 if it is not
 * pushed (no params, or verx10 >= 125 where it is not used as a param).
 * By convention it is always the last parameter in the list.
 */
int
brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
                                const brw_stage_prog_data *prog_data)
{
   if (prog_data->nr_params == 0)
      return -1;

   if (devinfo->verx10 >= 125)
      return -1;

   /* The local thread id is always the last parameter in the list */
   uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
   if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
      return prog_data->nr_params - 1;

   return -1;
}

/**
 * Assign UNIFORM file registers to either push constants or pull constants.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::assign_constant_locations()
{
   /* Only the first compile gets to decide on locations. */
   if (push_constant_loc)
      return;

   /* Identity mapping: every uniform keeps its own push slot. */
   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
   for (unsigned u = 0; u < uniforms; u++)
      push_constant_loc[u] = u;

   /* Now that we know how many regular uniforms we'll push, reduce the
    * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
    */
   /* For gen4/5:
    * Only allow 16 registers (128 uniform components) as push constants.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c/crocus_state.c
    */
   const unsigned max_push_length = compiler->devinfo->ver < 6 ?
16 : 64; + unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8); + for (int i = 0; i < 4; i++) { + struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; + + if (push_length + range->length > max_push_length) + range->length = max_push_length - push_length; + + push_length += range->length; + } + assert(push_length <= max_push_length); +} + +bool +fs_visitor::get_pull_locs(const fs_reg &src, + unsigned *out_surf_index, + unsigned *out_pull_index) +{ + assert(src.file == UNIFORM); + + if (src.nr < UBO_START) + return false; + + const struct brw_ubo_range *range = + &prog_data->ubo_ranges[src.nr - UBO_START]; + + /* If this access is in our (reduced) range, use the push data. */ + if (src.offset / 32 < range->length) + return false; + + *out_surf_index = range->block; + *out_pull_index = (32 * range->start + src.offset) / 4; + + prog_data->has_ubo_pull = true; + + return true; +} + +/** + * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD + * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs. + */ +bool +fs_visitor::lower_constant_loads() +{ + unsigned index, pull_index; + bool progress = false; + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + /* Set up the annotation tracking for new generated instructions. */ + const fs_builder ibld(this, block, inst); + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file != UNIFORM) + continue; + + /* We'll handle this case later */ + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) + continue; + + if (!get_pull_locs(inst->src[i], &index, &pull_index)) + continue; + + assert(inst->src[i].stride == 0); + + const unsigned block_sz = 64; /* Fetch one cacheline at a time. 
*/ + const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0); + const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); + const unsigned base = pull_index * 4; + + fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index); + srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); + srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); + + + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst, + srcs, PULL_UNIFORM_CONSTANT_SRCS); + + /* Rewrite the instruction to use the temporary VGRF. */ + inst->src[i].file = VGRF; + inst->src[i].nr = dst.nr; + inst->src[i].offset = (base & (block_sz - 1)) + + inst->src[i].offset % 4; + + progress = true; + } + + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && + inst->src[0].file == UNIFORM) { + + if (!get_pull_locs(inst->src[0], &index, &pull_index)) + continue; + + VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst, + brw_imm_ud(index), + fs_reg() /* surface_handle */, + inst->src[1], + pull_index * 4, 4, 1); + inst->remove(block); + + progress = true; + } + } + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +static uint64_t +src_as_uint(const fs_reg &src) +{ + assert(src.file == IMM); + + switch (src.type) { + case BRW_REGISTER_TYPE_W: + return (uint64_t)(int16_t)(src.ud & 0xffff); + + case BRW_REGISTER_TYPE_UW: + return (uint64_t)(uint16_t)(src.ud & 0xffff); + + case BRW_REGISTER_TYPE_D: + return (uint64_t)src.d; + + case BRW_REGISTER_TYPE_UD: + return (uint64_t)src.ud; + + case BRW_REGISTER_TYPE_Q: + return src.d64; + + case BRW_REGISTER_TYPE_UQ: + return src.u64; + + default: + unreachable("Invalid integer type."); + } +} + +static fs_reg +brw_imm_for_type(uint64_t value, enum brw_reg_type type) +{ + switch (type) { + case BRW_REGISTER_TYPE_W: + return brw_imm_w(value); + + case BRW_REGISTER_TYPE_UW: + return brw_imm_uw(value); + + case BRW_REGISTER_TYPE_D: + return brw_imm_d(value); + + case BRW_REGISTER_TYPE_UD: + return 
brw_imm_ud(value); + + case BRW_REGISTER_TYPE_Q: + return brw_imm_d(value); + + case BRW_REGISTER_TYPE_UQ: + return brw_imm_uq(value); + + default: + unreachable("Invalid integer type."); + } +} + +bool +fs_visitor::opt_algebraic() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + switch (inst->opcode) { + case BRW_OPCODE_MOV: + if (!devinfo->has_64bit_float && + inst->dst.type == BRW_REGISTER_TYPE_DF) { + assert(inst->dst.type == inst->src[0].type); + assert(!inst->saturate); + assert(!inst->src[0].abs); + assert(!inst->src[0].negate); + const brw::fs_builder ibld(this, block, inst); + + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); + + ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_F, 1), + subscript(inst->src[0], BRW_REGISTER_TYPE_F, 1)); + ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_F, 0), + subscript(inst->src[0], BRW_REGISTER_TYPE_F, 0)); + + inst->remove(block); + progress = true; + } + + if (!devinfo->has_64bit_int && + (inst->dst.type == BRW_REGISTER_TYPE_UQ || + inst->dst.type == BRW_REGISTER_TYPE_Q)) { + assert(inst->dst.type == inst->src[0].type); + assert(!inst->saturate); + assert(!inst->src[0].abs); + assert(!inst->src[0].negate); + const brw::fs_builder ibld(this, block, inst); + + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); + + ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1), + subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1)); + ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0), + subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0)); + + inst->remove(block); + progress = true; + } + + if ((inst->conditional_mod == BRW_CONDITIONAL_Z || + inst->conditional_mod == BRW_CONDITIONAL_NZ) && + inst->dst.is_null() && + (inst->src[0].abs || inst->src[0].negate)) { + inst->src[0].abs = false; + inst->src[0].negate = false; + progress = true; + break; + } + + if (inst->src[0].file != IMM) + break; + + if (inst->saturate) { + /* Full mixed-type saturates don't happen. 
However, we can end up + * with things like: + * + * mov.sat(8) g21<1>DF -1F + * + * Other mixed-size-but-same-base-type cases may also be possible. + */ + if (inst->dst.type != inst->src[0].type && + inst->dst.type != BRW_REGISTER_TYPE_DF && + inst->src[0].type != BRW_REGISTER_TYPE_F) + assert(!"unimplemented: saturate mixed types"); + + if (brw_saturate_immediate(inst->src[0].type, + &inst->src[0].as_brw_reg())) { + inst->saturate = false; + progress = true; + } + } + break; + + case BRW_OPCODE_MUL: + if (inst->src[1].file != IMM) + continue; + + if (brw_reg_type_is_floating_point(inst->src[1].type)) + break; + + /* From the BDW PRM, Vol 2a, "mul - Multiply": + * + * "When multiplying integer datatypes, if src0 is DW and src1 + * is W, irrespective of the destination datatype, the + * accumulator maintains full 48-bit precision." + * ... + * "When multiplying integer data types, if one of the sources + * is a DW, the resulting full precision data is stored in + * the accumulator." + * + * There are also similar notes in earlier PRMs. + * + * The MOV instruction can copy the bits of the source, but it + * does not clear the higher bits of the accumulator. So, because + * we might use the full accumulator in the MUL/MACH macro, we + * shouldn't replace such MULs with MOVs. 
+ */ + if ((brw_reg_type_to_size(inst->src[0].type) == 4 || + brw_reg_type_to_size(inst->src[1].type) == 4) && + (inst->dst.is_accumulator() || + inst->writes_accumulator_implicitly(devinfo))) + break; + + /* a * 1.0 = a */ + if (inst->src[1].is_one()) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[1] = reg_undef; + progress = true; + break; + } + + /* a * -1.0 = -a */ + if (inst->src[1].is_negative_one()) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[0].negate = !inst->src[0].negate; + inst->src[1] = reg_undef; + progress = true; + break; + } + + break; + case BRW_OPCODE_ADD: + if (inst->src[1].file != IMM) + continue; + + if (brw_reg_type_is_integer(inst->src[1].type) && + inst->src[1].is_zero()) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[1] = reg_undef; + progress = true; + break; + } + + if (inst->src[0].file == IMM) { + assert(inst->src[0].type == BRW_REGISTER_TYPE_F); + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[0].f += inst->src[1].f; + inst->src[1] = reg_undef; + progress = true; + break; + } + break; + + case BRW_OPCODE_AND: + if (inst->src[0].file == IMM && inst->src[1].file == IMM) { + const uint64_t src0 = src_as_uint(inst->src[0]); + const uint64_t src1 = src_as_uint(inst->src[1]); + + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[0] = brw_imm_for_type(src0 & src1, inst->dst.type); + inst->src[1] = reg_undef; + progress = true; + break; + } + + break; + + case BRW_OPCODE_OR: + if (inst->src[0].file == IMM && inst->src[1].file == IMM) { + const uint64_t src0 = src_as_uint(inst->src[0]); + const uint64_t src1 = src_as_uint(inst->src[1]); + + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[0] = brw_imm_for_type(src0 | src1, inst->dst.type); + inst->src[1] = reg_undef; + progress = true; + break; + } + + if (inst->src[0].equals(inst->src[1]) || + inst->src[1].is_zero()) { + /* On Gfx8+, the OR instruction can have a source modifier 
that + * performs logical not on the operand. Cases of 'OR r0, ~r1, 0' + * or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV. + */ + if (inst->src[0].negate) { + inst->opcode = BRW_OPCODE_NOT; + inst->sources = 1; + inst->src[0].negate = false; + } else { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + } + inst->src[1] = reg_undef; + progress = true; + break; + } + break; + case BRW_OPCODE_CMP: + if ((inst->conditional_mod == BRW_CONDITIONAL_Z || + inst->conditional_mod == BRW_CONDITIONAL_NZ) && + inst->src[1].is_zero() && + (inst->src[0].abs || inst->src[0].negate)) { + inst->src[0].abs = false; + inst->src[0].negate = false; + progress = true; + break; + } + break; + case BRW_OPCODE_SEL: + if (!devinfo->has_64bit_float && + !devinfo->has_64bit_int && + (inst->dst.type == BRW_REGISTER_TYPE_DF || + inst->dst.type == BRW_REGISTER_TYPE_UQ || + inst->dst.type == BRW_REGISTER_TYPE_Q)) { + assert(inst->dst.type == inst->src[0].type); + assert(!inst->saturate); + assert(!inst->src[0].abs && !inst->src[0].negate); + assert(!inst->src[1].abs && !inst->src[1].negate); + const brw::fs_builder ibld(this, block, inst); + + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); + + set_predicate(inst->predicate, + ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0), + subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0), + subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0))); + set_predicate(inst->predicate, + ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1), + subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1), + subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1))); + + inst->remove(block); + progress = true; + } + if (inst->src[0].equals(inst->src[1])) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[1] = reg_undef; + inst->predicate = BRW_PREDICATE_NONE; + inst->predicate_inverse = false; + progress = true; + } else if (inst->saturate && inst->src[1].file == IMM) { + switch (inst->conditional_mod) { + case BRW_CONDITIONAL_LE: + case 
BRW_CONDITIONAL_L: + switch (inst->src[1].type) { + case BRW_REGISTER_TYPE_F: + if (inst->src[1].f >= 1.0f) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[1] = reg_undef; + inst->conditional_mod = BRW_CONDITIONAL_NONE; + progress = true; + } + break; + default: + break; + } + break; + case BRW_CONDITIONAL_GE: + case BRW_CONDITIONAL_G: + switch (inst->src[1].type) { + case BRW_REGISTER_TYPE_F: + if (inst->src[1].f <= 0.0f) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[1] = reg_undef; + inst->conditional_mod = BRW_CONDITIONAL_NONE; + progress = true; + } + break; + default: + break; + } + default: + break; + } + } + break; + case BRW_OPCODE_MAD: + if (inst->src[0].type != BRW_REGISTER_TYPE_F || + inst->src[1].type != BRW_REGISTER_TYPE_F || + inst->src[2].type != BRW_REGISTER_TYPE_F) + break; + if (inst->src[1].is_one()) { + inst->opcode = BRW_OPCODE_ADD; + inst->sources = 2; + inst->src[1] = inst->src[2]; + inst->src[2] = reg_undef; + progress = true; + } else if (inst->src[2].is_one()) { + inst->opcode = BRW_OPCODE_ADD; + inst->sources = 2; + inst->src[2] = reg_undef; + progress = true; + } + break; + case BRW_OPCODE_SHL: + if (inst->src[0].file == IMM && inst->src[1].file == IMM) { + /* It's not currently possible to generate this, and this constant + * folding does not handle it. + */ + assert(!inst->saturate); + + fs_reg result; + + switch (type_sz(inst->src[0].type)) { + case 2: + result = brw_imm_uw(0x0ffff & (inst->src[0].ud << (inst->src[1].ud & 0x1f))); + break; + case 4: + result = brw_imm_ud(inst->src[0].ud << (inst->src[1].ud & 0x1f)); + break; + case 8: + result = brw_imm_uq(inst->src[0].u64 << (inst->src[1].ud & 0x3f)); + break; + default: + /* Just in case a future platform re-enables B or UB types. 
*/ + unreachable("Invalid source size."); + } + + inst->opcode = BRW_OPCODE_MOV; + inst->src[0] = retype(result, inst->dst.type); + inst->src[1] = reg_undef; + inst->sources = 1; + + progress = true; + } + break; + + case SHADER_OPCODE_BROADCAST: + if (is_uniform(inst->src[0])) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->force_writemask_all = true; + progress = true; + } else if (inst->src[1].file == IMM) { + inst->opcode = BRW_OPCODE_MOV; + /* It's possible that the selected component will be too large and + * overflow the register. This can happen if someone does a + * readInvocation() from GLSL or SPIR-V and provides an OOB + * invocationIndex. If this happens and we some how manage + * to constant fold it in and get here, then component() may cause + * us to start reading outside of the VGRF which will lead to an + * assert later. Instead, just let it wrap around if it goes over + * exec_size. + */ + const unsigned comp = inst->src[1].ud & (inst->exec_size - 1); + inst->src[0] = component(inst->src[0], comp); + inst->sources = 1; + inst->force_writemask_all = true; + progress = true; + } + break; + + case SHADER_OPCODE_SHUFFLE: + if (is_uniform(inst->src[0])) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + progress = true; + } else if (inst->src[1].file == IMM) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[0] = component(inst->src[0], + inst->src[1].ud); + inst->sources = 1; + progress = true; + } + break; + + default: + break; + } + + /* Ensure that the correct source has the immediate value. 2-source + * instructions must have the immediate in src[1]. On Gfx12 and later, + * some 3-source instructions can have the immediate in src[0] or + * src[2]. It's complicated, so don't mess with 3-source instructions + * here. 
+ */ + if (progress && inst->sources == 2 && inst->is_commutative()) { + if (inst->src[0].file == IMM) { + fs_reg tmp = inst->src[1]; + inst->src[1] = inst->src[0]; + inst->src[0] = tmp; + } + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW | + DEPENDENCY_INSTRUCTION_DETAIL); + + return progress; +} + +static unsigned +load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read) +{ + assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD); + assert(size_read >= lp->header_size * REG_SIZE); + + unsigned i; + unsigned size = lp->header_size * REG_SIZE; + for (i = lp->header_size; size < size_read && i < lp->sources; i++) + size += lp->exec_size * type_sz(lp->src[i].type); + + /* Size read must cover exactly a subset of sources. */ + assert(size == size_read); + return i; +} + +/** + * Optimize sample messages that have constant zero values for the trailing + * parameters. We can just reduce the message length for these + * instructions instead of reserving a register for it. Trailing parameters + * that aren't sent default to zero anyway. This will cause the dead code + * eliminator to remove the MOV instruction that would otherwise be emitted to + * set up the zero value. + */ +bool +fs_visitor::opt_zero_samples() +{ + /* Implementation supports only SENDs, so applicable to Gfx7+ only. */ + assert(devinfo->ver >= 7); + + bool progress = false; + + foreach_block_and_inst(block, fs_inst, send, cfg) { + if (send->opcode != SHADER_OPCODE_SEND || + send->sfid != BRW_SFID_SAMPLER) + continue; + + /* Wa_14012688258: + * + * Don't trim zeros at the end of payload for sample operations + * in cube and cube arrays. + */ + if (send->keep_payload_trailing_zeros) + continue; + + /* This pass works on SENDs before splitting. 
*/ + if (send->ex_mlen > 0) + continue; + + fs_inst *lp = (fs_inst *) send->prev; + + if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD) + continue; + + /* How much of the payload are actually read by this SEND. */ + const unsigned params = + load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE); + + /* We don't want to remove the message header or the first parameter. + * Removing the first parameter is not allowed, see the Haswell PRM + * volume 7, page 149: + * + * "Parameter 0 is required except for the sampleinfo message, which + * has no parameter 0" + */ + const unsigned first_param_idx = lp->header_size; + unsigned zero_size = 0; + for (unsigned i = params - 1; i > first_param_idx; i--) { + if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero()) + break; + zero_size += lp->exec_size * type_sz(lp->src[i].type) * lp->dst.stride; + } + + const unsigned zero_len = zero_size / (reg_unit(devinfo) * REG_SIZE); + if (zero_len > 0) { + send->mlen -= zero_len; + progress = true; + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL); + + return progress; +} + +/** + * Opportunistically split SEND message payloads. + * + * Gfx9+ supports "split" SEND messages, which take two payloads that are + * implicitly concatenated. If we find a SEND message with a single payload, + * we can split that payload in two. This results in smaller contiguous + * register blocks for us to allocate. But it can help beyond that, too. + * + * We try and split a LOAD_PAYLOAD between sources which change registers. + * For example, a sampler message often contains a x/y/z coordinate that may + * already be in a contiguous VGRF, combined with an LOD, shadow comparitor, + * or array index, which comes from elsewhere. In this case, the first few + * sources will be different offsets of the same VGRF, then a later source + * will be a different VGRF. So we split there, possibly eliminating the + * payload concatenation altogether. 
+ */ +bool +fs_visitor::opt_split_sends() +{ + if (devinfo->ver < 9) + return false; + + bool progress = false; + + foreach_block_and_inst(block, fs_inst, send, cfg) { + if (send->opcode != SHADER_OPCODE_SEND || + send->mlen <= reg_unit(devinfo) || send->ex_mlen > 0) + continue; + + assert(send->src[2].file == VGRF); + + /* Currently don't split sends that reuse a previously used payload. */ + fs_inst *lp = (fs_inst *) send->prev; + + if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD) + continue; + + if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr) + continue; + + /* Split either after the header (if present), or when consecutive + * sources switch from one VGRF to a different one. + */ + unsigned mid = lp->header_size; + if (mid == 0) { + for (mid = 1; mid < lp->sources; mid++) { + if (lp->src[mid].file == BAD_FILE) + continue; + + if (lp->src[0].file != lp->src[mid].file || + lp->src[0].nr != lp->src[mid].nr) + break; + } + } + + /* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so + * find out how many sources from the payload does it really need. + */ + const unsigned end = + load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE); + + /* Nothing to split. 
*/ + if (end <= mid) + continue; + + const fs_builder ibld(this, block, lp); + fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size); + fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0); + + assert(lp1->size_written % REG_SIZE == 0); + assert(lp2->size_written % REG_SIZE == 0); + assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen); + + lp1->dst = fs_reg(VGRF, alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type); + lp2->dst = fs_reg(VGRF, alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type); + + send->resize_sources(4); + send->src[2] = lp1->dst; + send->src[3] = lp2->dst; + send->ex_mlen = lp2->size_written / REG_SIZE; + send->mlen -= send->ex_mlen; + + progress = true; + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +/** + * Remove redundant or useless halts. + * + * For example, we can eliminate halts in the following sequence: + * + * halt (redundant with the next halt) + * halt (useless; jumps to the next instruction) + * halt-target + */ +bool +fs_visitor::opt_redundant_halt() +{ + bool progress = false; + + unsigned halt_count = 0; + fs_inst *halt_target = NULL; + bblock_t *halt_target_block = NULL; + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->opcode == BRW_OPCODE_HALT) + halt_count++; + + if (inst->opcode == SHADER_OPCODE_HALT_TARGET) { + halt_target = inst; + halt_target_block = block; + break; + } + } + + if (!halt_target) { + assert(halt_count == 0); + return false; + } + + /* Delete any HALTs immediately before the halt target. 
*/ + for (fs_inst *prev = (fs_inst *) halt_target->prev; + !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT; + prev = (fs_inst *) halt_target->prev) { + prev->remove(halt_target_block); + halt_count--; + progress = true; + } + + if (halt_count == 0) { + halt_target->remove(halt_target_block); + progress = true; + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +/** + * Compute a bitmask with GRF granularity with a bit set for each GRF starting + * from \p r.offset which overlaps the region starting at \p s.offset and + * spanning \p ds bytes. + */ +static inline unsigned +mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds) +{ + const int rel_offset = reg_offset(s) - reg_offset(r); + const int shift = rel_offset / REG_SIZE; + const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE); + assert(reg_space(r) == reg_space(s) && + shift >= 0 && shift < int(8 * sizeof(unsigned))); + return ((1 << n) - 1) << shift; +} + +bool +fs_visitor::compute_to_mrf() +{ + bool progress = false; + int next_ip = 0; + + /* No MRFs on Gen >= 7. */ + if (devinfo->ver >= 7) + return false; + + const fs_live_variables &live = live_analysis.require(); + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + int ip = next_ip; + next_ip++; + + if (inst->opcode != BRW_OPCODE_MOV || + inst->is_partial_write() || + inst->dst.file != MRF || inst->src[0].file != VGRF || + inst->dst.type != inst->src[0].type || + inst->src[0].abs || inst->src[0].negate || + !inst->src[0].is_contiguous() || + inst->src[0].offset % REG_SIZE != 0) + continue; + + /* Can't compute-to-MRF this GRF if someone else was going to + * read it later. + */ + if (live.vgrf_end[inst->src[0].nr] > ip) + continue; + + /* Found a move of a GRF to a MRF. Let's see if we can go rewrite the + * things that computed the value of all GRFs of the source region. 
The + * regs_left bitset keeps track of the registers we haven't yet found a + * generating instruction for. + */ + unsigned regs_left = (1 << regs_read(inst, 0)) - 1; + + foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) { + /* Found the last thing to write our reg we want to turn + * into a compute-to-MRF. + */ + + /* If this one instruction didn't populate all the + * channels, bail. We might be able to rewrite everything + * that writes that reg, but it would require smarter + * tracking. + */ + if (scan_inst->is_partial_write()) + break; + + /* Handling things not fully contained in the source of the copy + * would need us to understand coalescing out more than one MOV at + * a time. + */ + if (!region_contained_in(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) + break; + + /* SEND instructions can't have MRF as a destination. */ + if (scan_inst->mlen) + break; + + if (devinfo->ver == 6) { + /* gfx6 math instructions must have the destination be + * GRF, so no compute-to-MRF for them. + */ + if (scan_inst->is_math()) { + break; + } + } + + /* Clear the bits for any registers this instruction overwrites. */ + regs_left &= ~mask_relative_to( + inst->src[0], scan_inst->dst, scan_inst->size_written); + if (!regs_left) + break; + } + + /* We don't handle control flow here. Most computation of + * values that end up in MRFs are shortly before the MRF + * write anyway. + */ + if (block->start() == scan_inst) + break; + + /* You can't read from an MRF, so if someone else reads our + * MRF's source GRF that we wanted to rewrite, that stops us. 
+ */ + bool interfered = false; + for (int i = 0; i < scan_inst->sources; i++) { + if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i), + inst->src[0], inst->size_read(0))) { + interfered = true; + } + } + if (interfered) + break; + + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->dst, inst->size_written)) { + /* If somebody else writes our MRF here, we can't + * compute-to-MRF before that. + */ + break; + } + + if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 && + regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE, + inst->dst, inst->size_written)) { + /* Found a SEND instruction, which means that there are + * live values in MRFs from base_mrf to base_mrf + + * scan_inst->mlen - 1. Don't go pushing our MRF write up + * above it. + */ + break; + } + } + + if (regs_left) + continue; + + /* Found all generating instructions of our MRF's source value, so it + * should be safe to rewrite them to point to the MRF directly. + */ + regs_left = (1 << regs_read(inst, 0)) - 1; + + foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) { + /* Clear the bits for any registers this instruction overwrites. */ + regs_left &= ~mask_relative_to( + inst->src[0], scan_inst->dst, scan_inst->size_written); + + const unsigned rel_offset = reg_offset(scan_inst->dst) - + reg_offset(inst->src[0]); + + if (inst->dst.nr & BRW_MRF_COMPR4) { + /* Apply the same address transformation done by the hardware + * for COMPR4 MRF writes. + */ + assert(rel_offset < 2 * REG_SIZE); + scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4; + + /* Clear the COMPR4 bit if the generating instruction is not + * compressed. + */ + if (scan_inst->size_written < 2 * REG_SIZE) + scan_inst->dst.nr &= ~BRW_MRF_COMPR4; + + } else { + /* Calculate the MRF number the result of this instruction is + * ultimately written to. 
+ */ + scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE; + } + + scan_inst->dst.file = MRF; + scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE; + scan_inst->saturate |= inst->saturate; + if (!regs_left) + break; + } + } + + assert(!regs_left); + inst->remove(block); + progress = true; + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +/** + * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control + * flow. We could probably do better here with some form of divergence + * analysis. + */ +bool +fs_visitor::eliminate_find_live_channel() +{ + bool progress = false; + unsigned depth = 0; + + if (!brw_stage_has_packed_dispatch(devinfo, stage, max_polygons, + stage_prog_data)) { + /* The optimization below assumes that channel zero is live on thread + * dispatch, which may not be the case if the fixed function dispatches + * threads sparsely. + */ + return false; + } + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + switch (inst->opcode) { + case BRW_OPCODE_IF: + case BRW_OPCODE_DO: + depth++; + break; + + case BRW_OPCODE_ENDIF: + case BRW_OPCODE_WHILE: + depth--; + break; + + case BRW_OPCODE_HALT: + /* This can potentially make control flow non-uniform until the end + * of the program. + */ + goto out; + + case SHADER_OPCODE_FIND_LIVE_CHANNEL: + if (depth == 0) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[0] = brw_imm_ud(0u); + inst->sources = 1; + inst->force_writemask_all = true; + progress = true; + } + break; + + default: + break; + } + } + +out: + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL); + + return progress; +} + +/** + * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE + * instructions to FS_OPCODE_REP_FB_WRITE. 
+ */ +void +fs_visitor::emit_repclear_shader() +{ + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + fs_inst *write = NULL; + + assert(uniforms == 0); + assume(key->nr_color_regions > 0); + + fs_reg color_output, header; + if (devinfo->ver >= 7) { + color_output = retype(brw_vec4_grf(127, 0), BRW_REGISTER_TYPE_UD); + header = retype(brw_vec8_grf(125, 0), BRW_REGISTER_TYPE_UD); + } else { + color_output = retype(brw_vec4_reg(MRF, 2, 0), BRW_REGISTER_TYPE_UD); + header = retype(brw_vec8_reg(MRF, 0, 0), BRW_REGISTER_TYPE_UD); + } + + /* We pass the clear color as a flat input. Copy it to the output. */ + fs_reg color_input = + brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_UD, + BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4, + BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); + + const fs_builder bld = fs_builder(this).at_end(); + bld.exec_all().group(4, 0).MOV(color_output, color_input); + + if (key->nr_color_regions > 1) { + /* Copy g0..g1 as the message header */ + bld.exec_all().group(16, 0) + .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + } + + for (int i = 0; i < key->nr_color_regions; ++i) { + if (i > 0) + bld.exec_all().group(1, 0).MOV(component(header, 2), brw_imm_ud(i)); + + if (devinfo->ver >= 7) { + write = bld.emit(SHADER_OPCODE_SEND); + write->resize_sources(3); + write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE; + write->src[0] = brw_imm_ud(0); + write->src[1] = brw_imm_ud(0); + write->src[2] = i == 0 ? color_output : header; + write->check_tdr = true; + write->send_has_side_effects = true; + write->desc = brw_fb_write_desc(devinfo, i, + BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED, + i == key->nr_color_regions - 1, false); + } else { + write = bld.emit(FS_OPCODE_REP_FB_WRITE); + write->target = i; + write->base_mrf = i == 0 ? color_output.nr : header.nr; + } + + /* We can use a headerless message for the first render target */ + write->header_size = i == 0 ? 
0 : 2; + write->mlen = 1 + write->header_size; + } + write->eot = true; + write->last_rt = true; + + calculate_cfg(); + + this->first_non_payload_grf = payload().num_regs; + + lower_scoreboard(); +} + +/** + * Walks through basic blocks, looking for repeated MRF writes and + * removing the later ones. + */ +bool +fs_visitor::remove_duplicate_mrf_writes() +{ + fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->ver)]; + bool progress = false; + + /* Need to update the MRF tracking for compressed instructions. */ + if (dispatch_width >= 16) + return false; + + memset(last_mrf_move, 0, sizeof(last_mrf_move)); + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + if (inst->is_control_flow()) { + memset(last_mrf_move, 0, sizeof(last_mrf_move)); + } + + if (inst->opcode == BRW_OPCODE_MOV && + inst->dst.file == MRF) { + fs_inst *prev_inst = last_mrf_move[inst->dst.nr]; + if (prev_inst && prev_inst->opcode == BRW_OPCODE_MOV && + inst->dst.equals(prev_inst->dst) && + inst->src[0].equals(prev_inst->src[0]) && + inst->saturate == prev_inst->saturate && + inst->predicate == prev_inst->predicate && + inst->conditional_mod == prev_inst->conditional_mod && + inst->exec_size == prev_inst->exec_size) { + inst->remove(block); + progress = true; + continue; + } + } + + /* Clear out the last-write records for MRFs that were overwritten. */ + if (inst->dst.file == MRF) { + last_mrf_move[inst->dst.nr] = NULL; + } + + if (inst->mlen > 0 && inst->base_mrf != -1) { + /* Found a SEND instruction, which will include two or fewer + * implied MRF writes. We could do better here. + */ + for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { + last_mrf_move[inst->base_mrf + i] = NULL; + } + } + + /* Clear out any MRF move records whose sources got overwritten. 
*/ + for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) { + if (last_mrf_move[i] && + regions_overlap(inst->dst, inst->size_written, + last_mrf_move[i]->src[0], + last_mrf_move[i]->size_read(0))) { + last_mrf_move[i] = NULL; + } + } + + if (inst->opcode == BRW_OPCODE_MOV && + inst->dst.file == MRF && + inst->src[0].file != ARF && + !inst->is_partial_write()) { + last_mrf_move[inst->dst.nr] = inst; + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +/** + * Rounding modes for conversion instructions are included for each + * conversion, but right now it is a state. So once it is set, + * we don't need to call it again for subsequent calls. + * + * This is useful for vector/matrices conversions, as setting the + * mode once is enough for the full vector/matrix + */ +bool +fs_visitor::remove_extra_rounding_modes() +{ + bool progress = false; + unsigned execution_mode = this->nir->info.float_controls_execution_mode; + + brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED; + if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) & + execution_mode) + base_mode = BRW_RND_MODE_RTNE; + if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) & + execution_mode) + base_mode = BRW_RND_MODE_RTZ; + + foreach_block (block, cfg) { + brw_rnd_mode prev_mode = base_mode; + + foreach_inst_in_block_safe (fs_inst, inst, block) { + if (inst->opcode == SHADER_OPCODE_RND_MODE) { + assert(inst->src[0].file == BRW_IMMEDIATE_VALUE); + const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d; + if (mode == prev_mode) { + inst->remove(block); + progress = true; + } else { + prev_mode = mode; + } + } + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +static void +clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len) +{ + /* 
Clear the flag for registers that actually got read (as expected). */ + for (int i = 0; i < inst->sources; i++) { + int grf; + if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) { + grf = inst->src[i].nr; + } else { + continue; + } + + if (grf >= first_grf && + grf < first_grf + grf_len) { + deps[grf - first_grf] = false; + if (inst->exec_size == 16) + deps[grf - first_grf + 1] = false; + } + } +} + +/** + * Implements this workaround for the original 965: + * + * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not + * check for post destination dependencies on this instruction, software + * must ensure that there is no destination hazard for the case of ‘write + * followed by a posted write’ shown in the following example. + * + * 1. mov r3 0 + * 2. send r3.xy + * 3. mov r2 r3 + * + * Due to no post-destination dependency check on the ‘send’, the above + * code sequence could have two instructions (1 and 2) in flight at the + * same time that both consider ‘r3’ as the target of their final writes. + */ +void +fs_visitor::insert_gfx4_pre_send_dependency_workarounds(bblock_t *block, + fs_inst *inst) +{ + int write_len = regs_written(inst); + int first_write_grf = inst->dst.nr; + bool needs_dep[BRW_MAX_MRF(devinfo->ver)]; + assert(write_len < (int)sizeof(needs_dep) - 1); + + memset(needs_dep, false, sizeof(needs_dep)); + memset(needs_dep, true, write_len); + + clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len); + + /* Walk backwards looking for writes to registers we're writing which + * aren't read since being written. If we hit the start of the program, + * we assume that there are no outstanding dependencies on entry to the + * program. + */ + foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { + /* If we hit control flow, assume that there *are* outstanding + * dependencies, and force their cleanup before our instruction. 
+ */ + if (block->start() == scan_inst && block->num != 0) { + for (int i = 0; i < write_len; i++) { + if (needs_dep[i]) + DEP_RESOLVE_MOV(fs_builder(this, block, inst), + first_write_grf + i); + } + return; + } + + /* We insert our reads as late as possible on the assumption that any + * instruction but a MOV that might have left us an outstanding + * dependency has more latency than a MOV. + */ + if (scan_inst->dst.file == VGRF) { + for (unsigned i = 0; i < regs_written(scan_inst); i++) { + int reg = scan_inst->dst.nr + i; + + if (reg >= first_write_grf && + reg < first_write_grf + write_len && + needs_dep[reg - first_write_grf]) { + DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg); + needs_dep[reg - first_write_grf] = false; + if (scan_inst->exec_size == 16) + needs_dep[reg - first_write_grf + 1] = false; + } + } + } + + /* Clear the flag for registers that actually got read (as expected). */ + clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len); + + /* Continue the loop only if we haven't resolved all the dependencies */ + int i; + for (i = 0; i < write_len; i++) { + if (needs_dep[i]) + break; + } + if (i == write_len) + return; + } +} + +/** + * Implements this workaround for the original 965: + * + * "[DevBW, DevCL] Errata: A destination register from a send can not be + * used as a destination register until after it has been sourced by an + * instruction with a different destination register. + */ +void +fs_visitor::insert_gfx4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst) +{ + int write_len = regs_written(inst); + unsigned first_write_grf = inst->dst.nr; + bool needs_dep[BRW_MAX_MRF(devinfo->ver)]; + assert(write_len < (int)sizeof(needs_dep) - 1); + + memset(needs_dep, false, sizeof(needs_dep)); + memset(needs_dep, true, write_len); + /* Walk forwards looking for writes to registers we're writing which aren't + * read before being written. 
+ */ + foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) { + /* If we hit control flow, force resolve all remaining dependencies. */ + if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) { + for (int i = 0; i < write_len; i++) { + if (needs_dep[i]) + DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst), + first_write_grf + i); + } + return; + } + + /* Clear the flag for registers that actually got read (as expected). */ + clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len); + + /* We insert our reads as late as possible since they're reading the + * result of a SEND, which has massive latency. + */ + if (scan_inst->dst.file == VGRF && + scan_inst->dst.nr >= first_write_grf && + scan_inst->dst.nr < first_write_grf + write_len && + needs_dep[scan_inst->dst.nr - first_write_grf]) { + DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst), + scan_inst->dst.nr); + needs_dep[scan_inst->dst.nr - first_write_grf] = false; + } + + /* Continue the loop only if we haven't resolved all the dependencies */ + int i; + for (i = 0; i < write_len; i++) { + if (needs_dep[i]) + break; + } + if (i == write_len) + return; + } +} + +void +fs_visitor::insert_gfx4_send_dependency_workarounds() +{ + if (devinfo->ver != 4 || devinfo->platform == INTEL_PLATFORM_G4X) + return; + + bool progress = false; + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->mlen != 0 && inst->dst.file == VGRF) { + insert_gfx4_pre_send_dependency_workarounds(block, inst); + insert_gfx4_post_send_dependency_workarounds(block, inst); + progress = true; + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); +} + +bool +fs_visitor::lower_load_payload() +{ + bool progress = false; + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) + continue; + + assert(inst->dst.file == MRF || inst->dst.file == VGRF); + assert(inst->saturate == false); + fs_reg dst = inst->dst; + + /* Get rid of 
COMPR4. We'll add it back in if we need it */ + if (dst.file == MRF) + dst.nr = dst.nr & ~BRW_MRF_COMPR4; + + const fs_builder ibld(this, block, inst); + const fs_builder ubld = ibld.exec_all(); + + for (uint8_t i = 0; i < inst->header_size;) { + /* Number of header GRFs to initialize at once with a single MOV + * instruction. + */ + const unsigned n = + (i + 1 < inst->header_size && inst->src[i].stride == 1 && + inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ? + 2 : 1; + + if (inst->src[i].file != BAD_FILE) + ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD), + retype(inst->src[i], BRW_REGISTER_TYPE_UD)); + + dst = byte_offset(dst, n * REG_SIZE); + i += n; + } + + if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) && + inst->exec_size > 8) { + /* In this case, the payload portion of the LOAD_PAYLOAD isn't + * a straightforward copy. Instead, the result of the + * LOAD_PAYLOAD is treated as interleaved and the first four + * non-header sources are unpacked as: + * + * m + 0: r0 + * m + 1: g0 + * m + 2: b0 + * m + 3: a0 + * m + 4: r1 + * m + 5: g1 + * m + 6: b1 + * m + 7: a1 + * + * This is used for gen <= 5 fb writes. + */ + assert(inst->exec_size == 16); + assert(inst->header_size + 4 <= inst->sources); + for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) { + if (inst->src[i].file != BAD_FILE) { + if (devinfo->has_compr4) { + fs_reg compr4_dst = retype(dst, inst->src[i].type); + compr4_dst.nr |= BRW_MRF_COMPR4; + ibld.MOV(compr4_dst, inst->src[i]); + } else { + /* Platform doesn't have COMPR4. We have to fake it */ + fs_reg mov_dst = retype(dst, inst->src[i].type); + ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0)); + mov_dst.nr += 4; + ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1)); + } + } + + dst.nr++; + } + + /* The loop above only ever incremented us through the first set + * of 4 registers. 
However, thanks to the magic of COMPR4, we + * actually wrote to the first 8 registers, so we need to take + * that into account now. + */ + dst.nr += 4; + + /* The COMPR4 code took care of the first 4 sources. We'll let + * the regular path handle any remaining sources. Yes, we are + * modifying the instruction but we're about to delete it so + * this really doesn't hurt anything. + */ + inst->header_size += 4; + } + + for (uint8_t i = inst->header_size; i < inst->sources; i++) { + dst.type = inst->src[i].type; + if (inst->src[i].file != BAD_FILE) { + ibld.MOV(dst, inst->src[i]); + } + dst = offset(dst, ibld, 1); + } + + inst->remove(block); + progress = true; + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +/** + * Factor an unsigned 32-bit integer. + * + * Attempts to factor \c x into two values that are at most 0xFFFF. If no + * such factorization is possible, either because the value is too large or is + * prime, both \c result_a and \c result_b will be zero. + */ +static void +factor_uint32(uint32_t x, unsigned *result_a, unsigned *result_b) +{ + /* This is necessary to prevent various opportunities for division by zero + * below. + */ + assert(x > 0xffff); + + /* This represents the actual expected constraints on the input. Namely, + * both the upper and lower words should be > 1. + */ + assert(x >= 0x00020002); + + *result_a = 0; + *result_b = 0; + + /* The value is too large to factor with the constraints. */ + if (x > (0xffffu * 0xffffu)) + return; + + /* A non-prime number will have the form p*q*d where p is some prime + * number, q > 1, and 1 <= d <= q. To meet the constraints of this + * function, (p*d) < 0x10000. This implies d <= floor(0xffff / p). + * Furthermore, since q < 0x10000, d >= floor(x / (0xffff * p)). Finally, + * floor(x / (0xffff * p)) <= d <= floor(0xffff / p). + * + * The observation is finding the largest possible value of p reduces the + * possible range of d. 
After selecting p, all values of d in this range + * are tested until a factorization is found. The size of the range of + * possible values of d sets an upper bound on the run time of the + * function. + */ + static const uint16_t primes[256] = { + 2, 3, 5, 7, 11, 13, 17, 19, + 23, 29, 31, 37, 41, 43, 47, 53, + 59, 61, 67, 71, 73, 79, 83, 89, + 97, 101, 103, 107, 109, 113, 127, 131, /* 32 */ + 137, 139, 149, 151, 157, 163, 167, 173, + 179, 181, 191, 193, 197, 199, 211, 223, + 227, 229, 233, 239, 241, 251, 257, 263, + 269, 271, 277, 281, 283, 293, 307, 311, /* 64 */ + 313, 317, 331, 337, 347, 349, 353, 359, + 367, 373, 379, 383, 389, 397, 401, 409, + 419, 421, 431, 433, 439, 443, 449, 457, + 461, 463, 467, 479, 487, 491, 499, 503, /* 96 */ + 509, 521, 523, 541, 547, 557, 563, 569, + 571, 577, 587, 593, 599, 601, 607, 613, + 617, 619, 631, 641, 643, 647, 653, 659, + 661, 673, 677, 683, 691, 701, 709, 719, /* 128 */ + 727, 733, 739, 743, 751, 757, 761, 769, + 773, 787, 797, 809, 811, 821, 823, 827, + 829, 839, 853, 857, 859, 863, 877, 881, + 883, 887, 907, 911, 919, 929, 937, 941, /* 160 */ + 947, 953, 967, 971, 977, 983, 991, 997, + 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, + 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097, + 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, /* 192 */ + 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223, + 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283, + 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321, + 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, /* 224 */ + 1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459, + 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511, + 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, + 1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619, /* 256 */ + }; + + unsigned p; + unsigned x_div_p; + + for (int i = ARRAY_SIZE(primes) - 1; i >= 0; i--) { + p = primes[i]; + x_div_p = x / p; + + if ((x_div_p * p) == x) + break; + } + + /* A prime factor was not found. 
*/ + if (x_div_p * p != x) + return; + + /* Terminate early if d=1 is a solution. */ + if (x_div_p < 0x10000) { + *result_a = x_div_p; + *result_b = p; + return; + } + + /* Pick the maximum possible value for 'd'. It's important that the loop + * below execute while d <= max_d because max_d is a valid value. Having + * the wrong loop bound would cause 1627*1367*47 (0x063b0c83) to be + * incorrectly reported as not being factorable. The problem would occur + * with any value that is a factor of two primes in the table and one prime + * not in the table. + */ + const unsigned max_d = 0xffff / p; + + /* Pick an initial value of 'd' that (combined with rejecting too large + * values above) guarantees that 'q' will always be small enough. + * DIV_ROUND_UP is used to prevent 'd' from being zero. + */ + for (unsigned d = DIV_ROUND_UP(x_div_p, 0xffff); d <= max_d; d++) { + unsigned q = x_div_p / d; + + if ((q * d) == x_div_p) { + assert(p * d * q == x); + assert((p * d) < 0x10000); + + *result_a = q; + *result_b = p * d; + break; + } + + /* Since every value of 'd' is tried, as soon as 'd' is larger + * than 'q', we're just re-testing combinations that have + * already been tested. + */ + if (d > q) + break; + } +} + +void +fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block) +{ + const fs_builder ibld(this, block, inst); + + /* It is correct to use inst->src[1].d in both end of the comparison. + * Using .ud in the UINT16_MAX comparison would cause any negative value to + * fail the check. + */ + if (inst->src[1].file == IMM && + (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) { + /* The MUL instruction isn't commutative. On Gen <= 6, only the low + * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of + * src1 are used. + * + * If multiplying by an immediate value that fits in 16-bits, do a + * single MUL instruction with that value in the proper location. 
+ */ + const bool ud = (inst->src[1].d >= 0); + if (devinfo->ver < 7) { + fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type); + ibld.MOV(imm, inst->src[1]); + ibld.MUL(inst->dst, imm, inst->src[0]); + } else { + ibld.MUL(inst->dst, inst->src[0], + ud ? brw_imm_uw(inst->src[1].ud) + : brw_imm_w(inst->src[1].d)); + } + } else { + /* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot + * do 32-bit integer multiplication in one instruction, but instead + * must do a sequence (which actually calculates a 64-bit result): + * + * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D + * mach(8) null g3<8,8,1>D g4<8,8,1>D + * mov(8) g2<1>D acc0<8,8,1>D + * + * But on Gen > 6, the ability to use second accumulator register + * (acc1) for non-float data types was removed, preventing a simple + * implementation in SIMD16. A 16-channel result can be calculated by + * executing the three instructions twice in SIMD8, once with quarter + * control of 1Q for the first eight channels and again with 2Q for + * the second eight channels. + * + * Which accumulator register is implicitly accessed (by AccWrEnable + * for instance) is determined by the quarter control. Unfortunately + * Ivybridge (and presumably Baytrail) has a hardware bug in which an + * implicit accumulator access by an instruction with 2Q will access + * acc1 regardless of whether the data type is usable in acc1. + * + * Specifically, the 2Q mach(8) writes acc1 which does not exist for + * integer data types. 
+ * + * Since we only want the low 32-bits of the result, we can do two + * 32-bit x 16-bit multiplies (like the mul and mach are doing), and + * adjust the high result and add them (like the mach is doing): + * + * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW + * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW + * shl(8) g9<1>D g8<8,8,1>D 16D + * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D + * + * We avoid the shl instruction by realizing that we only want to add + * the low 16-bits of the "high" result to the high 16-bits of the + * "low" result and using proper regioning on the add: + * + * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW + * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW + * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW + * + * Since it does not use the (single) accumulator register, we can + * schedule multi-component multiplications much better. + */ + + bool needs_mov = false; + fs_reg orig_dst = inst->dst; + + /* Get a new VGRF for the "low" 32x16-bit multiplication result if + * reusing the original destination is impossible due to hardware + * restrictions, source/destination overlap, or it being the null + * register. + */ + fs_reg low = inst->dst; + if (orig_dst.is_null() || orig_dst.file == MRF || + regions_overlap(inst->dst, inst->size_written, + inst->src[0], inst->size_read(0)) || + regions_overlap(inst->dst, inst->size_written, + inst->src[1], inst->size_read(1)) || + inst->dst.stride >= 4) { + needs_mov = true; + low = fs_reg(VGRF, alloc.allocate(regs_written(inst)), + inst->dst.type); + } + + /* Get a new VGRF but keep the same stride as inst->dst */ + fs_reg high(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type); + high.stride = inst->dst.stride; + high.offset = inst->dst.offset % REG_SIZE; + + bool do_addition = true; + if (devinfo->ver >= 7) { + /* From Wa_1604601757: + * + * "When multiplying a DW and any lower precision integer, source modifier + * is not supported." 
+ * + * An unsupported negate modifier on src[1] would ordinarily be + * lowered by the subsequent lower_regioning pass. In this case that + * pass would spawn another dword multiply. Instead, lower the + * modifier first. + */ + const bool source_mods_unsupported = (devinfo->ver >= 12); + + if (inst->src[1].abs || (inst->src[1].negate && + source_mods_unsupported)) + lower_src_modifiers(this, block, inst, 1); + + if (inst->src[1].file == IMM) { + unsigned a; + unsigned b; + + /* If the immeditate value can be factored into two values, A and + * B, that each fit in 16-bits, the multiplication result can + * instead be calculated as (src1 * (A * B)) = ((src1 * A) * B). + * This saves an operation (the addition) and a temporary register + * (high). + * + * Skip the optimization if either the high word or the low word + * is 0 or 1. In these conditions, at least one of the + * multiplications generated by the straightforward method will be + * eliminated anyway. + */ + if (inst->src[1].ud > 0x0001ffff && + (inst->src[1].ud & 0xffff) > 1) { + factor_uint32(inst->src[1].ud, &a, &b); + + if (a != 0) { + ibld.MUL(low, inst->src[0], brw_imm_uw(a)); + ibld.MUL(low, low, brw_imm_uw(b)); + do_addition = false; + } + } + + if (do_addition) { + ibld.MUL(low, inst->src[0], + brw_imm_uw(inst->src[1].ud & 0xffff)); + ibld.MUL(high, inst->src[0], + brw_imm_uw(inst->src[1].ud >> 16)); + } + } else { + ibld.MUL(low, inst->src[0], + subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0)); + ibld.MUL(high, inst->src[0], + subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1)); + } + } else { + if (inst->src[0].abs) + lower_src_modifiers(this, block, inst, 0); + + ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0), + inst->src[1]); + ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1), + inst->src[1]); + } + + if (do_addition) { + ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1), + subscript(low, BRW_REGISTER_TYPE_UW, 1), + subscript(high, BRW_REGISTER_TYPE_UW, 0)); + } + + 
if (needs_mov || inst->conditional_mod) + set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low)); + } +} + +void +fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block) +{ + const fs_builder ibld(this, block, inst); + + /* Considering two 64-bit integers ab and cd where each letter ab + * corresponds to 32 bits, we get a 128-bit result WXYZ. We * cd + * only need to provide the YZ part of the result. ------- + * BD + * Only BD needs to be 64 bits. For AD and BC we only care + AD + * about the lower 32 bits (since they are part of the upper + BC + * 32 bits of our result). AC is not needed since it starts + AC + * on the 65th bit of the result. ------- + * WXYZ + */ + unsigned int q_regs = regs_written(inst); + unsigned int d_regs = (q_regs + 1) / 2; + + fs_reg bd(VGRF, alloc.allocate(q_regs), BRW_REGISTER_TYPE_UQ); + fs_reg ad(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD); + fs_reg bc(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD); + + /* Here we need the full 64 bit result for 32b * 32b. 
*/ + if (devinfo->has_integer_dword_mul) { + ibld.MUL(bd, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0), + subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0)); + } else { + fs_reg bd_high(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD); + fs_reg bd_low(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD); + const unsigned acc_width = reg_unit(devinfo) * 8; + fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD), + inst->group % acc_width); + + fs_inst *mul = ibld.MUL(acc, + subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0), + subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0)); + mul->writes_accumulator = true; + + ibld.MACH(bd_high, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0), + subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0)); + ibld.MOV(bd_low, acc); + + ibld.UNDEF(bd); + ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low); + ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high); + } + + ibld.MUL(ad, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1), + subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0)); + ibld.MUL(bc, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0), + subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1)); + + ibld.ADD(ad, ad, bc); + ibld.ADD(subscript(bd, BRW_REGISTER_TYPE_UD, 1), + subscript(bd, BRW_REGISTER_TYPE_UD, 1), ad); + + if (devinfo->has_64bit_int) { + ibld.MOV(inst->dst, bd); + } else { + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); + ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0), + subscript(bd, BRW_REGISTER_TYPE_UD, 0)); + ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1), + subscript(bd, BRW_REGISTER_TYPE_UD, 1)); + } +} + +void +fs_visitor::lower_mulh_inst(fs_inst *inst, bblock_t *block) +{ + const fs_builder ibld(this, block, inst); + + /* According to the BDW+ BSpec page for the "Multiply Accumulate + * High" instruction: + * + * "An added preliminary mov is required for source modification on + * src1: + * mov (8) r3.0<1>:d -r3<8;8,1>:d + * mul (8) acc0:d 
r2.0<8;8,1>:d r3.0<16;8,2>:uw + * mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d" + */ + if (devinfo->ver >= 8 && (inst->src[1].negate || inst->src[1].abs)) + lower_src_modifiers(this, block, inst, 1); + + /* Should have been lowered to 8-wide. */ + assert(inst->exec_size <= get_lowered_simd_width(this, inst)); + const unsigned acc_width = reg_unit(devinfo) * 8; + const fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), inst->dst.type), + inst->group % acc_width); + fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]); + fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]); + + if (devinfo->ver >= 8) { + /* Until Gfx8, integer multiplies read 32-bits from one source, + * and 16-bits from the other, and relying on the MACH instruction + * to generate the high bits of the result. + * + * On Gfx8, the multiply instruction does a full 32x32-bit + * multiply, but in order to do a 64-bit multiply we can simulate + * the previous behavior and then use a MACH instruction. + */ + assert(mul->src[1].type == BRW_REGISTER_TYPE_D || + mul->src[1].type == BRW_REGISTER_TYPE_UD); + mul->src[1].type = BRW_REGISTER_TYPE_UW; + mul->src[1].stride *= 2; + + if (mul->src[1].file == IMM) { + mul->src[1] = brw_imm_uw(mul->src[1].ud); + } + } else if (devinfo->verx10 == 70 && + inst->group > 0) { + /* Among other things the quarter control bits influence which + * accumulator register is used by the hardware for instructions + * that access the accumulator implicitly (e.g. MACH). A + * second-half instruction would normally map to acc1, which + * doesn't exist on Gfx7 and up (the hardware does emulate it for + * floating-point instructions *only* by taking advantage of the + * extra precision of acc0 not normally used for floating point + * arithmetic). 
+ * + * HSW and up are careful enough not to try to access an + * accumulator register that doesn't exist, but on earlier Gfx7 + * hardware we need to make sure that the quarter control bits are + * zero to avoid non-deterministic behaviour and emit an extra MOV + * to get the result masked correctly according to the current + * channel enables. + */ + mach->group = 0; + mach->force_writemask_all = true; + mach->dst = ibld.vgrf(inst->dst.type); + ibld.MOV(inst->dst, mach->dst); + } +} + +bool +fs_visitor::lower_integer_multiplication() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + if (inst->opcode == BRW_OPCODE_MUL) { + /* If the instruction is already in a form that does not need lowering, + * return early. + */ + if (devinfo->ver >= 7) { + if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4) + continue; + } else { + if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4) + continue; + } + + if ((inst->dst.type == BRW_REGISTER_TYPE_Q || + inst->dst.type == BRW_REGISTER_TYPE_UQ) && + (inst->src[0].type == BRW_REGISTER_TYPE_Q || + inst->src[0].type == BRW_REGISTER_TYPE_UQ) && + (inst->src[1].type == BRW_REGISTER_TYPE_Q || + inst->src[1].type == BRW_REGISTER_TYPE_UQ)) { + lower_mul_qword_inst(inst, block); + inst->remove(block); + progress = true; + } else if (!inst->dst.is_accumulator() && + (inst->dst.type == BRW_REGISTER_TYPE_D || + inst->dst.type == BRW_REGISTER_TYPE_UD) && + (!devinfo->has_integer_dword_mul || + devinfo->verx10 >= 125)) { + lower_mul_dword_inst(inst, block); + inst->remove(block); + progress = true; + } + } else if (inst->opcode == SHADER_OPCODE_MULH) { + lower_mulh_inst(inst, block); + inst->remove(block); + progress = true; + } + + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +bool +fs_visitor::lower_minmax() +{ + assert(devinfo->ver < 6); + + bool progress = false; + + 
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + const fs_builder ibld(this, block, inst); + + if (inst->opcode == BRW_OPCODE_SEL && + inst->predicate == BRW_PREDICATE_NONE) { + /* If src1 is an immediate value that is not NaN, then it can't be + * NaN. In that case, emit CMP because it is much better for cmod + * propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't + * support HF or DF, so it is not necessary to check for those. + */ + if (inst->src[1].type != BRW_REGISTER_TYPE_F || + (inst->src[1].file == IMM && !isnan(inst->src[1].f))) { + ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], + inst->conditional_mod); + } else { + ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1], + inst->conditional_mod); + } + inst->predicate = BRW_PREDICATE_NORMAL; + inst->conditional_mod = BRW_CONDITIONAL_NONE; + + progress = true; + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +bool +fs_visitor::lower_sub_sat() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + const fs_builder ibld(this, block, inst); + + if (inst->opcode == SHADER_OPCODE_USUB_SAT || + inst->opcode == SHADER_OPCODE_ISUB_SAT) { + /* The fundamental problem is the hardware performs source negation + * at the bit width of the source. If the source is 0x80000000D, the + * negation is 0x80000000D. As a result, subtractSaturate(0, + * 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There + * are at least three ways to resolve this: + * + * 1. Use the accumulator for the negated source. The accumulator is + * 33 bits, so our source 0x80000000 is sign-extended to + * 0x1800000000. The negation of which is 0x080000000. This + * doesn't help for 64-bit integers (which are already bigger than + * 33 bits). There are also only 8 accumulators, so SIMD16 or + * SIMD32 instructions would have to be split into multiple SIMD8 + * instructions. + * + * 2. Use slightly different math. 
For any n-bit value x, we know (x + * >> 1) != -(x >> 1). We can use this fact to only do + * subtractions involving (x >> 1). subtractSaturate(a, b) == + * subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)). + * + * 3. For unsigned sources, it is sufficient to replace the + * subtractSaturate with (a > b) ? a - b : 0. + * + * It may also be possible to use the SUBB instruction. This + * implicitly writes the accumulator, so it could only be used in the + * same situations as #1 above. It is further limited by only + * allowing UD sources. + */ + if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q && + inst->src[0].type != BRW_REGISTER_TYPE_UQ) { + fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type); + + ibld.MOV(acc, inst->src[1]); + fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]); + add->saturate = true; + add->src[0].negate = true; + } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) { + /* tmp = src1 >> 1; + * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp)); + */ + fs_reg tmp1 = ibld.vgrf(inst->src[0].type); + fs_reg tmp2 = ibld.vgrf(inst->src[0].type); + fs_reg tmp3 = ibld.vgrf(inst->src[0].type); + fs_inst *add; + + ibld.SHR(tmp1, inst->src[1], brw_imm_d(1)); + + add = ibld.ADD(tmp2, inst->src[1], tmp1); + add->src[1].negate = true; + + add = ibld.ADD(tmp3, inst->src[0], tmp1); + add->src[1].negate = true; + add->saturate = true; + + add = ibld.ADD(inst->dst, tmp3, tmp2); + add->src[1].negate = true; + add->saturate = true; + } else { + /* a > b ? 
a - b : 0 */ + ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], + BRW_CONDITIONAL_G); + + fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]); + add->src[1].negate = !add->src[1].negate; + + ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0)) + ->predicate = BRW_PREDICATE_NORMAL; + } + + inst->remove(block); + progress = true; + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +/** + * Get the mask of SIMD channels enabled during dispatch and not yet disabled + * by discard. Due to the layout of the sample mask in the fragment shader + * thread payload, \p bld is required to have a dispatch_width() not greater + * than 16 for fragment shaders. + */ +fs_reg +brw_sample_mask_reg(const fs_builder &bld) +{ + const fs_visitor &s = *bld.shader; + + if (s.stage != MESA_SHADER_FRAGMENT) { + return brw_imm_ud(0xffffffff); + } else if (brw_wm_prog_data(s.stage_prog_data)->uses_kill) { + assert(bld.dispatch_width() <= 16); + return brw_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16); + } else { + assert(s.devinfo->ver >= 6 && bld.dispatch_width() <= 16); + assert(s.devinfo->ver < 20); + return retype(brw_vec1_grf((bld.group() >= 16 ? 
2 : 1), 7), + BRW_REGISTER_TYPE_UW); + } +} + +uint32_t +brw_fb_write_msg_control(const fs_inst *inst, + const struct brw_wm_prog_data *prog_data) +{ + uint32_t mctl; + + if (inst->opcode == FS_OPCODE_REP_FB_WRITE) { + assert(inst->group == 0 && inst->exec_size == 16); + mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED; + } else if (prog_data->dual_src_blend) { + assert(inst->exec_size == 8); + + if (inst->group % 16 == 0) + mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01; + else if (inst->group % 16 == 8) + mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23; + else + unreachable("Invalid dual-source FB write instruction group"); + } else { + assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16)); + + if (inst->exec_size == 16) + mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; + else if (inst->exec_size == 8) + mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01; + else + unreachable("Invalid FB write execution size"); + } + + return mctl; +} + + /** + * Predicate the specified instruction on the sample mask. 
+ */ +void +brw_emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst) +{ + assert(bld.shader->stage == MESA_SHADER_FRAGMENT && + bld.group() == inst->group && + bld.dispatch_width() == inst->exec_size); + + const fs_visitor &s = *bld.shader; + const fs_reg sample_mask = brw_sample_mask_reg(bld); + const unsigned subreg = sample_mask_flag_subreg(s); + + if (brw_wm_prog_data(s.stage_prog_data)->uses_kill) { + assert(sample_mask.file == ARF && + sample_mask.nr == brw_flag_subreg(subreg).nr && + sample_mask.subnr == brw_flag_subreg( + subreg + inst->group / 16).subnr); + } else { + bld.group(1, 0).exec_all() + .MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask); + } + + if (inst->predicate) { + assert(inst->predicate == BRW_PREDICATE_NORMAL); + assert(!inst->predicate_inverse); + assert(inst->flag_subreg == 0); + assert(s.devinfo->ver < 20); + /* Combine the sample mask with the existing predicate by using a + * vertical predication mode. + */ + inst->predicate = BRW_PREDICATE_ALIGN1_ALLV; + } else { + inst->flag_subreg = subreg; + inst->predicate = BRW_PREDICATE_NORMAL; + inst->predicate_inverse = false; + } +} + +static bool +is_mixed_float_with_fp32_dst(const fs_inst *inst) +{ + /* This opcode sometimes uses :W type on the source even if the operand is + * a :HF, because in gfx7 there is no support for :HF, and thus it uses :W. + */ + if (inst->opcode == BRW_OPCODE_F16TO32) + return true; + + if (inst->dst.type != BRW_REGISTER_TYPE_F) + return false; + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].type == BRW_REGISTER_TYPE_HF) + return true; + } + + return false; +} + +static bool +is_mixed_float_with_packed_fp16_dst(const fs_inst *inst) +{ + /* This opcode sometimes uses :W type on the destination even if the + * destination is a :HF, because in gfx7 there is no support for :HF, and + * thus it uses :W. 
+ */ + if (inst->opcode == BRW_OPCODE_F32TO16 && + inst->dst.stride == 1) + return true; + + if (inst->dst.type != BRW_REGISTER_TYPE_HF || + inst->dst.stride != 1) + return false; + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].type == BRW_REGISTER_TYPE_F) + return true; + } + + return false; +} + +/** + * Get the closest allowed SIMD width for instruction \p inst accounting for + * some common regioning and execution control restrictions that apply to FPU + * instructions. These restrictions don't necessarily have any relevance to + * instructions not executed by the FPU pipeline like extended math, control + * flow or send message instructions. + * + * For virtual opcodes it's really up to the instruction -- In some cases + * (e.g. where a virtual instruction unrolls into a simple sequence of FPU + * instructions) it may simplify virtual instruction lowering if we can + * enforce FPU-like regioning restrictions already on the virtual instruction, + * in other cases (e.g. virtual send-like instructions) this may be + * excessively restrictive. + */ +static unsigned +get_fpu_lowered_simd_width(const fs_visitor *shader, + const fs_inst *inst) +{ + const struct brw_compiler *compiler = shader->compiler; + const struct intel_device_info *devinfo = compiler->devinfo; + + /* Maximum execution size representable in the instruction controls. */ + unsigned max_width = MIN2(32, inst->exec_size); + + /* Number of channels per polygon handled by a multipolygon PS shader. */ + const unsigned poly_width = shader->dispatch_width / + MAX2(1, shader->max_polygons); + + /* Number of registers that will be read by an ATTR source if + * present for multipolygon PS shaders, since the PS vertex setup + * data for each polygon is stored in different contiguous GRFs. + */ + const unsigned attr_reg_count = (shader->stage != MESA_SHADER_FRAGMENT || + shader->max_polygons < 2 ? 
0 : + DIV_ROUND_UP(inst->exec_size, + poly_width) * reg_unit(devinfo)); + + /* According to the PRMs: + * "A. In Direct Addressing mode, a source cannot span more than 2 + * adjacent GRF registers. + * B. A destination cannot span more than 2 adjacent GRF registers." + * + * Look for the source or destination with the largest register region + * which is the one that is going to limit the overall execution size of + * the instruction due to this rule. + */ + unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE); + + for (unsigned i = 0; i < inst->sources; i++) + reg_count = MAX3(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE), + (inst->src[i].file == ATTR ? attr_reg_count : 0)); + + /* Calculate the maximum execution size of the instruction based on the + * factor by which it goes over the hardware limit of 2 GRFs. + */ + const unsigned max_reg_count = 2 * reg_unit(devinfo); + if (reg_count > max_reg_count) + max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, max_reg_count)); + + /* According to the IVB PRMs: + * "When destination spans two registers, the source MUST span two + * registers. The exception to the above rule: + * + * - When source is scalar, the source registers are not incremented. + * - When source is packed integer Word and destination is packed + * integer DWord, the source register is not incremented but the + * source sub register is incremented." + * + * The hardware specs from Gfx4 to Gfx7.5 mention similar regioning + * restrictions. The code below intentionally doesn't check whether the + * destination type is integer because empirically the hardware doesn't + * seem to care what the actual type is as long as it's dword-aligned. + * + * HSW PRMs also add a note to the second exception: + * "When lower 8 channels are disabled, the sub register of source1 + * operand is not incremented. 
If the lower 8 channels are expected + * to be disabled, say by predication, the instruction must be split + * into pair of simd8 operations." + * + * We can't reliably know if the channels won't be disabled due to, + * for example, IMASK. So, play it safe and disallow packed-word exception + * for src1. + */ + if (devinfo->ver < 8) { + for (unsigned i = 0; i < inst->sources; i++) { + /* IVB implements DF scalars as <0;2,1> regions. */ + const bool is_scalar_exception = is_uniform(inst->src[i]) && + (devinfo->platform == INTEL_PLATFORM_HSW || type_sz(inst->src[i].type) != 8); + const bool is_packed_word_exception = i != 1 && + type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 && + type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1; + + /* We check size_read(i) against size_written instead of REG_SIZE + * because we want to properly handle SIMD32. In SIMD32, you can end + * up with writes to 4 registers and a source that reads 2 registers + * and we may still need to lower all the way to SIMD8 in that case. + */ + if (inst->size_written > REG_SIZE && + inst->size_read(i) != 0 && + inst->size_read(i) < inst->size_written && + !is_scalar_exception && !is_packed_word_exception) { + const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE); + max_width = MIN2(max_width, inst->exec_size / reg_count); + } + } + } + + if (devinfo->ver < 6) { + /* From the G45 PRM, Volume 4 Page 361: + * + * "Operand Alignment Rule: With the exceptions listed below, a + * source/destination operand in general should be aligned to even + * 256-bit physical register with a region size equal to two 256-bit + * physical registers." + * + * Normally we enforce this by allocating virtual registers to the + * even-aligned class. But we need to handle payload registers. 
+ */ + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) && + inst->size_read(i) > REG_SIZE) { + max_width = MIN2(max_width, 8); + } + } + } + + /* From the IVB PRMs: + * "When an instruction is SIMD32, the low 16 bits of the execution mask + * are applied for both halves of the SIMD32 instruction. If different + * execution mask channels are required, split the instruction into two + * SIMD16 instructions." + * + * There is similar text in the HSW PRMs. Gfx4-6 don't even implement + * 32-wide control flow support in hardware and will behave similarly. + */ + if (devinfo->ver < 8 && !inst->force_writemask_all) + max_width = MIN2(max_width, 16); + + /* From the IVB PRMs (applies to HSW too): + * "Instructions with condition modifiers must not use SIMD32." + * + * From the BDW PRMs (applies to later hardware too): + * "Ternary instruction with condition modifiers must not use SIMD32." + */ + if (inst->conditional_mod && (devinfo->ver < 8 || + (inst->is_3src(compiler) && devinfo->ver < 12))) + max_width = MIN2(max_width, 16); + + /* From the IVB PRMs (applies to other devices that don't have the + * intel_device_info::supports_simd16_3src flag set): + * "In Align16 access mode, SIMD16 is not allowed for DW operations and + * SIMD8 is not allowed for DF operations." 
+ */ + if (inst->is_3src(compiler) && !devinfo->supports_simd16_3src) + max_width = MIN2(max_width, inst->exec_size / reg_count); + + /* Pre-Gfx8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is + * the 8-bit quarter of the execution mask signals specified in the + * instruction control fields) for the second compressed half of any + * single-precision instruction (for double-precision instructions + * it's hardwired to use NibCtrl+1, at least on HSW), which means that + * the EU will apply the wrong execution controls for the second + * sequential GRF write if the number of channels per GRF is not exactly + * eight in single-precision mode (or four in double-float mode). + * + * In this situation we calculate the maximum size of the split + * instructions so they only ever write to a single register. + */ + if (devinfo->ver < 8 && inst->size_written > REG_SIZE && + !inst->force_writemask_all) { + const unsigned channels_per_grf = inst->exec_size / + DIV_ROUND_UP(inst->size_written, REG_SIZE); + const unsigned exec_type_size = get_exec_type_size(inst); + assert(exec_type_size); + + /* The hardware shifts exactly 8 channels per compressed half of the + * instruction in single-precision mode and exactly 4 in double-precision. + */ + if (channels_per_grf != (exec_type_size == 8 ? 4 : 8)) + max_width = MIN2(max_width, channels_per_grf); + + /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT + * because HW applies the same channel enable signals to both halves of + * the compressed instruction which will be just wrong under + * non-uniform control flow. + */ + if (devinfo->verx10 == 70 && + (exec_type_size == 8 || type_sz(inst->dst.type) == 8)) + max_width = MIN2(max_width, 4); + } + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "No SIMD16 in mixed mode when destination is f32. Instruction + * execution size must be no more than 8." 
+ * + * FIXME: the simulator doesn't seem to complain if we don't do this and + * empirical testing with existing CTS tests show that they pass just fine + * without implementing this, however, since our interpretation of the PRM + * is that conversion MOVs between HF and F are still mixed-float + * instructions (and therefore subject to this restriction) we decided to + * split them to be safe. Might be useful to do additional investigation to + * lift the restriction if we can ensure that it is safe though, since these + * conversions are common when half-float types are involved since many + * instructions do not support HF types and conversions from/to F are + * required. + */ + if (is_mixed_float_with_fp32_dst(inst) && devinfo->ver < 20) + max_width = MIN2(max_width, 8); + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "No SIMD16 in mixed mode when destination is packed f16 for both + * Align1 and Align16." + */ + if (is_mixed_float_with_packed_fp16_dst(inst) && devinfo->ver < 20) + max_width = MIN2(max_width, 8); + + /* Only power-of-two execution sizes are representable in the instruction + * control fields. + */ + return 1 << util_logbase2(max_width); +} + +/** + * Get the maximum allowed SIMD width for instruction \p inst accounting for + * various payload size restrictions that apply to sampler message + * instructions. + * + * This is only intended to provide a maximum theoretical bound for the + * execution size of the message based on the number of argument components + * alone, which in most cases will determine whether the SIMD8 or SIMD16 + * variant of the message can be used, though some messages may have + * additional restrictions not accounted for here (e.g. pre-ILK hardware uses + * the message length to determine the exact SIMD width and argument count, + * which makes a number of sampler message combinations impossible to + * represent). 
+ * + * Note: Platforms with monolithic SIMD16 double the possible SIMD widths + * change from (SIMD8, SIMD16) to (SIMD16, SIMD32). + */ +static unsigned +get_sampler_lowered_simd_width(const struct intel_device_info *devinfo, + const fs_inst *inst) +{ + /* If we have a min_lod parameter on anything other than a simple sample + * message, it will push it over 5 arguments and we have to fall back to + * SIMD8. + */ + if (inst->opcode != SHADER_OPCODE_TEX && + inst->components_read(TEX_LOGICAL_SRC_MIN_LOD)) + return devinfo->ver < 20 ? 8 : 16; + + /* Calculate the number of coordinate components that have to be present + * assuming that additional arguments follow the texel coordinates in the + * message payload. On IVB+ there is no need for padding, on ILK-SNB we + * need to pad to four or three components depending on the message, + * pre-ILK we need to pad to at most three components. + */ + const unsigned req_coord_components = + (devinfo->ver >= 7 || + !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 : + (devinfo->ver >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL && + inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 : + 3; + + /* On Gfx9+ the LOD argument is for free if we're able to use the LZ + * variant of the TXL or TXF message. + */ + const bool implicit_lod = devinfo->ver >= 9 && + (inst->opcode == SHADER_OPCODE_TXL || + inst->opcode == SHADER_OPCODE_TXF) && + inst->src[TEX_LOGICAL_SRC_LOD].is_zero(); + + /* Calculate the total number of argument components that need to be passed + * to the sampler unit. + */ + const unsigned num_payload_components = + MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE), + req_coord_components) + + inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) + + (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) + + inst->components_read(TEX_LOGICAL_SRC_LOD2) + + inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) + + (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ? 
+ inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) + + inst->components_read(TEX_LOGICAL_SRC_MCS); + + const unsigned simd_limit = reg_unit(devinfo) * + (num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16); + + /* SIMD16 (SIMD32 on Xe2) messages with more than five arguments exceed the + * maximum message size supported by the sampler, regardless of whether a + * header is provided or not. + */ + return MIN2(inst->exec_size, simd_limit); +} + +/** + * Get the closest native SIMD width supported by the hardware for instruction + * \p inst. The instruction will be left untouched by + * fs_visitor::lower_simd_width() if the returned value is equal to the + * original execution size. + */ +static unsigned +get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst) +{ + const struct brw_compiler *compiler = shader->compiler; + const struct intel_device_info *devinfo = compiler->devinfo; + + switch (inst->opcode) { + case BRW_OPCODE_DP4A: + case BRW_OPCODE_MOV: + case BRW_OPCODE_SEL: + case BRW_OPCODE_NOT: + case BRW_OPCODE_AND: + case BRW_OPCODE_OR: + case BRW_OPCODE_XOR: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_ASR: + case BRW_OPCODE_ROR: + case BRW_OPCODE_ROL: + case BRW_OPCODE_CMPN: + case BRW_OPCODE_CSEL: + case BRW_OPCODE_F32TO16: + case BRW_OPCODE_F16TO32: + case BRW_OPCODE_BFREV: + case BRW_OPCODE_BFE: + case BRW_OPCODE_ADD: + case BRW_OPCODE_MUL: + case BRW_OPCODE_AVG: + case BRW_OPCODE_FRC: + case BRW_OPCODE_RNDU: + case BRW_OPCODE_RNDD: + case BRW_OPCODE_RNDE: + case BRW_OPCODE_RNDZ: + case BRW_OPCODE_LZD: + case BRW_OPCODE_FBH: + case BRW_OPCODE_FBL: + case BRW_OPCODE_CBIT: + case BRW_OPCODE_SAD2: + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + case BRW_OPCODE_ADD3: + case FS_OPCODE_PACK: + case SHADER_OPCODE_SEL_EXEC: + case SHADER_OPCODE_CLUSTER_BROADCAST: + case SHADER_OPCODE_MOV_RELOC_IMM: + return get_fpu_lowered_simd_width(shader, inst); + + case BRW_OPCODE_CMP: { + /* The Ivybridge/BayTrail 
WaCMPInstFlagDepClearedEarly workaround says that + * when the destination is a GRF the dependency-clear bit on the flag + * register is cleared early. + * + * Suggested workarounds are to disable coissuing CMP instructions + * or to split CMP(16) instructions into two CMP(8) instructions. + * + * We choose to split into CMP(8) instructions since disabling + * coissuing would affect CMP instructions not otherwise affected by + * the errata. + */ + const unsigned max_width = (devinfo->verx10 == 70 && + !inst->dst.is_null() ? 8 : ~0); + return MIN2(max_width, get_fpu_lowered_simd_width(shader, inst)); + } + case BRW_OPCODE_BFI1: + case BRW_OPCODE_BFI2: + /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we + * should + * "Force BFI instructions to be executed always in SIMD8." + */ + return MIN2(devinfo->platform == INTEL_PLATFORM_HSW ? 8 : ~0u, + get_fpu_lowered_simd_width(shader, inst)); + + case BRW_OPCODE_IF: + assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16); + return inst->exec_size; + + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: { + /* Unary extended math instructions are limited to SIMD8 on Gfx4 and + * Gfx6. Extended Math Function is limited to SIMD8 with half-float. + */ + if (devinfo->ver == 6 || devinfo->verx10 == 40) + return MIN2(8, inst->exec_size); + if (inst->dst.type == BRW_REGISTER_TYPE_HF) + return MIN2(8, inst->exec_size); + return MIN2(16, inst->exec_size); + } + + case SHADER_OPCODE_POW: { + /* SIMD16 is only allowed on Gfx7+. 
Extended Math Function is limited + * to SIMD8 with half-float + */ + if (devinfo->ver < 7) + return MIN2(8, inst->exec_size); + if (inst->dst.type == BRW_REGISTER_TYPE_HF) + return MIN2(8, inst->exec_size); + return MIN2(16, inst->exec_size); + } + + case SHADER_OPCODE_USUB_SAT: + case SHADER_OPCODE_ISUB_SAT: + return get_fpu_lowered_simd_width(shader, inst); + + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + /* Integer division is limited to SIMD8 on all generations. */ + return MIN2(8, inst->exec_size); + + case FS_OPCODE_LINTERP: + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + return MIN2(16, inst->exec_size); + + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: + /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch + * message used to implement varying pull constant loads, so expand it + * to SIMD16. An alternative with longer message payload length but + * shorter return payload would be to use the SIMD8 sampler message that + * takes (header, u, v, r) as parameters instead of (header, u). + */ + return (devinfo->ver == 4 ? 16 : MIN2(16, inst->exec_size)); + + case FS_OPCODE_DDX_COARSE: + case FS_OPCODE_DDX_FINE: + case FS_OPCODE_DDY_COARSE: + case FS_OPCODE_DDY_FINE: + /* The implementation of this virtual opcode may require emitting + * compressed Align16 instructions, which are severely limited on some + * generations. + * + * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register + * Region Restrictions): + * + * "In Align16 access mode, SIMD16 is not allowed for DW operations + * and SIMD8 is not allowed for DF operations." + * + * In this context, "DW operations" means "operations acting on 32-bit + * values", so it includes operations on floats. + * + * Gfx4 has a similar restriction. 
From the i965 PRM, section 11.5.3 + * (Instruction Compression -> Rules and Restrictions): + * + * "A compressed instruction must be in Align1 access mode. Align16 + * mode instructions cannot be compressed." + * + * Similar text exists in the g45 PRM. + * + * Empirically, compressed align16 instructions using odd register + * numbers don't appear to work on Sandybridge either. + */ + return (devinfo->ver == 4 || devinfo->ver == 6 || + (devinfo->verx10 == 70) ? + MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size)); + + case SHADER_OPCODE_MULH: + /* MULH is lowered to the MUL/MACH sequence using the accumulator, which + * is 8-wide on Gfx7+. + */ + return (devinfo->ver >= 20 ? 16 : + devinfo->ver >= 7 ? 8 : + get_fpu_lowered_simd_width(shader, inst)); + + case FS_OPCODE_FB_WRITE_LOGICAL: + /* Gfx6 doesn't support SIMD16 depth writes but we cannot handle them + * here. + */ + assert(devinfo->ver != 6 || + inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE || + inst->exec_size == 8); + /* Dual-source FB writes are unsupported in SIMD16 mode. */ + return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ? + 8 : MIN2(16, inst->exec_size)); + + case FS_OPCODE_FB_READ_LOGICAL: + return MIN2(16, inst->exec_size); + + case SHADER_OPCODE_TEX_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXF_UMS_LOGICAL: + case SHADER_OPCODE_TXF_MCS_LOGICAL: + case SHADER_OPCODE_LOD_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + return get_sampler_lowered_simd_width(devinfo, inst); + + /* On gfx12 parameters are fixed to 16-bit values and therefore they all + * always fit regardless of the execution size. + */ + case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: + return MIN2(16, inst->exec_size); + + case SHADER_OPCODE_TXD_LOGICAL: + /* TXD is unsupported in SIMD16 mode previous to Xe2. SIMD32 is still + * unsuppported on Xe2. 
+ */ + return devinfo->ver < 20 ? 8 : 16; + + case SHADER_OPCODE_TXL_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + /* Only one execution size is representable pre-ILK depending on whether + * the shadow reference argument is present. + */ + if (devinfo->ver == 4) + return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8; + else + return get_sampler_lowered_simd_width(devinfo, inst); + + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + /* Gfx4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD + * messages. Use SIMD16 instead. + */ + if (devinfo->ver == 4) + return 16; + else + return get_sampler_lowered_simd_width(devinfo, inst); + + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + return 8; + + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: + return MIN2(16, inst->exec_size); + + case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: + case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: + case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: + return devinfo->ver <= 8 ? 8 : MIN2(16, inst->exec_size); + + case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: + case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: + case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: + assert(inst->exec_size <= 16); + return inst->exec_size; + + case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: + return devinfo->has_lsc ? MIN2(16, inst->exec_size) : 8; + + case SHADER_OPCODE_URB_READ_LOGICAL: + case SHADER_OPCODE_URB_WRITE_LOGICAL: + return MIN2(devinfo->ver < 20 ? 
8 : 16, inst->exec_size); + + case SHADER_OPCODE_QUAD_SWIZZLE: { + const unsigned swiz = inst->src[1].ud; + return (is_uniform(inst->src[0]) ? + get_fpu_lowered_simd_width(shader, inst) : + devinfo->ver < 11 && type_sz(inst->src[0].type) == 4 ? 8 : + swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 : + get_fpu_lowered_simd_width(shader, inst)); + } + case SHADER_OPCODE_MOV_INDIRECT: { + /* From IVB and HSW PRMs: + * + * "2.When the destination requires two registers and the sources are + * indirect, the sources must use 1x1 regioning mode. + * + * In case of DF instructions in HSW/IVB, the exec_size is limited by + * the EU decompression logic not handling VxH indirect addressing + * correctly. + */ + const unsigned max_size = (devinfo->ver >= 8 ? 2 : 1) * REG_SIZE; + /* Prior to Broadwell, we only have 8 address subregisters. */ + return MIN3(devinfo->ver >= 8 ? 16 : 8, + max_size / (inst->dst.stride * type_sz(inst->dst.type)), + inst->exec_size); + } + + case SHADER_OPCODE_LOAD_PAYLOAD: { + const unsigned reg_count = + DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE); + + if (reg_count > 2) { + /* Only LOAD_PAYLOAD instructions with per-channel destination region + * can be easily lowered (which excludes headers and heterogeneous + * types). + */ + assert(!inst->header_size); + for (unsigned i = 0; i < inst->sources; i++) + assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) || + inst->src[i].file == BAD_FILE); + + return inst->exec_size / DIV_ROUND_UP(reg_count, 2); + } else { + return inst->exec_size; + } + } + default: + return inst->exec_size; + } +} + +/** + * Return true if splitting out the group of channels of instruction \p inst + * given by lbld.group() requires allocating a temporary for the i-th source + * of the lowered instruction. 
+ */ +static inline bool +needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i) +{ + return !(is_periodic(inst->src[i], lbld.dispatch_width()) || + (inst->components_read(i) == 1 && + lbld.dispatch_width() <= inst->exec_size)) || + (inst->flags_written(lbld.shader->devinfo) & + flag_mask(inst->src[i], type_sz(inst->src[i].type))); +} + +/** + * Extract the data that would be consumed by the channel group given by + * lbld.group() from the i-th source region of instruction \p inst and return + * it as result in packed form. + */ +static fs_reg +emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i) +{ + assert(lbld.group() >= inst->group); + + /* Specified channel group from the source region. */ + const fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group); + + if (needs_src_copy(lbld, inst, i)) { + /* Builder of the right width to perform the copy avoiding uninitialized + * data if the lowered execution size is greater than the original + * execution size of the instruction. + */ + const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(), + inst->exec_size), 0); + const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i)); + + for (unsigned k = 0; k < inst->components_read(i); ++k) + cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k)); + + return tmp; + + } else if (is_periodic(inst->src[i], lbld.dispatch_width())) { + /* The source is invariant for all dispatch_width-wide groups of the + * original region. + */ + return inst->src[i]; + + } else { + /* We can just point the lowered instruction at the right channel group + * from the original region. + */ + return src; + } +} + +/** + * Return true if splitting out the group of channels of instruction \p inst + * given by lbld.group() requires allocating a temporary for the destination + * of the lowered instruction and copying the data back to the original + * destination region. 
+ */ +static inline bool +needs_dst_copy(const fs_builder &lbld, const fs_inst *inst) +{ + if (inst->dst.is_null()) + return false; + + /* If the instruction writes more than one component we'll have to shuffle + * the results of multiple lowered instructions in order to make sure that + * they end up arranged correctly in the original destination region. + */ + if (inst->size_written > inst->dst.component_size(inst->exec_size)) + return true; + + /* If the lowered execution size is larger than the original the result of + * the instruction won't fit in the original destination, so we'll have to + * allocate a temporary in any case. + */ + if (lbld.dispatch_width() > inst->exec_size) + return true; + + for (unsigned i = 0; i < inst->sources; i++) { + /* If we already made a copy of the source for other reasons there won't + * be any overlap with the destination. + */ + if (needs_src_copy(lbld, inst, i)) + continue; + + /* In order to keep the logic simple we emit a copy whenever the + * destination region doesn't exactly match an overlapping source, which + * may point at the source and destination not being aligned group by + * group which could cause one of the lowered instructions to overwrite + * the data read from the same source by other lowered instructions. + */ + if (regions_overlap(inst->dst, inst->size_written, + inst->src[i], inst->size_read(i)) && + !inst->dst.equals(inst->src[i])) + return true; + } + + return false; +} + +/** + * Insert data from a packed temporary into the channel group given by + * lbld.group() of the destination region of instruction \p inst and return + * the temporary as result. Any copy instructions that are required for + * unzipping the previous value (in the case of partial writes) will be + * inserted using \p lbld_before and any copy instructions required for + * zipping up the destination of \p inst will be inserted using \p lbld_after. 
 */
static fs_reg
emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
         fs_inst *inst)
{
   assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
   assert(lbld_before.group() == lbld_after.group());
   assert(lbld_after.group() >= inst->group);

   const struct intel_device_info *devinfo = lbld_before.shader->devinfo;

   /* Specified channel group from the destination region. */
   const fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);

   if (!needs_dst_copy(lbld_after, inst)) {
      /* No need to allocate a temporary for the lowered instruction, just
       * take the right group of channels from the original region.
       */
      return dst;
   }

   /* Deal with the residency data part later */
   const unsigned residency_size = inst->has_sampler_residency() ?
      (reg_unit(devinfo) * REG_SIZE) : 0;
   /* Number of whole destination components, excluding any trailing
    * residency data appended by the sampler message.
    */
   const unsigned dst_size = (inst->size_written - residency_size) /
      inst->dst.component_size(inst->exec_size);

   const fs_reg tmp = lbld_after.vgrf(inst->dst.type,
                                      dst_size + inst->has_sampler_residency());

   if (inst->predicate) {
      /* Handle predication by copying the original contents of the
       * destination into the temporary before emitting the lowered
       * instruction.
       */
      const fs_builder gbld_before =
         lbld_before.group(MIN2(lbld_before.dispatch_width(),
                                inst->exec_size), 0);
      for (unsigned k = 0; k < dst_size; ++k) {
         gbld_before.MOV(offset(tmp, lbld_before, k),
                         offset(dst, inst->exec_size, k));
      }
   }

   const fs_builder gbld_after =
      lbld_after.group(MIN2(lbld_after.dispatch_width(),
                            inst->exec_size), 0);
   for (unsigned k = 0; k < dst_size; ++k) {
      /* Use a builder of the right width to perform the copy avoiding
       * uninitialized data if the lowered execution size is greater than the
       * original execution size of the instruction.
       */
      gbld_after.MOV(offset(dst, inst->exec_size, k),
                     offset(tmp, lbld_after, k));
   }

   if (inst->has_sampler_residency()) {
      /* Sampler messages with residency need a special attention. In the
       * first lane of the last component are located the Pixel Null Mask
       * (bits 0:15) & some upper bits we need to discard (bits 16:31). We
       * have to build a single 32bit value for the SIMD32 message out of 2
       * SIMD16 16 bit values.
       */
      const fs_builder rbld = gbld_after.exec_all().group(1, 0);
      fs_reg local_res_reg = component(
         retype(offset(tmp, lbld_before, dst_size),
                BRW_REGISTER_TYPE_UW), 0);
      fs_reg final_res_reg =
         retype(byte_offset(inst->dst,
                            inst->size_written - residency_size +
                            gbld_after.group() / 8),
                BRW_REGISTER_TYPE_UW);
      rbld.MOV(final_res_reg, local_res_reg);
   }

   return tmp;
}

/**
 * Split any instruction whose execution size is wider than
 * get_lowered_simd_width() reports the hardware can handle into several
 * narrower copies, using emit_unzip()/emit_zip() to shuffle the source and
 * destination regions of each group into place.
 */
bool
fs_visitor::lower_simd_width()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      const unsigned lower_width = get_lowered_simd_width(this, inst);

      if (lower_width != inst->exec_size) {
         /* Builder matching the original instruction.  We may also need to
          * emit an instruction of width larger than the original, set the
          * execution size of the builder to the highest of both for now so
          * we're sure that both cases can be handled.
          */
         const unsigned max_width = MAX2(inst->exec_size, lower_width);

         const fs_builder bld = fs_builder(this).at_end();
         const fs_builder ibld = bld.at(block, inst)
                                    .exec_all(inst->force_writemask_all)
                                    .group(max_width, inst->group / max_width);

         /* Split the copies in chunks of the execution width of either the
          * original or the lowered instruction, whichever is lower.
          */
         const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
         const unsigned residency_size = inst->has_sampler_residency() ?
            (reg_unit(devinfo) * REG_SIZE) : 0;
         const unsigned dst_size =
            (inst->size_written - residency_size) /
            inst->dst.component_size(inst->exec_size);

         assert(!inst->writes_accumulator && !inst->mlen);

         /* Inserting the zip, unzip, and duplicated instructions in all of
          * the right spots is somewhat tricky.  All of the unzip and any
          * instructions from the zip which unzip the destination prior to
          * writing need to happen before all of the per-group instructions
          * and the zip instructions need to happen after.  In order to sort
          * this all out, we insert the unzip instructions before \p inst,
          * insert the per-group instructions after \p inst (i.e. before
          * inst->next), and insert the zip instructions before the
          * instruction after \p inst.  Since we are inserting instructions
          * after \p inst, inst->next is a moving target and we need to save
          * it off here so that we insert the zip instructions in the right
          * place.
          *
          * Since we're inserting split instructions after after_inst, the
          * instructions will end up in the reverse order that we insert them.
          * However, certain render target writes require that the low group
          * instructions come before the high group.  From the Ivy Bridge PRM
          * Vol. 4, Pt. 1, Section 3.9.11:
          *
          *    "If multiple SIMD8 Dual Source messages are delivered by the
          *    pixel shader thread, each SIMD8_DUALSRC_LO message must be
          *    issued before the SIMD8_DUALSRC_HI message with the same Slot
          *    Group Select setting."
          *
          * And, from Section 3.9.11.1 of the same PRM:
          *
          *    "When SIMD32 or SIMD16 PS threads send render target writes
          *    with multiple SIMD8 and SIMD16 messages, the following must
          *    hold:
          *
          *    All the slots (as described above) must have a corresponding
          *    render target write irrespective of the slot's validity.  A slot
          *    is considered valid when at least one sample is enabled.  For
          *    example, a SIMD16 PS thread must send two SIMD8 render target
          *    writes to cover all the slots.
          *
          *    PS thread must send SIMD render target write messages with
          *    increasing slot numbers.  For example, SIMD16 thread has
          *    Slot[15:0] and if two SIMD8 render target writes are used, the
          *    first SIMD8 render target write must send Slot[7:0] and the
          *    next one must send Slot[15:8]."
          *
          * In order to make low group instructions come before high group
          * instructions (this is required for some render target writes), we
          * split from the highest group to lowest.
          */
         exec_node *const after_inst = inst->next;
         for (int i = n - 1; i >= 0; i--) {
            /* Emit a copy of the original instruction with the lowered width.
             * If the EOT flag was set throw it away except for the last
             * instruction to avoid killing the thread prematurely.
             */
            fs_inst split_inst = *inst;
            split_inst.exec_size = lower_width;
            split_inst.eot = inst->eot && i == int(n - 1);

            /* Select the correct channel enables for the i-th group, then
             * transform the sources and destination and emit the lowered
             * instruction.
             */
            const fs_builder lbld = ibld.group(lower_width, i);

            for (unsigned j = 0; j < inst->sources; j++)
               split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);

            split_inst.dst = emit_zip(lbld.at(block, inst),
                                      lbld.at(block, after_inst), inst);
            split_inst.size_written =
               split_inst.dst.component_size(lower_width) * dst_size +
               residency_size;

            lbld.at(block, inst->next).emit(split_inst);
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Transform barycentric vectors into the interleaved form expected by the PLN
 * instruction and returned by the Gfx7+ PI shared function.
 *
 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
 * follows in the register file:
 *
 *    rN+0: X[0-7]
 *    rN+1: Y[0-7]
 *    rN+2: X[8-15]
 *    rN+3: Y[8-15]
 *
 * There is no need to handle SIMD32 here -- This is expected to be run after
 * SIMD lowering, since SIMD lowering relies on vectors having the standard
 * component layout.
 */
bool
fs_visitor::lower_barycentrics()
{
   const bool has_interleaved_layout = devinfo->has_pln ||
      (devinfo->ver >= 7 && devinfo->ver < 20);
   bool progress = false;

   /* Only fragment shaders on platforms with the interleaved layout need
    * this transformation.
    */
   if (stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
      return false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      if (inst->exec_size < 16)
         continue;

      const fs_builder ibld(this, block, inst);
      const fs_builder ubld = ibld.exec_all().group(8, 0);

      switch (inst->opcode) {
      case FS_OPCODE_LINTERP : {
         /* Interleave the barycentric source of LINTERP into a temporary
          * payload before the instruction consumes it.
          */
         assert(inst->exec_size == 16);
         const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
         fs_reg srcs[4];

         for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
            srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
                                   8 * (i / 2));

         ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));

         inst->src[0] = tmp;
         progress = true;
         break;
      }
      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
         /* The PI shared function returns the interleaved layout; de-interleave
          * its result into the original destination with per-group MOVs that
          * inherit the instruction's predication.
          */
         assert(inst->exec_size == 16);
         const fs_reg tmp = ibld.vgrf(inst->dst.type, 2);

         for (unsigned i = 0; i < 2; i++) {
            for (unsigned g = 0; g < inst->exec_size / 8; g++) {
               fs_inst *mov = ibld.at(block, inst->next).group(8, g)
                                  .MOV(horiz_offset(offset(inst->dst, ibld, i),
                                                    8 * g),
                                       offset(tmp, ubld, 2 * g + i));
               mov->predicate = inst->predicate;
               mov->predicate_inverse = inst->predicate_inverse;
               mov->flag_subreg = inst->flag_subreg;
            }
         }

         inst->dst = tmp;
         progress = true;
         break;
      }
      default:
         break;
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
                          DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Lower a derivative instruction as the floating-point difference of two
 * swizzles of the source, specified as \p swz0 and \p swz1.
 */
static bool
lower_derivative(fs_visitor *v, bblock_t *block, fs_inst *inst,
                 unsigned swz0, unsigned swz1)
{
   const fs_builder ubld = fs_builder(v, block, inst).exec_all();
   const fs_reg tmp0 = ubld.vgrf(inst->src[0].type);
   const fs_reg tmp1 = ubld.vgrf(inst->src[0].type);

   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));

   /* Rewrite the original instruction in place as tmp1 - tmp0. */
   inst->resize_sources(2);
   inst->src[0] = negate(tmp0);
   inst->src[1] = tmp1;
   inst->opcode = BRW_OPCODE_ADD;

   return true;
}

/**
 * Lower derivative instructions on platforms where codegen cannot implement
 * them efficiently (i.e. XeHP).
 */
bool
fs_visitor::lower_derivatives()
{
   bool progress = false;

   if (devinfo->verx10 < 125)
      return false;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->opcode == FS_OPCODE_DDX_COARSE)
         progress |= lower_derivative(this, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);

      else if (inst->opcode == FS_OPCODE_DDX_FINE)
         progress |= lower_derivative(this, block, inst,
                                      BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);

      else if (inst->opcode == FS_OPCODE_DDY_COARSE)
         progress |= lower_derivative(this, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);

      else if (inst->opcode == FS_OPCODE_DDY_FINE)
         progress |= lower_derivative(this, block, inst,
                                      BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Lower FIND_LIVE_CHANNEL/FIND_LAST_LIVE_CHANNEL pseudo-instructions into
 * arithmetic on the hardware execution mask (Gfx8+ only).
 */
bool
fs_visitor::lower_find_live_channel()
{
   bool progress = false;

   if (devinfo->ver < 8)
      return false;

   bool packed_dispatch =
      brw_stage_has_packed_dispatch(devinfo, stage, max_polygons,
                                    stage_prog_data);
   bool vmask =
      stage == MESA_SHADER_FRAGMENT &&
      brw_wm_prog_data(stage_prog_data)->uses_vmask;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL)
         continue;

      bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL;

      /* Getting the first active channel index is easy on Gfx8: Just find
       * the first bit set in the execution mask.  The register exists on
       * HSW already but it reads back as all ones when the current
       * instruction has execution masking disabled, so it's kind of
       * useless there.
       */
      fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));

      const fs_builder ibld(this, block, inst);
      if (!inst->is_partial_write())
         ibld.emit_undef_for_dst(inst);

      const fs_builder ubld = fs_builder(this, block, inst).exec_all().group(1, 0);

      /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
       * so combine the execution and dispatch masks to obtain the true mask.
       *
       * If we're looking for the first live channel, and we have packed
       * dispatch, we can skip this step, as we know all dispatched channels
       * will appear at the front of the mask.
       */
      if (!(first && packed_dispatch)) {
         fs_reg mask = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.UNDEF(mask);
         ubld.emit(SHADER_OPCODE_READ_SR_REG, mask, brw_imm_ud(vmask ? 3 : 2));

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first/last active channel relative to the
          * specified quarter control as result.
          */
         if (inst->group > 0)
            ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));

         ubld.AND(mask, exec_mask, mask);
         exec_mask = mask;
      }

      if (first) {
         ubld.FBL(inst->dst, exec_mask);
      } else {
         /* Last live channel: 31 - lzd(mask) gives the index of the highest
          * set bit.
          */
         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 1);
         ubld.UNDEF(tmp);
         ubld.LZD(tmp, exec_mask);
         ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31));
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Dump every instruction to \p file, annotated with per-IP register pressure
 * and control-flow indentation when a CFG is available.
 */
void
fs_visitor::dump_instructions_to_file(FILE *file) const
{
   if (cfg) {
      const register_pressure &rp = regpressure_analysis.require();
      unsigned ip = 0, max_pressure = 0;
      unsigned cf_count = 0;
      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
         if (inst->is_control_flow_end())
            cf_count -= 1;

         max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
         fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip);
         for (unsigned i = 0; i < cf_count; i++)
            fprintf(file, "  ");
         dump_instruction(inst, file);
         ip++;

         if (inst->is_control_flow_begin())
            cf_count += 1;
      }
      fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
   } else {
      int ip = 0;
      foreach_in_list(backend_instruction, inst, &instructions) {
         fprintf(file, "%4d: ", ip++);
         dump_instruction(inst, file);
      }
   }
}

/**
 * Print a single instruction to \p file in human-readable form: predicate,
 * opcode and modifiers, destination, then each source with region/type.
 */
void
fs_visitor::dump_instruction_to_file(const backend_instruction *be_inst, FILE *file) const
{
   const fs_inst *inst = (const fs_inst *)be_inst;

   if (inst->predicate) {
      fprintf(file, "(%cf%d.%d) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg / 2,
              inst->flag_subreg % 2);
   }

   fprintf(file, "%s", brw_instruction_name(&compiler->isa, inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      if (!inst->predicate &&
          (devinfo->ver < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                                inst->opcode != BRW_OPCODE_CSEL &&
                                inst->opcode != BRW_OPCODE_IF &&
                                inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
                 inst->flag_subreg % 2);
      }
   }
   fprintf(file, "(%d) ", inst->exec_size);

   if (inst->mlen) {
      fprintf(file, "(mlen: %d) ", inst->mlen);
   }

   if (inst->ex_mlen) {
      fprintf(file, "(ex_mlen: %d) ", inst->ex_mlen);
   }

   if (inst->eot) {
      fprintf(file, "(EOT) ");
   }

   switch (inst->dst.file) {
   case VGRF:
      fprintf(file, "vgrf%d", inst->dst.nr);
      break;
   case FIXED_GRF:
      fprintf(file, "g%d", inst->dst.nr);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.nr);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   case UNIFORM:
      fprintf(file, "***u%d***", inst->dst.nr);
      break;
   case ATTR:
      fprintf(file, "***attr%d***", inst->dst.nr);
      break;
   case ARF:
      switch (inst->dst.nr) {
      case BRW_ARF_NULL:
         fprintf(file, "null");
         break;
      case BRW_ARF_ADDRESS:
         fprintf(file, "a0.%d", inst->dst.subnr);
         break;
      case BRW_ARF_ACCUMULATOR:
         fprintf(file, "acc%d", inst->dst.subnr);
         break;
      case BRW_ARF_FLAG:
         fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
         break;
      default:
         fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
         break;
      }
      break;
   case IMM:
      unreachable("not reached");
   }

   if (inst->dst.offset ||
       (inst->dst.file == VGRF &&
        alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
      const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
      fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
              inst->dst.offset % reg_size);
   }

   if (inst->dst.stride != 1)
      fprintf(file, "<%u>", inst->dst.stride);
   fprintf(file, ":%s, ", brw_reg_type_to_letters(inst->dst.type));

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case VGRF:
         fprintf(file, "vgrf%d", inst->src[i].nr);
         break;
      case FIXED_GRF:
         fprintf(file, "g%d", inst->src[i].nr);
         break;
      case MRF:
         fprintf(file, "***m%d***", inst->src[i].nr);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].nr);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].nr);
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_HF:
            fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));
            break;
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%-gf", inst->src[i].f);
            break;
         case BRW_REGISTER_TYPE_DF:
            fprintf(file, "%fdf", inst->src[i].df);
            break;
         case BRW_REGISTER_TYPE_W:
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dd", inst->src[i].d);
            break;
         case BRW_REGISTER_TYPE_UW:
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uu", inst->src[i].ud);
            break;
         case BRW_REGISTER_TYPE_Q:
            fprintf(file, "%" PRId64 "q", inst->src[i].d64);
            break;
         case BRW_REGISTER_TYPE_UQ:
            fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
            break;
         case BRW_REGISTER_TYPE_VF:
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
            break;
         case BRW_REGISTER_TYPE_V:
         case BRW_REGISTER_TYPE_UV:
            fprintf(file, "%08x%s", inst->src[i].ud,
                    inst->src[i].type == BRW_REGISTER_TYPE_V ? "V" : "UV");
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case ARF:
         switch (inst->src[i].nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->src[i].subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->src[i].subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
            break;
         }
         break;
      }

      if (inst->src[i].offset ||
          (inst->src[i].file == VGRF &&
           alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
         const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
         fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
                 inst->src[i].offset % reg_size);
      }

      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         unsigned stride;
         if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
            unsigned hstride = inst->src[i].hstride;
            stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
         } else {
            stride = inst->src[i].stride;
         }
         if (stride != 1)
            fprintf(file, "<%u>", stride);

         fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));
      }

      if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   fprintf(file, " ");

   if (inst->force_writemask_all)
      fprintf(file, "NoMask ");

   if (inst->exec_size != dispatch_width)
      fprintf(file, "group%d ", inst->group);

   fprintf(file, "\n");
}

/**
 * Compute the number of GRF registers live at each instruction IP, summing
 * the sizes of live VGRFs and counting payload registers up to their last
 * use.
 */
brw::register_pressure::register_pressure(const fs_visitor *v)
{
   const fs_live_variables &live = v->live_analysis.require();
   const unsigned num_instructions = v->cfg->num_blocks ?
      v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;

   regs_live_at_ip = new unsigned[num_instructions]();

   for (unsigned reg = 0; reg < v->alloc.count; reg++) {
      for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
         regs_live_at_ip[ip] += v->alloc.sizes[reg];
   }

   const unsigned payload_count = v->first_non_payload_grf;

   int *payload_last_use_ip = new int[payload_count];
   v->calculate_payload_ranges(payload_count, payload_last_use_ip);

   for (unsigned reg = 0; reg < payload_count; reg++) {
      for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
         ++regs_live_at_ip[ip];
   }

   delete[] payload_last_use_ip;
}

brw::register_pressure::~register_pressure()
{
   delete[] regs_live_at_ip;
}

/**
 * Propagate analysis invalidation to the fs-specific analyses in addition to
 * whatever the base class tracks.
 */
void
fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
{
   backend_shader::invalidate_analysis(c);
   live_analysis.invalidate(c);
   regpressure_analysis.invalidate(c);
}

/**
 * Dump the current instruction list to a per-pass file under
 * INTEL_SHADER_OPTIMIZER_PATH when DEBUG_OPTIMIZER printing is enabled for
 * this shader.
 */
void
fs_visitor::debug_optimizer(const nir_shader *nir,
                            const char *pass_name,
                            int iteration, int pass_num) const
{
   if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
      return;

   char *filename;
   int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
                      debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
                      _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
                      iteration, pass_num, pass_name);
   /* asprintf leaves filename undefined on failure; bail without dumping. */
   if (ret == -1)
      return;
   dump_instructions(filename);
   free(filename);
}

/**
 * Run the backend optimization pipeline: an iterated fixed-point loop of
 * cleanup passes followed by the various lowering passes, validating the IR
 * after every pass.
 */
void
fs_visitor::optimize()
{
   debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   validate();

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

/* Run a pass, dump the result if it made progress, validate, and fold its
 * progress into the enclosing loop's progress flag.
 */
#define OPT(pass, args...) ({                                        \
      pass_num++;                                                    \
      bool this_progress = pass(args);                               \
                                                                     \
      if (this_progress)                                             \
         debug_optimizer(nir, #pass, iteration, pass_num);           \
                                                                     \
      validate();                                                    \
                                                                     \
      progress = progress || this_progress;                          \
      this_progress;                                                 \
   })

   assign_constant_locations();
   OPT(lower_constant_loads);

   validate();

   if (compiler->lower_dpas)
      OPT(brw_lower_dpas, *this);

   OPT(split_virtual_grfs);

   /* Before anything else, eliminate dead code.  The results of some NIR
    * instructions may effectively be calculated twice.  Once when the
    * instruction is encountered, and again when the user of that result is
    * encountered.  Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(dead_code_eliminate);

   OPT(remove_extra_rounding_modes);

   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(remove_duplicate_mrf_writes);

      OPT(opt_algebraic);
      OPT(opt_cse);
      OPT(opt_copy_propagation);
      OPT(opt_predicated_break, this);
      OPT(opt_cmod_propagation);
      OPT(dead_code_eliminate);
      OPT(opt_peephole_sel);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_saturate_propagation);
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(eliminate_find_live_channel);

      OPT(compact_virtual_grfs);
   } while (progress);

   progress = false;
   pass_num = 0;

   if (OPT(lower_pack)) {
      OPT(register_coalesce);
      OPT(dead_code_eliminate);
   }

   OPT(lower_simd_width);
   OPT(lower_barycentrics);
   OPT(lower_logical_sends);

   /* After logical SEND lowering. */

   if (OPT(opt_copy_propagation))
      OPT(opt_algebraic);

   /* Identify trailing zeros LOAD_PAYLOAD of sampler messages.
    * Do this before splitting SENDs.
    */
   if (devinfo->ver >= 7) {
      if (OPT(opt_zero_samples) && OPT(opt_copy_propagation))
         OPT(opt_algebraic);
   }

   OPT(opt_split_sends);
   OPT(fixup_nomask_control_flow);

   if (progress) {
      if (OPT(opt_copy_propagation))
         OPT(opt_algebraic);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(opt_cse);
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(dead_code_eliminate);
      OPT(remove_duplicate_mrf_writes);
      OPT(opt_peephole_sel);
   }

   OPT(opt_redundant_halt);

   if (OPT(lower_load_payload)) {
      OPT(split_virtual_grfs);

      /* Lower 64 bit MOVs generated by payload lowering. */
      if (!devinfo->has_64bit_float || !devinfo->has_64bit_int)
         OPT(opt_algebraic);

      OPT(register_coalesce);
      OPT(lower_simd_width);
      OPT(compute_to_mrf);
      OPT(dead_code_eliminate);
   }

   OPT(opt_combine_constants);
   if (OPT(lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
       * one more time to clean those up if they exist.
       */
      OPT(lower_integer_multiplication);
   }
   OPT(lower_sub_sat);

   if (devinfo->ver <= 5 && OPT(lower_minmax)) {
      OPT(opt_cmod_propagation);
      OPT(opt_cse);
      if (OPT(opt_copy_propagation))
         OPT(opt_algebraic);
      OPT(dead_code_eliminate);
   }

   progress = false;
   OPT(lower_derivatives);
   OPT(lower_regioning);
   if (progress) {
      if (OPT(opt_copy_propagation))
         OPT(opt_algebraic);
      OPT(dead_code_eliminate);
      OPT(lower_simd_width);
   }

   OPT(fixup_sends_duplicate_payload);

   OPT(lower_uniform_pull_constant_loads);

   OPT(lower_find_live_channel);

   validate();
}

/**
 * From the Skylake PRM Vol.
 * 2a docs for sends:
 *
 *    "It is required that the second block of GRFs does not overlap with the
 *    first block."
 *
 * There are plenty of cases where we may accidentally violate this due to
 * having, for instance, both sources be the constant 0.  This little pass
 * just adds a new vgrf for the second payload and copies it over.
 */
bool
fs_visitor::fixup_sends_duplicate_payload()
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
          regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
                          inst->src[3], inst->ex_mlen * REG_SIZE)) {
         fs_reg tmp = fs_reg(VGRF, alloc.allocate(inst->ex_mlen),
                             BRW_REGISTER_TYPE_UD);
         /* Sadly, we've lost all notion of channels and bit sizes at this
          * point.  Just WE_all it.
          */
         const fs_builder ibld = fs_builder(this, block, inst).exec_all().group(16, 0);
         fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD);
         fs_reg copy_dst = tmp;
         /* Copy two registers per SIMD16 MOV; a trailing odd register gets a
          * SIMD8 MOV instead.
          */
         for (unsigned i = 0; i < inst->ex_mlen; i += 2) {
            if (inst->ex_mlen == i + 1) {
               /* Only one register left; do SIMD8 */
               ibld.group(8, 0).MOV(copy_dst, copy_src);
            } else {
               ibld.MOV(copy_dst, copy_src);
            }
            copy_src = offset(copy_src, ibld, 1);
            copy_dst = offset(copy_dst, ibld, 1);
         }
         inst->src[3] = tmp;
         progress = true;
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Three source instruction must have a GRF/MRF destination register.
 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
 */
void
fs_visitor::fixup_3src_null_dest()
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->is_3src(compiler) && inst->dst.is_null()) {
         /* Replace the null destination with a scratch VGRF sized for the
          * current dispatch width.
          */
         inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                            inst->dst.type);
         progress = true;
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                          DEPENDENCY_VARIABLES);
}

/**
 * Returns true if \p inst is a UGM memory access that must be covered by a
 * dummy fence before EOT (see Wa_22013689345 below).
 */
static bool
needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
{
   /* This workaround is about making sure that any instruction writing
    * through UGM has completed before we hit EOT.
    */
   if (inst->sfid != GFX12_SFID_UGM)
      return false;

   /* Any UGM, non-Scratch-surface Stores (not including Atomic) messages,
    * where the L1-cache override is NOT among {WB, WS, WT}
    */
   enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
   if (lsc_opcode_is_store(opcode)) {
      switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
      case LSC_CACHE_STORE_L1STATE_L3MOCS:
      case LSC_CACHE_STORE_L1WB_L3WB:
      case LSC_CACHE_STORE_L1S_L3UC:
      case LSC_CACHE_STORE_L1S_L3WB:
      case LSC_CACHE_STORE_L1WT_L3UC:
      case LSC_CACHE_STORE_L1WT_L3WB:
         return false;

      default:
         return true;
      }
   }

   /* Any UGM Atomic message WITHOUT return value */
   if (lsc_opcode_is_atomic(opcode) && inst->dst.file == BAD_FILE)
      return true;

   return false;
}

/* Wa_14015360517
 *
 * The first instruction of any kernel should have non-zero emask.
 * Make sure this happens by introducing a dummy mov instruction.
 */
void
fs_visitor::emit_dummy_mov_instruction()
{
   if (!intel_needs_workaround(devinfo, 14015360517))
      return;

   struct backend_instruction *first_inst =
      cfg->first_block()->start();

   /* We can skip the WA if first instruction is marked with
    * force_writemask_all or exec_size equals dispatch_width.
    */
   if (first_inst->force_writemask_all ||
       first_inst->exec_size == dispatch_width)
      return;

   /* Insert dummy mov as first instruction. */
   const fs_builder ubld =
      fs_builder(this, cfg->first_block(), (fs_inst *)first_inst).exec_all().group(8, 0);
   /* A NoMask MOV to the null register satisfies the non-zero-emask
    * requirement without observable side effects.
    */
   ubld.MOV(ubld.null_reg_ud(), brw_imm_ud(0u));

   invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
}

/* Wa_22013689345
 *
 * We need to emit UGM fence message before EOT, if shader has any UGM write
 * or atomic message.
 *
 * TODO/FINISHME: According to Curro we could avoid the fence in some cases.
 *                We probably need a better criteria in needs_dummy_fence().
 */
void
fs_visitor::emit_dummy_memory_fence_before_eot()
{
   bool progress = false;
   bool has_ugm_write_or_atomic = false;

   if (!intel_needs_workaround(devinfo, 22013689345))
      return;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (!inst->eot) {
         if (needs_dummy_fence(devinfo, inst))
            has_ugm_write_or_atomic = true;
         continue;
      }

      if (!has_ugm_write_or_atomic)
         break;

      const fs_builder ibld(this, block, inst);
      const fs_builder ubld = ibld.exec_all().group(1, 0);

      fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      fs_inst *dummy_fence = ubld.emit(SHADER_OPCODE_MEMORY_FENCE,
                                       dst, brw_vec8_grf(0, 0),
                                       /* commit enable */ brw_imm_ud(1),
                                       /* bti */ brw_imm_ud(0));
      dummy_fence->sfid = GFX12_SFID_UGM;
      dummy_fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_TILE,
                                             LSC_FLUSH_TYPE_NONE_6, false);
      /* The scheduling fence makes the EOT send wait for the fence result. */
      ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst);
      progress = true;
      /* TODO: remove this break if we ever have shader with multiple EOT. */
      break;
   }

   if (progress) {
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
                          DEPENDENCY_VARIABLES);
   }
}

/**
 * Find the first instruction in the program that might start a region of
 * divergent control flow due to a HALT jump.
 * There is no
 * find_halt_control_flow_region_end(), the region of divergence extends until
 * the only SHADER_OPCODE_HALT_TARGET in the program.
 */
static const fs_inst *
find_halt_control_flow_region_start(const fs_visitor *v)
{
   foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
      if (inst->opcode == BRW_OPCODE_HALT ||
          inst->opcode == SHADER_OPCODE_HALT_TARGET)
         return inst;
   }

   return NULL;
}

/**
 * Work around the Gfx12 hardware bug filed as Wa_1407528679.  EU fusion
 * can cause a BB to be executed with all channels disabled, which will lead
 * to the execution of any NoMask instructions in it, even though any
 * execution-masked instructions will be correctly shot down.  This may break
 * assumptions of some NoMask SEND messages whose descriptor depends on data
 * generated by live invocations of the shader.
 *
 * This avoids the problem by predicating certain instructions on an ANY
 * horizontal predicate that makes sure that their execution is omitted when
 * all channels of the program are disabled.
 */
bool
fs_visitor::fixup_nomask_control_flow()
{
   if (devinfo->ver != 12)
      return false;

   /* Pick the ANY predicate wide enough to cover the whole dispatch. */
   const brw_predicate pred = dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
                              dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H :
                                                   BRW_PREDICATE_ALIGN1_ANY8H;
   const fs_inst *halt_start = find_halt_control_flow_region_start(this);
   unsigned depth = 0;
   bool progress = false;

   const fs_live_variables &live_vars = live_analysis.require();

   /* Scan the program backwards in order to be able to easily determine
    * whether the flag register is live at any point.
    */
   foreach_block_reverse_safe(block, cfg) {
      BITSET_WORD flag_liveout = live_vars.block_data[block->num]
                                          .flag_liveout[0];
      STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1);

      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
         if (!inst->predicate && inst->exec_size >= 8)
            flag_liveout &= ~inst->flags_written(devinfo);

         switch (inst->opcode) {
         case BRW_OPCODE_DO:
         case BRW_OPCODE_IF:
            /* Note that this doesn't handle BRW_OPCODE_HALT since only
             * the first one in the program closes the region of divergent
             * control flow due to any HALT instructions -- Instead this is
             * handled with the halt_start check below.
             */
            depth--;
            break;

         case BRW_OPCODE_WHILE:
         case BRW_OPCODE_ENDIF:
         case SHADER_OPCODE_HALT_TARGET:
            depth++;
            break;

         default:
            /* Note that the vast majority of NoMask SEND instructions in the
             * program are harmless while executed in a block with all
             * channels disabled, since any instructions with side effects we
             * could hit here should be execution-masked.
             *
             * The main concern is NoMask SEND instructions where the message
             * descriptor or header depends on data generated by live
             * invocations of the shader (RESINFO and
             * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
             * computed surface index seem to be the only examples right now
             * where this could easily lead to GPU hangs).  Unfortunately we
             * have no straightforward way to detect that currently, so just
             * predicate any NoMask SEND instructions we find under control
             * flow.
             *
             * If this proves to have a measurable performance impact it can
             * be easily extended with a whitelist of messages we know we can
             * safely omit the predication for.
+ */ + if (depth && inst->force_writemask_all && + is_send(inst) && !inst->predicate) { + /* We need to load the execution mask into the flag register by + * using a builder with channel group matching the whole shader + * (rather than the default which is derived from the original + * instruction), in order to avoid getting a right-shifted + * value. + */ + const fs_builder ubld = fs_builder(this, block, inst) + .exec_all().group(dispatch_width, 0); + const fs_reg flag = retype(brw_flag_reg(0, 0), + BRW_REGISTER_TYPE_UD); + + /* Due to the lack of flag register allocation we need to save + * and restore the flag register if it's live. + */ + const bool save_flag = flag_liveout & + flag_mask(flag, dispatch_width / 8); + const fs_reg tmp = ubld.group(8, 0).vgrf(flag.type); + + if (save_flag) { + ubld.group(8, 0).UNDEF(tmp); + ubld.group(1, 0).MOV(tmp, flag); + } + + ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS); + + set_predicate(pred, inst); + inst->flag_subreg = 0; + inst->predicate_trivial = true; + + if (save_flag) + ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp); + + progress = true; + } + break; + } + + if (inst == halt_start) + depth--; + + flag_liveout |= inst->flags_read(devinfo); + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +uint32_t +fs_visitor::compute_max_register_pressure() +{ + const register_pressure &rp = regpressure_analysis.require(); + uint32_t ip = 0, max_pressure = 0; + foreach_block_and_inst(block, backend_instruction, inst, cfg) { + max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]); + ip++; + } + return max_pressure; +} + +static fs_inst ** +save_instruction_order(const struct cfg_t *cfg) +{ + /* Before we schedule anything, stash off the instruction order as an array + * of fs_inst *. This way, we can reset it between scheduling passes to + * prevent dependencies between the different scheduling modes. 
+ */ + int num_insts = cfg->last_block()->end_ip + 1; + fs_inst **inst_arr = new fs_inst * [num_insts]; + + int ip = 0; + foreach_block_and_inst(block, fs_inst, inst, cfg) { + assert(ip >= block->start_ip && ip <= block->end_ip); + inst_arr[ip++] = inst; + } + assert(ip == num_insts); + + return inst_arr; +} + +static void +restore_instruction_order(struct cfg_t *cfg, fs_inst **inst_arr) +{ + ASSERTED int num_insts = cfg->last_block()->end_ip + 1; + + int ip = 0; + foreach_block (block, cfg) { + block->instructions.make_empty(); + + assert(ip == block->start_ip); + for (; ip <= block->end_ip; ip++) + block->instructions.push_tail(inst_arr[ip]); + } + assert(ip == num_insts); +} + +void +fs_visitor::allocate_registers(bool allow_spilling) +{ + bool allocated; + + static const enum instruction_scheduler_mode pre_modes[] = { + SCHEDULE_PRE, + SCHEDULE_PRE_NON_LIFO, + SCHEDULE_NONE, + SCHEDULE_PRE_LIFO, + }; + + static const char *scheduler_mode_name[] = { + [SCHEDULE_PRE] = "top-down", + [SCHEDULE_PRE_NON_LIFO] = "non-lifo", + [SCHEDULE_PRE_LIFO] = "lifo", + [SCHEDULE_POST] = "post", + [SCHEDULE_NONE] = "none", + }; + + uint32_t best_register_pressure = UINT32_MAX; + enum instruction_scheduler_mode best_sched = SCHEDULE_NONE; + + compact_virtual_grfs(); + + if (needs_register_pressure) + shader_stats.max_register_pressure = compute_max_register_pressure(); + + debug_optimizer(nir, "pre_register_allocate", 90, 90); + + bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS); + + /* Before we schedule anything, stash off the instruction order as an array + * of fs_inst *. This way, we can reset it between scheduling passes to + * prevent dependencies between the different scheduling modes. 
+ */ + fs_inst **orig_order = save_instruction_order(cfg); + fs_inst **best_pressure_order = NULL; + + void *scheduler_ctx = ralloc_context(NULL); + fs_instruction_scheduler *sched = prepare_scheduler(scheduler_ctx); + + /* Try each scheduling heuristic to see if it can successfully register + * allocate without spilling. They should be ordered by decreasing + * performance but increasing likelihood of allocating. + */ + for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) { + enum instruction_scheduler_mode sched_mode = pre_modes[i]; + + schedule_instructions_pre_ra(sched, sched_mode); + this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode]; + + debug_optimizer(nir, shader_stats.scheduler_mode, 95, i); + + if (0) { + assign_regs_trivial(); + allocated = true; + break; + } + + /* We should only spill registers on the last scheduling. */ + assert(!spilled_any_registers); + + allocated = assign_regs(false, spill_all); + if (allocated) + break; + + /* Save the maximum register pressure */ + uint32_t this_pressure = compute_max_register_pressure(); + + if (0) { + fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n", + scheduler_mode_name[sched_mode], this_pressure); + } + + if (this_pressure < best_register_pressure) { + best_register_pressure = this_pressure; + best_sched = sched_mode; + delete[] best_pressure_order; + best_pressure_order = save_instruction_order(cfg); + } + + /* Reset back to the original order before trying the next mode */ + restore_instruction_order(cfg, orig_order); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + } + + ralloc_free(scheduler_ctx); + + if (!allocated) { + if (0) { + fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n", + scheduler_mode_name[best_sched]); + } + restore_instruction_order(cfg, best_pressure_order); + shader_stats.scheduler_mode = scheduler_mode_name[best_sched]; + + allocated = assign_regs(allow_spilling, spill_all); + } + + delete[] orig_order; + delete[] 
best_pressure_order; + + if (!allocated) { + fail("Failure to register allocate. Reduce number of " + "live scalar values to avoid this."); + } else if (spilled_any_registers) { + brw_shader_perf_log(compiler, log_data, + "%s shader triggered register spilling. " + "Try reducing the number of live scalar " + "values to improve performance.\n", + _mesa_shader_stage_to_string(stage)); + } + + /* This must come after all optimization and register allocation, since + * it inserts dead code that happens to have side effects, and it does + * so based on the actual physical registers in use. + */ + insert_gfx4_send_dependency_workarounds(); + + if (failed) + return; + + opt_bank_conflicts(); + + schedule_instructions_post_ra(); + + if (last_scratch > 0) { + ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024; + + /* Take the max of any previously compiled variant of the shader. In the + * case of bindless shaders with return parts, this will also take the + * max of all parts. + */ + prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch), + prog_data->total_scratch); + + if (gl_shader_stage_is_compute(stage)) { + if (devinfo->platform == INTEL_PLATFORM_HSW) { + /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space" + * field documentation, Haswell supports a minimum of 2kB of + * scratch space for compute shaders, unlike every other stage + * and platform. + */ + prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048); + } else if (devinfo->ver <= 7) { + /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space" + * field documentation, platforms prior to Haswell measure scratch + * size linearly with a range of [1kB, 12kB] and 1kB granularity. + */ + prog_data->total_scratch = ALIGN(last_scratch, 1024); + max_scratch_size = 12 * 1024; + } + } + + /* We currently only support up to 2MB of scratch space. 
If we + * need to support more eventually, the documentation suggests + * that we could allocate a larger buffer, and partition it out + * ourselves. We'd just have to undo the hardware's address + * calculation by subtracting (FFTID * Per Thread Scratch Space) + * and then add FFTID * (Larger Per Thread Scratch Space). + * + * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline > + * Thread Group Tracking > Local Memory/Scratch Space. + */ + assert(prog_data->total_scratch < max_scratch_size); + } + + lower_scoreboard(); +} + +bool +fs_visitor::run_vs() +{ + assert(stage == MESA_SHADER_VERTEX); + + payload_ = new vs_thread_payload(*this); + + nir_to_brw(this); + + if (failed) + return false; + + emit_urb_writes(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_vs_urb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(true /* allow_spilling */); + + return !failed; +} + +void +fs_visitor::set_tcs_invocation_id() +{ + struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); + struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base; + const fs_builder bld = fs_builder(this).at_end(); + + const unsigned instance_id_mask = + (devinfo->verx10 >= 125) ? INTEL_MASK(7, 0) : + (devinfo->ver >= 11) ? INTEL_MASK(22, 16) : + INTEL_MASK(23, 17); + const unsigned instance_id_shift = + (devinfo->verx10 >= 125) ? 0 : (devinfo->ver >= 11) ? 
16 : 17; + + /* Get instance number from g0.2 bits: + * * 7:0 on DG2+ + * * 22:16 on gfx11+ + * * 23:17 otherwise + */ + fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)), + brw_imm_ud(instance_id_mask)); + + invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD); + + if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH) { + /* gl_InvocationID is just the thread number */ + bld.SHR(invocation_id, t, brw_imm_ud(instance_id_shift)); + return; + } + + assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH); + + fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW); + fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210))); + bld.MOV(channels_ud, channels_uw); + + if (tcs_prog_data->instances == 1) { + invocation_id = channels_ud; + } else { + fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.SHR(instance_times_8, t, brw_imm_ud(instance_id_shift - 3)); + bld.ADD(invocation_id, instance_times_8, channels_ud); + } +} + +void +fs_visitor::emit_tcs_thread_end() +{ + /* Try and tag the last URB write with EOT instead of emitting a whole + * separate write just to finish the thread. There isn't guaranteed to + * be one, so this may not succeed. + */ + if (devinfo->ver != 8 && mark_last_urb_write_with_eot()) + return; + + const fs_builder bld = fs_builder(this).at_end(); + + /* Emit a URB write to end the thread. On Broadwell, we use this to write + * zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy + * algorithm to set it optimally). On other platforms, we simply write + * zero to a reserved/MBZ patch header DWord which has no consequence. 
+ */ + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16); + srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1); + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + inst->eot = true; +} + +bool +fs_visitor::run_tcs() +{ + assert(stage == MESA_SHADER_TESS_CTRL); + + struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data); + const fs_builder bld = fs_builder(this).at_end(); + + assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH || + vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH); + + payload_ = new tcs_thread_payload(*this); + + /* Initialize gl_InvocationID */ + set_tcs_invocation_id(); + + const bool fix_dispatch_mask = + vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH && + (nir->info.tess.tcs_vertices_out % 8) != 0; + + /* Fix the disptach mask */ + if (fix_dispatch_mask) { + bld.CMP(bld.null_reg_ud(), invocation_id, + brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L); + bld.IF(BRW_PREDICATE_NORMAL); + } + + nir_to_brw(this); + + if (fix_dispatch_mask) { + bld.emit(BRW_OPCODE_ENDIF); + } + + emit_tcs_thread_end(); + + if (failed) + return false; + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_tcs_urb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(true /* allow_spilling */); + + return !failed; +} + +bool +fs_visitor::run_tes() +{ + assert(stage == MESA_SHADER_TESS_EVAL); + + payload_ = new tes_thread_payload(*this); + + nir_to_brw(this); + + if (failed) + return false; + + emit_urb_writes(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_tes_urb_setup(); + + fixup_3src_null_dest(); + 
emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(true /* allow_spilling */); + + return !failed; +} + +bool +fs_visitor::run_gs() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + payload_ = new gs_thread_payload(*this); + + this->final_gs_vertex_count = vgrf(glsl_uint_type()); + + if (gs_compile->control_data_header_size_bits > 0) { + /* Create a VGRF to store accumulated control data bits. */ + this->control_data_bits = vgrf(glsl_uint_type()); + + /* If we're outputting more than 32 control data bits, then EmitVertex() + * will set control_data_bits to 0 after emitting the first vertex. + * Otherwise, we need to initialize it to 0 here. + */ + if (gs_compile->control_data_header_size_bits <= 32) { + const fs_builder bld = fs_builder(this).at_end(); + const fs_builder abld = bld.annotate("initialize control data bits"); + abld.MOV(this->control_data_bits, brw_imm_ud(0u)); + } + } + + nir_to_brw(this); + + emit_gs_thread_end(); + + if (failed) + return false; + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_gs_urb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(true /* allow_spilling */); + + return !failed; +} + +/* From the SKL PRM, Volume 16, Workarounds: + * + * 0877 3D Pixel Shader Hang possible when pixel shader dispatched with + * only header phases (R0-R2) + * + * WA: Enable a non-header phase (e.g. push constant) when dispatch would + * have been header only. + * + * Instead of enabling push constants one can alternatively enable one of the + * inputs. Here one simply chooses "layer" which shouldn't impose much + * overhead. 
+ */ +static void +gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data) +{ + if (wm_prog_data->num_varying_inputs) + return; + + if (wm_prog_data->base.curb_read_length) + return; + + wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0; + wm_prog_data->num_varying_inputs = 1; + + brw_compute_urb_setup_index(wm_prog_data); +} + +bool +fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) +{ + struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); + brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key; + const fs_builder bld = fs_builder(this).at_end(); + + assert(stage == MESA_SHADER_FRAGMENT); + + payload_ = new fs_thread_payload(*this, source_depth_to_render_target, + runtime_check_aads_emit); + + if (do_rep_send) { + assert(dispatch_width == 16); + emit_repclear_shader(); + } else { + if (nir->info.inputs_read > 0 || + BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) || + (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) { + if (devinfo->ver < 6) + emit_interpolation_setup_gfx4(); + else + emit_interpolation_setup_gfx6(); + } + + /* We handle discards by keeping track of the still-live pixels in f0.1. + * Initialize it with the dispatched pixels. + */ + if (wm_prog_data->uses_kill) { + const unsigned lower_width = MIN2(dispatch_width, 16); + for (unsigned i = 0; i < dispatch_width / lower_width; i++) { + /* According to the "PS Thread Payload for Normal + * Dispatch" pages on the BSpec, the dispatch mask is + * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on + * gfx6+. + */ + const fs_reg dispatch_mask = + devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) : + devinfo->ver >= 6 ? 
brw_vec1_grf(i + 1, 7) : + brw_vec1_grf(0, 0); + bld.exec_all().group(1, 0) + .MOV(brw_sample_mask_reg(bld.group(lower_width, i)), + retype(dispatch_mask, BRW_REGISTER_TYPE_UW)); + } + } + + if (nir->info.writes_memory) + wm_prog_data->has_side_effects = true; + + nir_to_brw(this); + + if (failed) + return false; + + if (wm_key->emit_alpha_test) + emit_alpha_test(); + + emit_fb_writes(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + + if (devinfo->ver == 9) + gfx9_ps_header_only_workaround(wm_prog_data); + + assign_urb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(allow_spilling); + } + + return !failed; +} + +bool +fs_visitor::run_cs(bool allow_spilling) +{ + assert(gl_shader_stage_is_compute(stage)); + assert(devinfo->ver >= 7); + const fs_builder bld = fs_builder(this).at_end(); + + payload_ = new cs_thread_payload(*this); + + if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) { + /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */ + const fs_builder abld = bld.exec_all().group(1, 0); + abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW), + suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1)); + } + + nir_to_brw(this); + + if (failed) + return false; + + emit_cs_terminate(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(allow_spilling); + + return !failed; +} + +bool +fs_visitor::run_bs(bool allow_spilling) +{ + assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE); + + payload_ = new bs_thread_payload(*this); + + nir_to_brw(this); + + if (failed) + return false; + + /* TODO(RT): Perhaps rename this? 
*/ + emit_cs_terminate(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(allow_spilling); + + return !failed; +} + +bool +fs_visitor::run_task(bool allow_spilling) +{ + assert(stage == MESA_SHADER_TASK); + + payload_ = new task_mesh_thread_payload(*this); + + nir_to_brw(this); + + if (failed) + return false; + + emit_urb_fence(); + + emit_cs_terminate(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(allow_spilling); + + return !failed; +} + +bool +fs_visitor::run_mesh(bool allow_spilling) +{ + assert(stage == MESA_SHADER_MESH); + + payload_ = new task_mesh_thread_payload(*this); + + nir_to_brw(this); + + if (failed) + return false; + + emit_urb_fence(); + + emit_cs_terminate(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(allow_spilling); + + return !failed; +} + +static bool +is_used_in_not_interp_frag_coord(nir_def *def) +{ + nir_foreach_use_including_if(src, def) { + if (nir_src_is_if(src)) + return true; + + if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic) + return true; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src)); + if (intrin->intrinsic != nir_intrinsic_load_frag_coord) + return true; + } + + return false; +} + +/** + * Return a bitfield where bit n is set if barycentric interpolation mode n + * (see enum brw_barycentric_mode) is needed by the fragment shader. 
+ * + * We examine the load_barycentric intrinsics rather than looking at input + * variables so that we catch interpolateAtCentroid() messages too, which + * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up. + */ +static unsigned +brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo, + const nir_shader *shader) +{ + unsigned barycentric_interp_modes = 0; + + nir_foreach_function_impl(impl, shader) { + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: + break; + default: + continue; + } + + /* Ignore WPOS; it doesn't require interpolation. */ + if (!is_used_in_not_interp_frag_coord(&intrin->def)) + continue; + + nir_intrinsic_op bary_op = intrin->intrinsic; + enum brw_barycentric_mode bary = + brw_barycentric_mode(intrin); + + barycentric_interp_modes |= 1 << bary; + + if (devinfo->needs_unlit_centroid_workaround && + bary_op == nir_intrinsic_load_barycentric_centroid) + barycentric_interp_modes |= 1 << centroid_to_pixel(bary); + } + } + } + + return barycentric_interp_modes; +} + +static void +brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data, + const nir_shader *shader) +{ + prog_data->flat_inputs = 0; + + nir_foreach_shader_in_variable(var, shader) { + /* flat shading */ + if (var->data.interpolation != INTERP_MODE_FLAT) + continue; + + if (var->data.per_primitive) + continue; + + unsigned slots = glsl_count_attribute_slots(var->type, false); + for (unsigned s = 0; s < slots; s++) { + int input_index = prog_data->urb_setup[var->data.location + s]; + + if (input_index >= 0) + prog_data->flat_inputs |= 1 << 
input_index; + } + } +} + +static uint8_t +computed_depth_mode(const nir_shader *shader) +{ + if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { + switch (shader->info.fs.depth_layout) { + case FRAG_DEPTH_LAYOUT_NONE: + case FRAG_DEPTH_LAYOUT_ANY: + return BRW_PSCDEPTH_ON; + case FRAG_DEPTH_LAYOUT_GREATER: + return BRW_PSCDEPTH_ON_GE; + case FRAG_DEPTH_LAYOUT_LESS: + return BRW_PSCDEPTH_ON_LE; + case FRAG_DEPTH_LAYOUT_UNCHANGED: + /* We initially set this to OFF, but having the shader write the + * depth means we allocate register space in the SEND message. The + * difference between the SEND register count and the OFF state + * programming makes the HW hang. + * + * Removing the depth writes also leads to test failures. So use + * LesserThanOrEqual, which fits writing the same value + * (unchanged/equal). + * + */ + return BRW_PSCDEPTH_ON_LE; + } + } + return BRW_PSCDEPTH_OFF; +} + +/** + * Move load_interpolated_input with simple (payload-based) barycentric modes + * to the top of the program so we don't emit multiple PLNs for the same input. + * + * This works around CSE not being able to handle non-dominating cases + * such as: + * + * if (...) { + * interpolate input + * } else { + * interpolate the same exact input + * } + * + * This should be replaced by global value numbering someday. 
+ */ +bool +brw_nir_move_interpolation_to_top(nir_shader *nir) +{ + bool progress = false; + + nir_foreach_function_impl(impl, nir) { + nir_block *top = nir_start_block(impl); + nir_cursor cursor = nir_before_instr(nir_block_first_instr(top)); + bool impl_progress = false; + + for (nir_block *block = nir_block_cf_tree_next(top); + block != NULL; + block = nir_block_cf_tree_next(block)) { + + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_interpolated_input) + continue; + nir_intrinsic_instr *bary_intrinsic = + nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr); + nir_intrinsic_op op = bary_intrinsic->intrinsic; + + /* Leave interpolateAtSample/Offset() where they are. */ + if (op == nir_intrinsic_load_barycentric_at_sample || + op == nir_intrinsic_load_barycentric_at_offset) + continue; + + nir_instr *move[3] = { + &bary_intrinsic->instr, + intrin->src[1].ssa->parent_instr, + instr + }; + + for (unsigned i = 0; i < ARRAY_SIZE(move); i++) { + if (move[i]->block != top) { + nir_instr_move(cursor, move[i]); + impl_progress = true; + } + } + } + } + + progress = progress || impl_progress; + + nir_metadata_preserve(impl, impl_progress ? (nir_metadata_block_index | + nir_metadata_dominance) + : nir_metadata_all); + } + + return progress; +} + +static void +brw_nir_populate_wm_prog_data(nir_shader *shader, + const struct intel_device_info *devinfo, + const struct brw_wm_prog_key *key, + struct brw_wm_prog_data *prog_data, + const struct brw_mue_map *mue_map) +{ + /* key->alpha_test_func means simulating alpha testing via discards, + * so the shader definitely kills pixels. 
+ */ + prog_data->uses_kill = shader->info.fs.uses_discard || + shader->info.fs.uses_demote || + key->emit_alpha_test; + prog_data->uses_omask = !key->ignore_sample_mask_out && + (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); + prog_data->color_outputs_written = key->color_outputs_valid; + prog_data->max_polygons = 1; + prog_data->computed_depth_mode = computed_depth_mode(shader); + prog_data->computed_stencil = + shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); + + prog_data->sample_shading = + shader->info.fs.uses_sample_shading || + shader->info.outputs_read; + + assert(key->multisample_fbo != BRW_NEVER || + key->persample_interp == BRW_NEVER); + + prog_data->persample_dispatch = key->persample_interp; + if (prog_data->sample_shading) + prog_data->persample_dispatch = BRW_ALWAYS; + + /* We can only persample dispatch if we have a multisample FBO */ + prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch, + key->multisample_fbo); + + /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If + * persample_dispatch & multisample_fbo are not dynamic, Anv should be able + * to definitively tell whether alpha_to_coverage is on or off. + */ + prog_data->alpha_to_coverage = key->alpha_to_coverage; + assert(prog_data->alpha_to_coverage != BRW_SOMETIMES || + prog_data->persample_dispatch == BRW_SOMETIMES); + + if (devinfo->ver >= 6) { + prog_data->uses_sample_mask = + BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN); + + /* From the Ivy Bridge PRM documentation for 3DSTATE_PS: + * + * "MSDISPMODE_PERSAMPLE is required in order to select + * POSOFFSET_SAMPLE" + * + * So we can only really get sample positions if we are doing real + * per-sample dispatch. If we need gl_SamplePosition and we don't have + * persample dispatch, we hard-code it to 0.5. 
+ */ + prog_data->uses_pos_offset = + prog_data->persample_dispatch != BRW_NEVER && + (BITSET_TEST(shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_POS) || + BITSET_TEST(shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_POS_OR_CENTER)); + } + + prog_data->has_render_target_reads = shader->info.outputs_read != 0ull; + + prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests; + prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage; + prog_data->inner_coverage = shader->info.fs.inner_coverage; + + prog_data->barycentric_interp_modes = + brw_compute_barycentric_interp_modes(devinfo, shader); + + /* From the BDW PRM documentation for 3DSTATE_WM: + * + * "MSDISPMODE_PERSAMPLE is required in order to select Perspective + * Sample or Non- perspective Sample barycentric coordinates." + * + * So cleanup any potentially set sample barycentric mode when not in per + * sample dispatch. + */ + if (prog_data->persample_dispatch == BRW_NEVER) { + prog_data->barycentric_interp_modes &= + ~BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE); + } + + prog_data->uses_nonperspective_interp_modes |= + (prog_data->barycentric_interp_modes & + BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0; + + /* The current VK_EXT_graphics_pipeline_library specification requires + * coarse to specified at compile time. But per sample interpolation can be + * dynamic. So we should never be in a situation where coarse & + * persample_interp are both respectively true & BRW_ALWAYS. + * + * Coarse will dynamically turned off when persample_interp is active. 
+ */ + assert(!key->coarse_pixel || key->persample_interp != BRW_ALWAYS); + + prog_data->coarse_pixel_dispatch = + brw_sometimes_invert(prog_data->persample_dispatch); + if (!key->coarse_pixel || + prog_data->uses_omask || + prog_data->sample_shading || + prog_data->uses_sample_mask || + (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) || + prog_data->computed_stencil) { + prog_data->coarse_pixel_dispatch = BRW_NEVER; + } + + /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater, + * Message Descriptor : + * + * "Message Type. Specifies the type of message being sent when + * pixel-rate evaluation is requested : + * + * Format = U2 + * 0: Per Message Offset (eval_snapped with immediate offset) + * 1: Sample Position Offset (eval_sindex) + * 2: Centroid Position Offset (eval_centroid) + * 3: Per Slot Offset (eval_snapped with register offset) + * + * Message Type. Specifies the type of message being sent when + * coarse-rate evaluation is requested : + * + * Format = U2 + * 0: Coarse to Pixel Mapping Message (internal message) + * 1: Reserved + * 2: Coarse Centroid Position (eval_centroid) + * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)" + * + * The Sample Position Offset is marked as reserved for coarse rate + * evaluation and leads to hangs if we try to use it. So disable coarse + * pixel shading if we have any intrinsic that will result in a pixel + * interpolater message at sample. + */ + if (intel_nir_pulls_at_sample(shader)) + prog_data->coarse_pixel_dispatch = BRW_NEVER; + + /* We choose to always enable VMask prior to XeHP, as it would cause + * us to lose out on the eliminate_find_live_channel() optimization. 
+ */ + prog_data->uses_vmask = devinfo->verx10 < 125 || + shader->info.fs.needs_quad_helper_invocations || + shader->info.uses_wide_subgroup_intrinsics || + prog_data->coarse_pixel_dispatch != BRW_NEVER; + + prog_data->uses_src_w = + BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); + prog_data->uses_src_depth = + BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && + prog_data->coarse_pixel_dispatch != BRW_ALWAYS; + prog_data->uses_depth_w_coefficients = + BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && + prog_data->coarse_pixel_dispatch != BRW_NEVER; + + calculate_urb_setup(devinfo, key, prog_data, shader, mue_map); + brw_compute_flat_inputs(prog_data, shader); +} + +/** + * Pre-gfx6, the register file of the EUs was shared between threads, + * and each thread used some subset allocated on a 16-register block + * granularity. The unit states wanted these block counts. + */ +static inline int +brw_register_blocks(int reg_count) +{ + return ALIGN(reg_count, 16) / 16 - 1; +} + +const unsigned * +brw_compile_fs(const struct brw_compiler *compiler, + struct brw_compile_fs_params *params) +{ + struct nir_shader *nir = params->base.nir; + const struct brw_wm_prog_key *key = params->key; + struct brw_wm_prog_data *prog_data = params->prog_data; + bool allow_spilling = params->allow_spilling; + const bool debug_enabled = + brw_should_print_shader(nir, params->base.debug_flag ? + params->base.debug_flag : DEBUG_WM); + + prog_data->base.stage = MESA_SHADER_FRAGMENT; + prog_data->base.ray_queries = nir->info.ray_queries; + prog_data->base.total_scratch = 0; + + const struct intel_device_info *devinfo = compiler->devinfo; + const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 
32 : 16; + + brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size); + brw_nir_lower_fs_inputs(nir, devinfo, key); + brw_nir_lower_fs_outputs(nir); + + if (devinfo->ver < 6) + brw_setup_vue_interpolation(params->vue_map, nir, prog_data); + + /* From the SKL PRM, Volume 7, "Alpha Coverage": + * "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in + * hardware, regardless of the state setting for this feature." + */ + if (devinfo->ver > 6 && key->alpha_to_coverage != BRW_NEVER) { + /* Run constant fold optimization in order to get the correct source + * offset to determine render target 0 store instruction in + * emit_alpha_to_coverage pass. + */ + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage, key, prog_data); + } + + NIR_PASS(_, nir, brw_nir_move_interpolation_to_top); + brw_postprocess_nir(nir, compiler, debug_enabled, + key->base.robust_flags); + + brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data, + params->mue_map); + + std::unique_ptr v8, v16, v32, vmulti; + cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL, + *multi_cfg = NULL; + float throughput = 0; + bool has_spilled = false; + + if (devinfo->ver < 20) { + v8 = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 8, 1, + params->base.stats != NULL, + debug_enabled); + if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) { + params->base.error_str = ralloc_strdup(params->base.mem_ctx, + v8->fail_msg); + return NULL; + } else if (INTEL_SIMD(FS, 8)) { + simd8_cfg = v8->cfg; + + assert(v8->payload().num_regs % reg_unit(devinfo) == 0); + prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo); + + prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used); + const performance &perf = v8->performance_analysis.require(); + throughput = MAX2(throughput, perf.throughput); + has_spilled = v8->spilled_any_registers; + allow_spilling = false; + } + } + + /* Limit dispatch width 
to simd8 with dual source blending on gfx8. + * See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917 + */ + if (devinfo->ver == 8 && prog_data->dual_src_blend && + INTEL_SIMD(FS, 8)) { + assert(!params->use_rep_send); + v8->limit_dispatch_width(8, "gfx8 workaround: " + "using SIMD8 when dual src blending.\n"); + } + + if (key->coarse_pixel && devinfo->ver < 20) { + if (prog_data->dual_src_blend) { + v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot" + " use SIMD8 messages.\n"); + } + v8->limit_dispatch_width(16, "SIMD32 not supported with coarse" + " pixel shading.\n"); + } + + if (nir->info.ray_queries > 0 && v8) + v8->limit_dispatch_width(16, "SIMD32 with ray queries.\n"); + + if (!has_spilled && + (!v8 || v8->max_dispatch_width >= 16) && + (INTEL_SIMD(FS, 16) || params->use_rep_send)) { + /* Try a SIMD16 compile */ + v16 = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 16, 1, + params->base.stats != NULL, + debug_enabled); + if (v8) + v16->import_uniforms(v8.get()); + if (!v16->run_fs(allow_spilling, params->use_rep_send)) { + brw_shader_perf_log(compiler, params->base.log_data, + "SIMD16 shader failed to compile: %s\n", + v16->fail_msg); + } else { + simd16_cfg = v16->cfg; + + assert(v16->payload().num_regs % reg_unit(devinfo) == 0); + prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo); + + prog_data->reg_blocks_16 = brw_register_blocks(v16->grf_used); + const performance &perf = v16->performance_analysis.require(); + throughput = MAX2(throughput, perf.throughput); + has_spilled = v16->spilled_any_registers; + allow_spilling = false; + } + } + + const bool simd16_failed = v16 && !simd16_cfg; + + /* Currently, the compiler only supports SIMD32 on SNB+ */ + if (!has_spilled && + (!v8 || v8->max_dispatch_width >= 32) && + (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send && + devinfo->ver >= 6 && !simd16_failed && + INTEL_SIMD(FS, 32)) { + /* Try a SIMD32 compile */ + v32 = 
std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 32, 1, + params->base.stats != NULL, + debug_enabled); + if (v8) + v32->import_uniforms(v8.get()); + else if (v16) + v32->import_uniforms(v16.get()); + + if (!v32->run_fs(allow_spilling, false)) { + brw_shader_perf_log(compiler, params->base.log_data, + "SIMD32 shader failed to compile: %s\n", + v32->fail_msg); + } else { + const performance &perf = v32->performance_analysis.require(); + + if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) { + brw_shader_perf_log(compiler, params->base.log_data, + "SIMD32 shader inefficient\n"); + } else { + simd32_cfg = v32->cfg; + + assert(v32->payload().num_regs % reg_unit(devinfo) == 0); + prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo); + + prog_data->reg_blocks_32 = brw_register_blocks(v32->grf_used); + throughput = MAX2(throughput, perf.throughput); + } + } + } + + if (devinfo->ver >= 12 && !has_spilled && + params->max_polygons >= 2 && !key->coarse_pixel) { + fs_visitor *vbase = v8 ? v8.get() : v16 ? 
v16.get() : v32.get(); + assert(vbase); + + if (devinfo->ver >= 20 && + params->max_polygons >= 4 && + vbase->max_dispatch_width >= 32 && + 4 * prog_data->num_varying_inputs <= MAX_VARYING && + INTEL_SIMD(FS, 4X8)) { + /* Try a quad-SIMD8 compile */ + vmulti = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 32, 4, + params->base.stats != NULL, + debug_enabled); + vmulti->import_uniforms(vbase); + if (!vmulti->run_fs(false, params->use_rep_send)) { + brw_shader_perf_log(compiler, params->base.log_data, + "Quad-SIMD8 shader failed to compile: %s\n", + vmulti->fail_msg); + } else { + multi_cfg = vmulti->cfg; + assert(!vmulti->spilled_any_registers); + } + } + + if (!multi_cfg && devinfo->ver >= 20 && + vbase->max_dispatch_width >= 32 && + 2 * prog_data->num_varying_inputs <= MAX_VARYING && + INTEL_SIMD(FS, 2X16)) { + /* Try a dual-SIMD16 compile */ + vmulti = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 32, 2, + params->base.stats != NULL, + debug_enabled); + vmulti->import_uniforms(vbase); + if (!vmulti->run_fs(false, params->use_rep_send)) { + brw_shader_perf_log(compiler, params->base.log_data, + "Dual-SIMD16 shader failed to compile: %s\n", + vmulti->fail_msg); + } else { + multi_cfg = vmulti->cfg; + assert(!vmulti->spilled_any_registers); + } + } + + if (!multi_cfg && vbase->max_dispatch_width >= 16 && + 2 * prog_data->num_varying_inputs <= MAX_VARYING && + INTEL_SIMD(FS, 2X8)) { + /* Try a dual-SIMD8 compile */ + vmulti = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 16, 2, + params->base.stats != NULL, + debug_enabled); + vmulti->import_uniforms(vbase); + if (!vmulti->run_fs(allow_spilling, params->use_rep_send)) { + brw_shader_perf_log(compiler, params->base.log_data, + "Dual-SIMD8 shader failed to compile: %s\n", + vmulti->fail_msg); + } else { + multi_cfg = vmulti->cfg; + } + } + + if (multi_cfg) { + assert(vmulti->payload().num_regs % reg_unit(devinfo) == 0); + prog_data->base.dispatch_grf_start_reg = 
vmulti->payload().num_regs / reg_unit(devinfo); + + prog_data->reg_blocks_8 = brw_register_blocks(vmulti->grf_used); + } + } + + /* When the caller requests a repclear shader, they want SIMD16-only */ + if (params->use_rep_send) + simd8_cfg = NULL; + + /* Prior to Iron Lake, the PS had a single shader offset with a jump table + * at the top to select the shader. We've never implemented that. + * Instead, we just give them exactly one shader and we pick the widest one + * available. + */ + if (compiler->devinfo->ver < 5) { + if (simd32_cfg || simd16_cfg) + simd8_cfg = NULL; + if (simd32_cfg) + simd16_cfg = NULL; + } + + /* If computed depth is enabled SNB only allows SIMD8. */ + if (compiler->devinfo->ver == 6 && + prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) + assert(simd16_cfg == NULL && simd32_cfg == NULL); + + if (compiler->devinfo->ver <= 5 && !simd8_cfg) { + /* Iron lake and earlier only have one Dispatch GRF start field. Make + * the data available in the base prog data struct for convenience. + */ + if (simd16_cfg) { + prog_data->base.dispatch_grf_start_reg = + prog_data->dispatch_grf_start_reg_16; + } else if (simd32_cfg) { + prog_data->base.dispatch_grf_start_reg = + prog_data->dispatch_grf_start_reg_32; + } + } + + fs_generator g(compiler, ¶ms->base, &prog_data->base, + v8 && v8->runtime_check_aads_emit, MESA_SHADER_FRAGMENT); + + if (unlikely(debug_enabled)) { + g.enable_debug(ralloc_asprintf(params->base.mem_ctx, + "%s fragment shader %s", + nir->info.label ? + nir->info.label : "unnamed", + nir->info.name)); + } + + struct brw_compile_stats *stats = params->base.stats; + uint32_t max_dispatch_width = 0; + + if (multi_cfg) { + prog_data->dispatch_multi = vmulti->dispatch_width; + prog_data->max_polygons = vmulti->max_polygons; + g.generate_code(multi_cfg, vmulti->dispatch_width, vmulti->shader_stats, + vmulti->performance_analysis.require(), + stats, vmulti->max_polygons); + stats = stats ? 
stats + 1 : NULL; + max_dispatch_width = vmulti->dispatch_width; + + } else if (simd8_cfg) { + prog_data->dispatch_8 = true; + g.generate_code(simd8_cfg, 8, v8->shader_stats, + v8->performance_analysis.require(), stats, 1); + stats = stats ? stats + 1 : NULL; + max_dispatch_width = 8; + } + + if (simd16_cfg) { + prog_data->dispatch_16 = true; + prog_data->prog_offset_16 = g.generate_code( + simd16_cfg, 16, v16->shader_stats, + v16->performance_analysis.require(), stats, 1); + stats = stats ? stats + 1 : NULL; + max_dispatch_width = 16; + } + + if (simd32_cfg) { + prog_data->dispatch_32 = true; + prog_data->prog_offset_32 = g.generate_code( + simd32_cfg, 32, v32->shader_stats, + v32->performance_analysis.require(), stats, 1); + stats = stats ? stats + 1 : NULL; + max_dispatch_width = 32; + } + + for (struct brw_compile_stats *s = params->base.stats; s != NULL && s != stats; s++) + s->max_dispatch_width = max_dispatch_width; + + g.add_const_data(nir->constant_data, nir->constant_data_size); + return g.get_assembly(); +} + +unsigned +brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data, + unsigned threads) +{ + assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0); + assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0); + return cs_prog_data->push.per_thread.size * threads + + cs_prog_data->push.cross_thread.size; +} + +static void +fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords) +{ + block->dwords = dwords; + block->regs = DIV_ROUND_UP(dwords, 8); + block->size = block->regs * 32; +} + +static void +cs_fill_push_const_info(const struct intel_device_info *devinfo, + struct brw_cs_prog_data *cs_prog_data) +{ + const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; + int subgroup_id_index = brw_get_subgroup_id_param_index(devinfo, prog_data); + bool cross_thread_supported = devinfo->verx10 >= 75; + + /* The thread ID should be stored in the last param dword */ + assert(subgroup_id_index == -1 
|| + subgroup_id_index == (int)prog_data->nr_params - 1); + + unsigned cross_thread_dwords, per_thread_dwords; + if (!cross_thread_supported) { + cross_thread_dwords = 0u; + per_thread_dwords = prog_data->nr_params; + } else if (subgroup_id_index >= 0) { + /* Fill all but the last register with cross-thread payload */ + cross_thread_dwords = 8 * (subgroup_id_index / 8); + per_thread_dwords = prog_data->nr_params - cross_thread_dwords; + assert(per_thread_dwords > 0 && per_thread_dwords <= 8); + } else { + /* Fill all data using cross-thread payload */ + cross_thread_dwords = prog_data->nr_params; + per_thread_dwords = 0u; + } + + fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords); + fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords); + + assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 || + cs_prog_data->push.per_thread.size == 0); + assert(cs_prog_data->push.cross_thread.dwords + + cs_prog_data->push.per_thread.dwords == + prog_data->nr_params); +} + +static bool +filter_simd(const nir_instr *instr, const void * /* options */) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + switch (nir_instr_as_intrinsic(instr)->intrinsic) { + case nir_intrinsic_load_simd_width_intel: + case nir_intrinsic_load_subgroup_id: + return true; + + default: + return false; + } +} + +static nir_def * +lower_simd(nir_builder *b, nir_instr *instr, void *options) +{ + uintptr_t simd_width = (uintptr_t)options; + + switch (nir_instr_as_intrinsic(instr)->intrinsic) { + case nir_intrinsic_load_simd_width_intel: + return nir_imm_int(b, simd_width); + + case nir_intrinsic_load_subgroup_id: + /* If the whole workgroup fits in one thread, we can lower subgroup_id + * to a constant zero. 
+ */ + if (!b->shader->info.workgroup_size_variable) { + unsigned local_workgroup_size = b->shader->info.workgroup_size[0] * + b->shader->info.workgroup_size[1] * + b->shader->info.workgroup_size[2]; + if (local_workgroup_size <= simd_width) + return nir_imm_int(b, 0); + } + return NULL; + + default: + return NULL; + } +} + +bool +brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width) +{ + return nir_shader_lower_instructions(nir, filter_simd, lower_simd, + (void *)(uintptr_t)dispatch_width); +} + +const unsigned * +brw_compile_cs(const struct brw_compiler *compiler, + struct brw_compile_cs_params *params) +{ + const nir_shader *nir = params->base.nir; + const struct brw_cs_prog_key *key = params->key; + struct brw_cs_prog_data *prog_data = params->prog_data; + + const bool debug_enabled = + brw_should_print_shader(nir, params->base.debug_flag ? + params->base.debug_flag : DEBUG_CS); + + prog_data->base.stage = MESA_SHADER_COMPUTE; + prog_data->base.total_shared = nir->info.shared_size; + prog_data->base.ray_queries = nir->info.ray_queries; + prog_data->base.total_scratch = 0; + + if (!nir->info.workgroup_size_variable) { + prog_data->local_size[0] = nir->info.workgroup_size[0]; + prog_data->local_size[1] = nir->info.workgroup_size[1]; + prog_data->local_size[2] = nir->info.workgroup_size[2]; + } + + brw_simd_selection_state simd_state{ + .devinfo = compiler->devinfo, + .prog_data = prog_data, + .required_width = brw_required_dispatch_width(&nir->info), + }; + + std::unique_ptr v[3]; + + for (unsigned simd = 0; simd < 3; simd++) { + if (!brw_simd_should_compile(simd_state, simd)) + continue; + + const unsigned dispatch_width = 8u << simd; + + nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir); + brw_nir_apply_key(shader, compiler, &key->base, + dispatch_width); + + NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width); + + /* Clean up after the local index and ID calculations. 
*/ + NIR_PASS(_, shader, nir_opt_constant_folding); + NIR_PASS(_, shader, nir_opt_dce); + + brw_postprocess_nir(shader, compiler, debug_enabled, + key->base.robust_flags); + + v[simd] = std::make_unique(compiler, ¶ms->base, + &key->base, + &prog_data->base, + shader, dispatch_width, + params->base.stats != NULL, + debug_enabled); + + const int first = brw_simd_first_compiled(simd_state); + if (first >= 0) + v[simd]->import_uniforms(v[first].get()); + + const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable; + + if (v[simd]->run_cs(allow_spilling)) { + cs_fill_push_const_info(compiler->devinfo, prog_data); + + brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); + } else { + simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg); + if (simd > 0) { + brw_shader_perf_log(compiler, params->base.log_data, + "SIMD%u shader failed to compile: %s\n", + dispatch_width, v[simd]->fail_msg); + } + } + } + + const int selected_simd = brw_simd_select(simd_state); + if (selected_simd < 0) { + params->base.error_str = + ralloc_asprintf(params->base.mem_ctx, + "Can't compile shader: " + "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n", + simd_state.error[0], simd_state.error[1], + simd_state.error[2]); + return NULL; + } + + assert(selected_simd < 3); + fs_visitor *selected = v[selected_simd].get(); + + if (!nir->info.workgroup_size_variable) + prog_data->prog_mask = 1 << selected_simd; + + fs_generator g(compiler, ¶ms->base, &prog_data->base, + selected->runtime_check_aads_emit, MESA_SHADER_COMPUTE); + if (unlikely(debug_enabled)) { + char *name = ralloc_asprintf(params->base.mem_ctx, + "%s compute shader %s", + nir->info.label ? 
+ nir->info.label : "unnamed", + nir->info.name); + g.enable_debug(name); + } + + uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1); + + struct brw_compile_stats *stats = params->base.stats; + for (unsigned simd = 0; simd < 3; simd++) { + if (prog_data->prog_mask & (1u << simd)) { + assert(v[simd]); + prog_data->prog_offset[simd] = + g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats, + v[simd]->performance_analysis.require(), stats); + if (stats) + stats->max_dispatch_width = max_dispatch_width; + stats = stats ? stats + 1 : NULL; + max_dispatch_width = 8u << simd; + } + } + + g.add_const_data(nir->constant_data, nir->constant_data_size); + + return g.get_assembly(); +} + +struct intel_cs_dispatch_info +brw_cs_get_dispatch_info(const struct intel_device_info *devinfo, + const struct brw_cs_prog_data *prog_data, + const unsigned *override_local_size) +{ + struct intel_cs_dispatch_info info = {}; + + const unsigned *sizes = + override_local_size ? 
override_local_size : + prog_data->local_size; + + const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes); + assert(simd >= 0 && simd < 3); + + info.group_size = sizes[0] * sizes[1] * sizes[2]; + info.simd_size = 8u << simd; + info.threads = DIV_ROUND_UP(info.group_size, info.simd_size); + + const uint32_t remainder = info.group_size & (info.simd_size - 1); + if (remainder > 0) + info.right_mask = ~0u >> (32 - remainder); + else + info.right_mask = ~0u >> (32 - info.simd_size); + + return info; +} + +static uint8_t +compile_single_bs(const struct brw_compiler *compiler, + struct brw_compile_bs_params *params, + const struct brw_bs_prog_key *key, + struct brw_bs_prog_data *prog_data, + nir_shader *shader, + fs_generator *g, + struct brw_compile_stats *stats, + int *prog_offset) +{ + const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT); + + prog_data->base.stage = shader->info.stage; + prog_data->max_stack_size = MAX2(prog_data->max_stack_size, + shader->scratch_size); + + const unsigned max_dispatch_width = 16; + brw_nir_apply_key(shader, compiler, &key->base, max_dispatch_width); + brw_postprocess_nir(shader, compiler, debug_enabled, + key->base.robust_flags); + + brw_simd_selection_state simd_state{ + .devinfo = compiler->devinfo, + .prog_data = prog_data, + + /* Since divergence is a lot more likely in RT than compute, it makes + * sense to limit ourselves to the smallest available SIMD for now. + */ + .required_width = compiler->devinfo->ver >= 20 ? 
16u : 8u, + }; + + std::unique_ptr v[2]; + + for (unsigned simd = 0; simd < ARRAY_SIZE(v); simd++) { + if (!brw_simd_should_compile(simd_state, simd)) + continue; + + const unsigned dispatch_width = 8u << simd; + + if (dispatch_width == 8 && compiler->devinfo->ver >= 20) + continue; + + v[simd] = std::make_unique(compiler, ¶ms->base, + &key->base, + &prog_data->base, shader, + dispatch_width, + stats != NULL, + debug_enabled); + + const bool allow_spilling = !brw_simd_any_compiled(simd_state); + if (v[simd]->run_bs(allow_spilling)) { + brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); + } else { + simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, + v[simd]->fail_msg); + if (simd > 0) { + brw_shader_perf_log(compiler, params->base.log_data, + "SIMD%u shader failed to compile: %s", + dispatch_width, v[simd]->fail_msg); + } + } + } + + const int selected_simd = brw_simd_select(simd_state); + if (selected_simd < 0) { + params->base.error_str = + ralloc_asprintf(params->base.mem_ctx, + "Can't compile shader: " + "SIMD8 '%s' and SIMD16 '%s'.\n", + simd_state.error[0], simd_state.error[1]); + return 0; + } + + assert(selected_simd < int(ARRAY_SIZE(v))); + fs_visitor *selected = v[selected_simd].get(); + assert(selected); + + const unsigned dispatch_width = selected->dispatch_width; + + int offset = g->generate_code(selected->cfg, dispatch_width, selected->shader_stats, + selected->performance_analysis.require(), stats); + if (prog_offset) + *prog_offset = offset; + else + assert(offset == 0); + + return dispatch_width; +} + +uint64_t +brw_bsr(const struct intel_device_info *devinfo, + uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset) +{ + assert(offset % 64 == 0); + assert(simd_size == 8 || simd_size == 16); + assert(local_arg_offset % 8 == 0); + + return offset | + SET_BITS(simd_size == 8, 4, 4) | + SET_BITS(local_arg_offset / 8, 2, 0); +} + +const unsigned * +brw_compile_bs(const struct brw_compiler *compiler, + struct 
brw_compile_bs_params *params) +{ + nir_shader *shader = params->base.nir; + struct brw_bs_prog_data *prog_data = params->prog_data; + unsigned num_resume_shaders = params->num_resume_shaders; + nir_shader **resume_shaders = params->resume_shaders; + const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT); + + prog_data->base.stage = shader->info.stage; + prog_data->base.ray_queries = shader->info.ray_queries; + prog_data->base.total_scratch = 0; + + prog_data->max_stack_size = 0; + prog_data->num_resume_shaders = num_resume_shaders; + + fs_generator g(compiler, ¶ms->base, &prog_data->base, + false, shader->info.stage); + if (unlikely(debug_enabled)) { + char *name = ralloc_asprintf(params->base.mem_ctx, + "%s %s shader %s", + shader->info.label ? + shader->info.label : "unnamed", + gl_shader_stage_name(shader->info.stage), + shader->info.name); + g.enable_debug(name); + } + + prog_data->simd_size = + compile_single_bs(compiler, params, params->key, prog_data, + shader, &g, params->base.stats, NULL); + if (prog_data->simd_size == 0) + return NULL; + + uint64_t *resume_sbt = ralloc_array(params->base.mem_ctx, + uint64_t, num_resume_shaders); + for (unsigned i = 0; i < num_resume_shaders; i++) { + if (INTEL_DEBUG(DEBUG_RT)) { + char *name = ralloc_asprintf(params->base.mem_ctx, + "%s %s resume(%u) shader %s", + shader->info.label ? + shader->info.label : "unnamed", + gl_shader_stage_name(shader->info.stage), + i, shader->info.name); + g.enable_debug(name); + } + + /* TODO: Figure out shader stats etc. for resume shaders */ + int offset = 0; + uint8_t simd_size = + compile_single_bs(compiler, params, params->key, + prog_data, resume_shaders[i], &g, NULL, &offset); + if (simd_size == 0) + return NULL; + + assert(offset > 0); + resume_sbt[i] = brw_bsr(compiler->devinfo, offset, simd_size, 0); + } + + /* We only have one constant data so we want to make sure they're all the + * same. 
+ */ + for (unsigned i = 0; i < num_resume_shaders; i++) { + assert(resume_shaders[i]->constant_data_size == + shader->constant_data_size); + assert(memcmp(resume_shaders[i]->constant_data, + shader->constant_data, + shader->constant_data_size) == 0); + } + + g.add_const_data(shader->constant_data, shader->constant_data_size); + g.add_resume_sbt(num_resume_shaders, resume_sbt); + + return g.get_assembly(); +} + +/** + * Test the dispatch mask packing assumptions of + * brw_stage_has_packed_dispatch(). Call this from e.g. the top of + * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is + * executed with an unexpected dispatch mask. + */ +static UNUSED void +brw_fs_test_dispatch_packing(const fs_builder &bld) +{ + const fs_visitor *shader = static_cast(bld.shader); + const gl_shader_stage stage = shader->stage; + const bool uses_vmask = + stage == MESA_SHADER_FRAGMENT && + brw_wm_prog_data(shader->stage_prog_data)->uses_vmask; + + if (brw_stage_has_packed_dispatch(shader->devinfo, stage, + shader->max_polygons, + shader->stage_prog_data)) { + const fs_builder ubld = bld.exec_all().group(1, 0); + const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0); + const fs_reg mask = uses_vmask ? brw_vmask_reg() : brw_dmask_reg(); + + ubld.ADD(tmp, mask, brw_imm_ud(1)); + ubld.AND(tmp, mask, tmp); + + /* This will loop forever if the dispatch mask doesn't have the expected + * form '2^n-1', in which case tmp will be non-zero. 
       */
      bld.emit(BRW_OPCODE_DO);
      bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
      set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
   }
}

/* Total invocation count of the local workgroup (product of the three
 * dimensions stored in the CS prog data).
 */
unsigned
fs_visitor::workgroup_size() const
{
   assert(gl_shader_stage_uses_workgroup(stage));
   const struct brw_cs_prog_data *cs = brw_cs_prog_data(prog_data);
   return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
}

/* Whether debug output should be printed for this shader: the requested
 * debug flag must be set, and internal shaders are skipped unless
 * NIR_DEBUG(PRINT_INTERNAL) is enabled.
 */
bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
{
   return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
}

namespace brw {
   /* Gather n consecutive payload values, whose fixed GRF numbers are given
    * in regs[], into a single VGRF of the requested type.  Returns a null
    * fs_reg when regs[0] is 0 (payload field absent).  For dispatch widths
    * above 16 the halves are combined with a LOAD_PAYLOAD in 16-wide groups.
    */
   fs_reg
   fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
                     brw_reg_type type, unsigned n)
   {
      if (!regs[0])
         return fs_reg();

      if (bld.dispatch_width() > 16) {
         const fs_reg tmp = bld.vgrf(type, n);
         const brw::fs_builder hbld = bld.exec_all().group(16, 0);
         const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
         fs_reg *const components = new fs_reg[m * n];

         /* Interleave: component c of group g comes from payload GRF regs[g]. */
         for (unsigned c = 0; c < n; c++) {
            for (unsigned g = 0; g < m; g++)
               components[c * m + g] =
                  offset(retype(brw_vec8_grf(regs[g], 0), type), hbld, c);
         }

         hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);

         delete[] components;
         return tmp;

      } else {
         /* <=SIMD16: the payload is already contiguous starting at regs[0]. */
         return fs_reg(retype(brw_vec8_grf(regs[0], 0), type));
      }
   }

   /* Gather a two-component barycentric payload pair into one float VGRF.
    * Pre-Xe2 the hardware interleaves the two components per SIMD8 group,
    * hence the regs[g / 2] / (g % 2) addressing below; on ver >= 20 the
    * layout matches the generic fetch_payload_reg() path.
    */
   fs_reg
   fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2])
   {
      if (!regs[0])
         return fs_reg();
      else if (bld.shader->devinfo->ver >= 20)
         return fetch_payload_reg(bld, regs, BRW_REGISTER_TYPE_F, 2);

      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
      const brw::fs_builder hbld = bld.exec_all().group(8, 0);
      const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
      fs_reg *const components = new fs_reg[2 * m];

      for (unsigned c = 0; c < 2; c++) {
         for (unsigned g = 0; g < m; g++)
            components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0),
                                           hbld, c + 2 * (g % 2));
      }

      hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);

      delete[] components;
      return tmp;
   }

   /* Emit an AND of the dynamic MSAA flags with \p flag that only sets the
    * conditional-modifier flag register (null destination), for use by a
    * subsequent predicated instruction.
    */
   void
   check_dynamic_msaa_flag(const fs_builder &bld,
                           const struct brw_wm_prog_data *wm_prog_data,
                           enum intel_msaa_flags flag)
   {
      fs_inst *inst = bld.AND(bld.null_reg_ud(),
                              dynamic_msaa_flags(wm_prog_data),
                              brw_imm_ud(flag));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}
diff --git a/src/intel/compiler/elk/brw_fs.h b/src/intel/compiler/elk/brw_fs.h
new file mode 100644
index 00000000000..0ee32403541
--- /dev/null
+++ b/src/intel/compiler/elk/brw_fs.h
@@ -0,0 +1,637 @@
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt
 *
 */

#ifndef BRW_FS_H
#define BRW_FS_H

#include "brw_shader.h"
#include "brw_ir_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_ir_performance.h"
#include "compiler/nir/nir.h"

struct bblock_t;
namespace {
   struct acp_entry;
}

class fs_visitor;

namespace brw {
   /**
    * Register pressure analysis of a shader. Estimates how many registers
    * are live at any point of the program in GRF units.
    */
   struct register_pressure {
      register_pressure(const fs_visitor *v);
      ~register_pressure();

      analysis_dependency_class
      dependency_class() const
      {
         return (DEPENDENCY_INSTRUCTION_IDENTITY |
                 DEPENDENCY_INSTRUCTION_DATA_FLOW |
                 DEPENDENCY_VARIABLES);
      }

      bool
      validate(const fs_visitor *) const
      {
         /* FINISHME */
         return true;
      }

      /* Estimated pressure at each instruction, indexed by IP. */
      unsigned *regs_live_at_ip;
   };
}

struct brw_gs_compile;

namespace brw {
class fs_builder;
}

/* Per-compile statistics reported alongside the generated code. */
struct shader_stats {
   const char *scheduler_mode;
   unsigned promoted_constants;
   unsigned spill_count;
   unsigned fill_count;
   unsigned max_register_pressure;
};

/** Register numbers for thread payload fields. */
struct thread_payload {
   /** The number of thread payload registers the hardware will supply.
void optimize(); + void allocate_registers(bool allow_spilling); + uint32_t compute_max_register_pressure(); + bool fixup_sends_duplicate_payload(); + void fixup_3src_null_dest(); + void emit_dummy_memory_fence_before_eot(); + void emit_dummy_mov_instruction(); + bool fixup_nomask_control_flow(); + void assign_curb_setup(); + void assign_urb_setup(); + void convert_attr_sources_to_hw_regs(fs_inst *inst); + void assign_vs_urb_setup(); + void assign_tcs_urb_setup(); + void assign_tes_urb_setup(); + void assign_gs_urb_setup(); + bool assign_regs(bool allow_spilling, bool spill_all); + void assign_regs_trivial(); + void calculate_payload_ranges(unsigned payload_node_count, + int *payload_last_use_ip) const; + bool split_virtual_grfs(); + bool compact_virtual_grfs(); + void assign_constant_locations(); + bool get_pull_locs(const fs_reg &src, unsigned *out_surf_index, + unsigned *out_pull_index); + bool lower_constant_loads(); + virtual void invalidate_analysis(brw::analysis_dependency_class c); + +#ifndef NDEBUG + void validate(); +#else + void validate() {} +#endif + + bool opt_algebraic(); + bool opt_redundant_halt(); + bool opt_cse(); + bool opt_cse_local(const brw::fs_live_variables &live, bblock_t *block, int &ip); + + bool opt_copy_propagation(); + bool opt_bank_conflicts(); + bool opt_split_sends(); + bool register_coalesce(); + bool compute_to_mrf(); + bool eliminate_find_live_channel(); + bool dead_code_eliminate(); + bool remove_duplicate_mrf_writes(); + bool remove_extra_rounding_modes(); + + fs_instruction_scheduler *prepare_scheduler(void *mem_ctx); + void schedule_instructions_pre_ra(fs_instruction_scheduler *sched, + instruction_scheduler_mode mode); + void schedule_instructions_post_ra(); + + void insert_gfx4_send_dependency_workarounds(); + void insert_gfx4_pre_send_dependency_workarounds(bblock_t *block, + fs_inst *inst); + void insert_gfx4_post_send_dependency_workarounds(bblock_t *block, + fs_inst *inst); + void vfail(const char *msg, va_list args); 
+ void fail(const char *msg, ...); + void limit_dispatch_width(unsigned n, const char *msg); + bool lower_uniform_pull_constant_loads(); + bool lower_load_payload(); + bool lower_pack(); + bool lower_regioning(); + bool lower_logical_sends(); + bool lower_integer_multiplication(); + bool lower_minmax(); + bool lower_simd_width(); + bool lower_barycentrics(); + bool lower_derivatives(); + bool lower_find_live_channel(); + bool lower_scoreboard(); + bool lower_sub_sat(); + bool opt_combine_constants(); + + void emit_repclear_shader(); + void emit_interpolation_setup_gfx4(); + void emit_interpolation_setup_gfx6(); + bool opt_peephole_sel(); + bool opt_saturate_propagation(); + bool opt_cmod_propagation(); + bool opt_zero_samples(); + + void set_tcs_invocation_id(); + + void emit_alpha_test(); + fs_inst *emit_single_fb_write(const brw::fs_builder &bld, + fs_reg color1, fs_reg color2, + fs_reg src0_alpha, unsigned components); + void do_emit_fb_writes(int nr_color_regions, bool replicate_alpha); + void emit_fb_writes(); + void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg()); + void emit_gs_control_data_bits(const fs_reg &vertex_count); + void emit_gs_thread_end(); + bool mark_last_urb_write_with_eot(); + void emit_tcs_thread_end(); + void emit_urb_fence(); + void emit_cs_terminate(); + + fs_reg interp_reg(const brw::fs_builder &bld, unsigned location, + unsigned channel, unsigned comp); + fs_reg per_primitive_reg(const brw::fs_builder &bld, + int location, unsigned comp); + + virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const; + virtual void dump_instructions_to_file(FILE *file) const; + + const brw_base_prog_key *const key; + const struct brw_sampler_prog_key_data *key_tex; + + struct brw_gs_compile *gs_compile; + + struct brw_stage_prog_data *prog_data; + + brw_analysis live_analysis; + brw_analysis regpressure_analysis; + brw_analysis performance_analysis; + + /** Number of uniform variable components visited. 
*/ + unsigned uniforms; + + /** Byte-offset for the next available spot in the scratch space buffer. */ + unsigned last_scratch; + + /** + * Array mapping UNIFORM register numbers to the push parameter index, + * or -1 if this uniform register isn't being uploaded as a push constant. + */ + int *push_constant_loc; + + fs_reg frag_depth; + fs_reg frag_stencil; + fs_reg sample_mask; + fs_reg outputs[VARYING_SLOT_MAX]; + fs_reg dual_src_output; + int first_non_payload_grf; + /** Either BRW_MAX_GRF or GFX7_MRF_HACK_START */ + unsigned max_grf; + + bool failed; + char *fail_msg; + + thread_payload *payload_; + + thread_payload &payload() { + return *this->payload_; + } + + vs_thread_payload &vs_payload() { + assert(stage == MESA_SHADER_VERTEX); + return *static_cast(this->payload_); + } + + tcs_thread_payload &tcs_payload() { + assert(stage == MESA_SHADER_TESS_CTRL); + return *static_cast(this->payload_); + } + + tes_thread_payload &tes_payload() { + assert(stage == MESA_SHADER_TESS_EVAL); + return *static_cast(this->payload_); + } + + gs_thread_payload &gs_payload() { + assert(stage == MESA_SHADER_GEOMETRY); + return *static_cast(this->payload_); + } + + fs_thread_payload &fs_payload() { + assert(stage == MESA_SHADER_FRAGMENT); + return *static_cast(this->payload_); + }; + + cs_thread_payload &cs_payload() { + assert(gl_shader_stage_uses_workgroup(stage)); + return *static_cast(this->payload_); + } + + task_mesh_thread_payload &task_mesh_payload() { + assert(stage == MESA_SHADER_TASK || stage == MESA_SHADER_MESH); + return *static_cast(this->payload_); + } + + bs_thread_payload &bs_payload() { + assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE); + return *static_cast(this->payload_); + } + + bool source_depth_to_render_target; + bool runtime_check_aads_emit; + + fs_reg pixel_x; + fs_reg pixel_y; + fs_reg pixel_z; + fs_reg wpos_w; + fs_reg pixel_w; + fs_reg delta_xy[BRW_BARYCENTRIC_MODE_COUNT]; + fs_reg final_gs_vertex_count; + fs_reg 
control_data_bits; + fs_reg invocation_id; + + unsigned grf_used; + bool spilled_any_registers; + bool needs_register_pressure; + + const unsigned dispatch_width; /**< 8, 16 or 32 */ + const unsigned max_polygons; + unsigned max_dispatch_width; + + /* The API selected subgroup size */ + unsigned api_subgroup_size; /**< 0, 8, 16, 32 */ + + struct shader_stats shader_stats; + + void lower_mul_dword_inst(fs_inst *inst, bblock_t *block); + void lower_mul_qword_inst(fs_inst *inst, bblock_t *block); + void lower_mulh_inst(fs_inst *inst, bblock_t *block); + + unsigned workgroup_size() const; + + void debug_optimizer(const nir_shader *nir, + const char *pass_name, + int iteration, int pass_num) const; +}; + +/** + * Return the flag register used in fragment shaders to keep track of live + * samples. On Gfx7+ we use f1.0-f1.1 to allow discard jumps in SIMD32 + * dispatch mode, while earlier generations are constrained to f0.1, which + * limits the dispatch width to SIMD16 for fragment shaders that use discard. + */ +static inline unsigned +sample_mask_flag_subreg(const fs_visitor &s) +{ + assert(s.stage == MESA_SHADER_FRAGMENT); + return s.devinfo->ver >= 7 ? 2 : 1; +} + +/** + * The fragment shader code generator. + * + * Translates FS IR to actual i965 assembly code. 
 */
class fs_generator
{
public:
   fs_generator(const struct brw_compiler *compiler,
                const struct brw_compile_params *params,
                struct brw_stage_prog_data *prog_data,
                bool runtime_check_aads_emit,
                gl_shader_stage stage);
   ~fs_generator();

   void enable_debug(const char *shader_name);
   /* Emits assembly for the whole CFG; returns the offset of the end of the
    * generated code.
    */
   int generate_code(const cfg_t *cfg, int dispatch_width,
                     struct shader_stats shader_stats,
                     const brw::performance &perf,
                     struct brw_compile_stats *stats,
                     unsigned max_polygons = 0);
   void add_const_data(void *data, unsigned size);
   void add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt);
   const unsigned *get_assembly();

private:
   /* Per-opcode emission helpers used by generate_code(). */
   void fire_fb_write(fs_inst *inst,
                      struct brw_reg payload,
                      struct brw_reg implied_header,
                      GLuint nr);
   void generate_send(fs_inst *inst,
                      struct brw_reg dst,
                      struct brw_reg desc,
                      struct brw_reg ex_desc,
                      struct brw_reg payload,
                      struct brw_reg payload2);
   void generate_fb_write(fs_inst *inst, struct brw_reg payload);
   void generate_fb_read(fs_inst *inst, struct brw_reg dst,
                         struct brw_reg payload);
   void generate_cs_terminate(fs_inst *inst, struct brw_reg payload);
   void generate_barrier(fs_inst *inst, struct brw_reg src);
   bool generate_linterp(fs_inst *inst, struct brw_reg dst,
                         struct brw_reg *src);
   void generate_tex(fs_inst *inst, struct brw_reg dst,
                     struct brw_reg surface_index,
                     struct brw_reg sampler_index);
   void generate_ddx(const fs_inst *inst,
                     struct brw_reg dst, struct brw_reg src);
   void generate_ddy(const fs_inst *inst,
                     struct brw_reg dst, struct brw_reg src);
   void generate_scratch_write(fs_inst *inst, struct brw_reg src);
   void generate_scratch_read(fs_inst *inst, struct brw_reg dst);
   void generate_scratch_read_gfx7(fs_inst *inst, struct brw_reg dst);
   void generate_scratch_header(fs_inst *inst, struct brw_reg dst);
   void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
                                            struct brw_reg index,
                                            struct brw_reg offset);
   void generate_varying_pull_constant_load_gfx4(fs_inst *inst,
                                                 struct brw_reg dst,
                                                 struct brw_reg index);

   void generate_set_sample_id(fs_inst *inst,
                               struct brw_reg dst,
                               struct brw_reg src0,
                               struct brw_reg src1);

   void generate_halt(fs_inst *inst);

   void generate_mov_indirect(fs_inst *inst,
                              struct brw_reg dst,
                              struct brw_reg reg,
                              struct brw_reg indirect_byte_offset);

   void generate_shuffle(fs_inst *inst,
                         struct brw_reg dst,
                         struct brw_reg src,
                         struct brw_reg idx);

   void generate_quad_swizzle(const fs_inst *inst,
                              struct brw_reg dst, struct brw_reg src,
                              unsigned swiz);

   bool patch_halt_jumps();

   const struct brw_compiler *compiler;
   const struct brw_compile_params *params;

   const struct intel_device_info *devinfo;

   struct brw_codegen *p;
   struct brw_stage_prog_data * const prog_data;

   unsigned dispatch_width; /**< 8, 16 or 32 */

   exec_list discard_halt_patches;
   bool runtime_check_aads_emit;
   bool debug_flag;
   const char *shader_name;
   gl_shader_stage stage;
   void *mem_ctx;
};

namespace brw {
   fs_reg
   fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
                     brw_reg_type type = BRW_REGISTER_TYPE_F,
                     unsigned n = 1);

   fs_reg
   fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2]);

   /* Uniform holding the dynamic per-draw MSAA configuration flags. */
   inline fs_reg
   dynamic_msaa_flags(const struct brw_wm_prog_data *wm_prog_data)
   {
      return fs_reg(UNIFORM, wm_prog_data->msaa_flags_param,
                    BRW_REGISTER_TYPE_UD);
   }

   void
   check_dynamic_msaa_flag(const fs_builder &bld,
                           const struct brw_wm_prog_data *wm_prog_data,
                           enum intel_msaa_flags flag);

   bool
   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i);
}

void shuffle_from_32bit_read(const brw::fs_builder &bld,
                             const fs_reg &dst,
                             const fs_reg &src,
                             uint32_t first_component,
                             uint32_t components);

fs_reg setup_imm_df(const brw::fs_builder &bld,
                    double v);

fs_reg setup_imm_b(const brw::fs_builder &bld,
                   int8_t v);

fs_reg setup_imm_ub(const
brw::fs_builder &bld, + uint8_t v); + +enum brw_barycentric_mode brw_barycentric_mode(nir_intrinsic_instr *intr); + +uint32_t brw_fb_write_msg_control(const fs_inst *inst, + const struct brw_wm_prog_data *prog_data); + +void brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data); + +bool brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width); + +fs_reg brw_sample_mask_reg(const brw::fs_builder &bld); +void brw_emit_predicate_on_sample_mask(const brw::fs_builder &bld, fs_inst *inst); + +int brw_get_subgroup_id_param_index(const intel_device_info *devinfo, + const brw_stage_prog_data *prog_data); + +bool brw_lower_dpas(fs_visitor &v); + +void nir_to_brw(fs_visitor *s); + +#endif /* BRW_FS_H */ diff --git a/src/intel/compiler/elk/brw_fs_bank_conflicts.cpp b/src/intel/compiler/elk/brw_fs_bank_conflicts.cpp new file mode 100644 index 00000000000..8505748b0f8 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_bank_conflicts.cpp @@ -0,0 +1,955 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_bank_conflicts.cpp + * + * This file contains a GRF bank conflict mitigation pass. The pass is + * intended to be run after register allocation and works by rearranging the + * layout of the GRF space (without altering the semantics of the program) in + * a way that minimizes the number of GRF bank conflicts incurred by ternary + * instructions. + * + * Unfortunately there is close to no information about bank conflicts in the + * hardware spec, but experimentally on Gfx7-Gfx9 ternary instructions seem to + * incur an average bank conflict penalty of one cycle per SIMD8 op whenever + * the second and third source are stored in the same GRF bank (\sa bank_of() + * for the exact bank layout) which cannot be fetched during the same cycle by + * the EU, unless the EU logic manages to optimize out the read cycle of a + * duplicate source register (\sa is_conflict_optimized_out()). + * + * The asymptotic run-time of the algorithm is dominated by the + * shader_conflict_weight_matrix() computation below, which is O(n) on the + * number of instructions in the program, however for small and medium-sized + * programs the run-time is likely to be dominated by + * optimize_reg_permutation() which is O(m^3) on the number of GRF atoms of + * the program (\sa partitioning), which is bounded (since the program uses a + * bounded number of registers post-regalloc) and of the order of 100. For + * that reason optimize_reg_permutation() is vectorized in order to keep the + * cubic term within reasonable bounds for m close to its theoretical maximum. 
+ */ + +#include "brw_fs.h" +#include "brw_cfg.h" + +#ifdef __SSE2__ + +#include + +/** + * Thin layer around vector intrinsics so they can be easily replaced with + * e.g. the fall-back scalar path, an implementation with different vector + * width or using different SIMD architectures (AVX-512?!). + * + * This implementation operates on pairs of independent SSE2 integer vectors à + * la SIMD16 for somewhat improved throughput. SSE2 is supported by virtually + * all platforms that care about bank conflicts, so this path should almost + * always be available in practice. + */ +namespace { + /** + * SIMD integer vector data type. + */ + struct vector_type { + __m128i v[2]; + }; + + /** + * Scalar data type matching the representation of a single component of \p + * vector_type. + */ + typedef int16_t scalar_type; + + /** + * Maximum integer value representable as a \p scalar_type. + */ + const scalar_type max_scalar = INT16_MAX; + + /** + * Number of components of a \p vector_type. + */ + const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type); + + /** + * Set the i-th component of vector \p v to \p x. + */ + void + set(vector_type &v, unsigned i, scalar_type x) + { + assert(i < vector_width); + memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x)); + } + + /** + * Get the i-th component of vector \p v. + */ + scalar_type + get(const vector_type &v, unsigned i) + { + assert(i < vector_width); + scalar_type x; + memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x)); + return x; + } + + /** + * Add two vectors with saturation. + */ + vector_type + adds(const vector_type &v, const vector_type &w) + { + const vector_type u = {{ + _mm_adds_epi16(v.v[0], w.v[0]), + _mm_adds_epi16(v.v[1], w.v[1]) + }}; + return u; + } + + /** + * Subtract two vectors with saturation. 
+ */ + vector_type + subs(const vector_type &v, const vector_type &w) + { + const vector_type u = {{ + _mm_subs_epi16(v.v[0], w.v[0]), + _mm_subs_epi16(v.v[1], w.v[1]) + }}; + return u; + } + + /** + * Compute the bitwise conjunction of two vectors. + */ + vector_type + mask(const vector_type &v, const vector_type &w) + { + const vector_type u = {{ + _mm_and_si128(v.v[0], w.v[0]), + _mm_and_si128(v.v[1], w.v[1]) + }}; + return u; + } + + /** + * Reduce the components of a vector using saturating addition. + */ + scalar_type + sums(const vector_type &v) + { + const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]); + const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e)); + const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1)); + const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1)); + return _mm_extract_epi16(v1, 0); + } +} + +#else + +/** + * Thin layer around vector intrinsics so they can be easily replaced with + * e.g. the fall-back scalar path, an implementation with different vector + * width or using different SIMD architectures (AVX-512?!). + * + * This implementation operates on scalar values and doesn't rely on + * any vector extensions. This is mainly intended for debugging and + * to keep this file building on exotic platforms. + */ +namespace { + /** + * SIMD integer vector data type. + */ + typedef int16_t vector_type; + + /** + * Scalar data type matching the representation of a single component of \p + * vector_type. + */ + typedef int16_t scalar_type; + + /** + * Maximum integer value representable as a \p scalar_type. + */ + const scalar_type max_scalar = INT16_MAX; + + /** + * Number of components of a \p vector_type. + */ + const unsigned vector_width = 1; + + /** + * Set the i-th component of vector \p v to \p x. + */ + void + set(vector_type &v, unsigned i, scalar_type x) + { + assert(i < vector_width); + v = x; + } + + /** + * Get the i-th component of vector \p v. 
+ */ + scalar_type + get(const vector_type &v, unsigned i) + { + assert(i < vector_width); + return v; + } + + /** + * Add two vectors with saturation. + */ + vector_type + adds(vector_type v, vector_type w) + { + return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w)); + } + + /** + * Subtract two vectors with saturation. + */ + vector_type + subs(vector_type v, vector_type w) + { + return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w)); + } + + /** + * Compute the bitwise conjunction of two vectors. + */ + vector_type + mask(vector_type v, vector_type w) + { + return v & w; + } + + /** + * Reduce the components of a vector using saturating addition. + */ + scalar_type + sums(vector_type v) + { + return v; + } +} + +#endif + +/** + * Swap \p x and \p y. + */ +#define SWAP(x, y) do { \ + __typeof(y) _swap_tmp = y; \ + y = x; \ + x = _swap_tmp; \ + } while (0) + +namespace { + /** + * Variable-length vector type intended to represent cycle-count costs for + * arbitrary atom-to-bank assignments. It's indexed by a pair of integers + * (i, p), where i is an atom index and p in {0, 1} indicates the parity of + * the conflict (respectively, whether the cost is incurred whenever the + * atoms are assigned the same bank b or opposite-parity banks b and b^1). 
+ * \sa shader_conflict_weight_matrix() + */ + struct weight_vector_type { + weight_vector_type() : v(NULL), size(0) {} + + weight_vector_type(unsigned n) : v(alloc(n)), size(n) {} + + weight_vector_type(const weight_vector_type &u) : + v(alloc(u.size)), size(u.size) + { + memcpy(v, u.v, + DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type)); + } + + ~weight_vector_type() + { + free(v); + } + + weight_vector_type & + operator=(weight_vector_type u) + { + SWAP(v, u.v); + SWAP(size, u.size); + return *this; + } + + vector_type *v; + unsigned size; + + private: + static vector_type * + alloc(unsigned n) + { + const unsigned align = MAX2(sizeof(void *), __alignof__(vector_type)); + const unsigned size = DIV_ROUND_UP(n, vector_width) * sizeof(vector_type); + void *p; + if (posix_memalign(&p, align, size)) + return NULL; + memset(p, 0, size); + return reinterpret_cast(p); + } + }; + + /** + * Set the (i, p)-th component of weight vector \p v to \p x. + */ + void + set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x) + { + set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x); + } + + /** + * Get the (i, p)-th component of weight vector \p v. + */ + scalar_type + get(const weight_vector_type &v, unsigned i, unsigned p) + { + return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width); + } + + /** + * Swap the (i, p)-th and (j, q)-th components of weight vector \p v. + */ + void + swap(weight_vector_type &v, + unsigned i, unsigned p, + unsigned j, unsigned q) + { + const scalar_type tmp = get(v, i, p); + set(v, i, p, get(v, j, q)); + set(v, j, q, tmp); + } +} + +namespace { + /** + * Object that represents the partitioning of an arbitrary register space + * into indivisible units (referred to as atoms below) that can potentially + * be rearranged independently from other registers. The partitioning is + * inferred from a number of contiguity requirements specified using + * require_contiguous(). 
This allows efficient look-up of the atom index a + * given register address belongs to, or conversely the range of register + * addresses that belong to a given atom. + */ + struct partitioning { + /** + * Create a (for the moment unrestricted) partitioning of a register + * file of size \p n. The units are arbitrary. + */ + partitioning(unsigned n) : + max_reg(n), + offsets(new unsigned[n + num_terminator_atoms]), + atoms(new unsigned[n + num_terminator_atoms]) + { + for (unsigned i = 0; i < n + num_terminator_atoms; i++) { + offsets[i] = i; + atoms[i] = i; + } + } + + partitioning(const partitioning &p) : + max_reg(p.max_reg), + offsets(new unsigned[p.num_atoms() + num_terminator_atoms]), + atoms(new unsigned[p.max_reg + num_terminator_atoms]) + { + memcpy(offsets, p.offsets, + sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms)); + memcpy(atoms, p.atoms, + sizeof(unsigned) * (p.max_reg + num_terminator_atoms)); + } + + ~partitioning() + { + delete[] offsets; + delete[] atoms; + } + + partitioning & + operator=(partitioning p) + { + SWAP(max_reg, p.max_reg); + SWAP(offsets, p.offsets); + SWAP(atoms, p.atoms); + return *this; + } + + /** + * Require register range [reg, reg + n[ to be considered part of the + * same atom. + */ + void + require_contiguous(unsigned reg, unsigned n) + { + unsigned r = atoms[reg]; + + /* Renumber atoms[reg...] = { r... } and their offsets[r...] for the + * case that the specified contiguity requirement leads to the fusion + * (yay) of one or more existing atoms. + */ + for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) { + if (offsets[atoms[reg1]] < reg + n) { + atoms[reg1] = r; + } else { + if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]]) + r++; + + offsets[r] = offsets[atoms[reg1]]; + atoms[reg1] = r; + } + } + } + + /** + * Get the atom index register address \p reg belongs to. 
+ */ + unsigned + atom_of_reg(unsigned reg) const + { + return atoms[reg]; + } + + /** + * Get the base register address that belongs to atom \p r. + */ + unsigned + reg_of_atom(unsigned r) const + { + return offsets[r]; + } + + /** + * Get the size of atom \p r in register address units. + */ + unsigned + size_of_atom(unsigned r) const + { + assert(r < num_atoms()); + return reg_of_atom(r + 1) - reg_of_atom(r); + } + + /** + * Get the number of atoms the whole register space is partitioned into. + */ + unsigned + num_atoms() const + { + return atoms[max_reg]; + } + + private: + /** + * Number of trailing atoms inserted for convenience so among other + * things we don't need to special-case the last element in + * size_of_atom(). + */ + static const unsigned num_terminator_atoms = 1; + unsigned max_reg; + unsigned *offsets; + unsigned *atoms; + }; + + /** + * Only GRF sources (whether they have been register-allocated or not) can + * possibly incur bank conflicts. + */ + bool + is_grf(const fs_reg &r) + { + return r.file == VGRF || r.file == FIXED_GRF; + } + + /** + * Register offset of \p r in GRF units. Useful because the representation + * of GRFs post-register allocation is somewhat inconsistent and depends on + * whether the register already had a fixed GRF offset prior to register + * allocation or whether it was part of a VGRF allocation. + */ + unsigned + reg_of(const fs_reg &r) + { + assert(is_grf(r)); + if (r.file == VGRF) + return r.nr + r.offset / REG_SIZE; + else + return reg_offset(r) / REG_SIZE; + } + + /** + * Calculate the finest partitioning of the GRF space compatible with the + * register contiguity requirements derived from all instructions part of + * the program. 
    */
   partitioning
   shader_reg_partitioning(const fs_visitor *v)
   {
      partitioning p(BRW_MAX_GRF);

      /* Every write and read of a GRF range fuses that range into one atom. */
      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (is_grf(inst->dst))
            p.require_contiguous(reg_of(inst->dst), regs_written(inst));

         for (int i = 0; i < inst->sources; i++) {
            if (is_grf(inst->src[i]))
               p.require_contiguous(reg_of(inst->src[i]), regs_read(inst, i));
         }
      }

      return p;
   }

   /**
    * Return the set of GRF atoms that should be left untouched at their
    * original location to avoid violating hardware or software assumptions.
    * The caller owns the returned heap array of p.num_atoms() flags.
    */
   bool *
   shader_reg_constraints(const fs_visitor *v, const partitioning &p)
   {
      bool *constrained = new bool[p.num_atoms()]();

      /* These are read implicitly by some send-message instructions without
       * any indication at the IR level.  Assume they are unsafe to move
       * around.
       */
      for (unsigned reg = 0; reg < 2; reg++)
         constrained[p.atom_of_reg(reg)] = true;

      /* At Intel Broadwell PRM, vol 07, section "Instruction Set Reference",
       * subsection "EUISA Instructions", Send Message (page 990):
       *
       * "r127 must not be used for return address when there is a src and
       * dest overlap in send instruction."
       *
       * Register allocation ensures that, so don't move 127 around to avoid
       * breaking that property.
       */
      if (v->devinfo->ver >= 8)
         constrained[p.atom_of_reg(127)] = true;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         /* Assume that anything referenced via fixed GRFs is baked into the
          * hardware's fixed-function logic and may be unsafe to move around.
          * Also take into account the source GRF restrictions of EOT
          * send-message instructions.
          */
         if (inst->dst.file == FIXED_GRF)
            constrained[p.atom_of_reg(reg_of(inst->dst))] = true;

         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file == FIXED_GRF ||
                (is_grf(inst->src[i]) && inst->eot))
               constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
         }

         /* Preserve the original allocation of VGRFs used by the barycentric
          * source of the LINTERP instruction on Gfx6, since pair-aligned
          * barycentrics allow the PLN instruction to be used.
          */
         if (v->devinfo->has_pln && v->devinfo->ver <= 6 &&
             inst->opcode == FS_OPCODE_LINTERP)
            constrained[p.atom_of_reg(reg_of(inst->src[0]))] = true;

         /* The location of the Gfx7 MRF hack registers is hard-coded in the
          * rest of the compiler back-end.  Don't attempt to move them around.
          */
         if (v->devinfo->ver >= 7) {
            assert(inst->dst.file != MRF);

            for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
               const unsigned reg = GFX7_MRF_HACK_START + inst->base_mrf + i;
               constrained[p.atom_of_reg(reg)] = true;
            }
         }
      }

      return constrained;
   }

   /**
    * Return whether the hardware will be able to prevent a bank conflict by
    * optimizing out the read cycle of a source register.  The formula was
    * found experimentally.
    */
   bool
   is_conflict_optimized_out(const intel_device_info *devinfo,
                             const fs_inst *inst)
   {
      /* Gfx9+ skips the read cycle when a source register is duplicated
       * among src0/src1/src2 of a ternary instruction.
       */
      return devinfo->ver >= 9 &&
             ((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
                                        reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
              reg_of(inst->src[1]) == reg_of(inst->src[2]));
   }

   /**
    * Return a matrix that allows reasonably efficient computation of the
    * cycle-count cost of bank conflicts incurred throughout the whole program
    * for any given atom-to-bank assignment.
    *
    * More precisely, if C_r_s_p is the result of this function, the total
    * cost of all bank conflicts involving any given atom r can be readily
    * recovered as follows:
    *
    *  S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
    *
    * where d_i_j is the Kronecker delta, and B_r indicates the bank
    * assignment of r.  \sa delta_conflicts() for a vectorized implementation
    * of the expression above.
    *
    * FINISHME: Teach this about the Gfx10+ bank conflict rules, which are
    *           somewhat more relaxed than on previous generations.  In the
    *           meantime optimizing based on Gfx9 weights is likely to be more
    *           helpful than not optimizing at all.
    */
   weight_vector_type *
   shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
   {
      weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
      for (unsigned r = 0; r < p.num_atoms(); r++)
         conflicts[r] = weight_vector_type(2 * p.num_atoms());

      /* Crude approximation of the number of times the current basic block
       * will be executed at run-time.
       */
      unsigned block_scale = 1;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (inst->opcode == BRW_OPCODE_DO) {
            /* Entering a loop: weight its body 10x more heavily. */
            block_scale *= 10;

         } else if (inst->opcode == BRW_OPCODE_WHILE) {
            block_scale /= 10;

         } else if (inst->is_3src(v->compiler) &&
                    is_grf(inst->src[1]) && is_grf(inst->src[2])) {
            const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
            const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));

            /* Estimate of the cycle-count cost of incurring a bank conflict
             * for this instruction.  This is only true on the average, for a
             * sequence of back-to-back ternary instructions, since the EU
             * front-end only seems to be able to issue a new instruction at
             * an even cycle.  The cost of a bank conflict incurred by an
             * isolated ternary instruction may be higher.
             */
            const unsigned exec_size = inst->dst.component_size(inst->exec_size);
            const unsigned cycle_scale = block_scale * DIV_ROUND_UP(exec_size,
                                                                    REG_SIZE);

            /* Neglect same-atom conflicts (since they're either trivial or
             * impossible to avoid without splitting the atom), and conflicts
             * known to be optimized out by the hardware.
             */
            if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
               /* Calculate the parity of the sources relative to the start of
                * their respective atoms.  If their parity is the same (and
                * none of the atoms straddle the 2KB mark), the instruction
                * will incur a conflict iff both atoms are assigned the same
                * bank b.  If their parity is opposite, the instruction will
                * incur a conflict iff they are assigned opposite banks (b and
                * b^1).
                */
               const bool p_r = 1 & (reg_of(inst->src[1]) - p.reg_of_atom(r));
               const bool p_s = 1 & (reg_of(inst->src[2]) - p.reg_of_atom(s));
               const unsigned p = p_r ^ p_s;

               /* Calculate the updated cost of a hypothetical conflict
                * between atoms r and s.  Note that the weight matrix is
                * symmetric with respect to indices r and s by construction.
                */
               const scalar_type w = MIN2(unsigned(max_scalar),
                                          get(conflicts[r], s, p) + cycle_scale);
               set(conflicts[r], s, p, w);
               set(conflicts[s], r, p, w);
            }
         }
      }

      return conflicts;
   }

   /**
    * Return the set of GRF atoms that could potentially lead to bank
    * conflicts if laid out unfavorably in the GRF space according to
    * the specified \p conflicts matrix (\sa
    * shader_conflict_weight_matrix()).
+ */ + bool * + have_any_conflicts(const partitioning &p, + const weight_vector_type *conflicts) + { + bool *any_conflicts = new bool[p.num_atoms()](); + + for (unsigned r = 0; r < p.num_atoms(); r++) { + const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width); + for (unsigned s = 0; s < m; s++) + any_conflicts[r] |= sums(conflicts[r].v[s]); + } + + return any_conflicts; + } + + /** + * Calculate the difference between two S(B) cost estimates as defined + * above (\sa shader_conflict_weight_matrix()). This represents the + * (partial) cycle-count benefit from moving an atom r from bank p to n. + * The respective bank assignments Bp and Bn are encoded as the \p + * bank_mask_p and \p bank_mask_n bitmasks for efficient computation, + * according to the formula: + * + * bank_mask(B)_s_p = -d_(p^B_r)_(B_s) + * + * Notice the similarity with the delta function in the S(B) expression + * above, and how bank_mask(B) can be precomputed for every possible + * selection of r since bank_mask(B) only depends on it via B_r that may + * only assume one of four different values, so the caller can keep every + * possible bank_mask(B) vector in memory without much hassle (\sa + * bank_characteristics()). + */ + int + delta_conflicts(const weight_vector_type &bank_mask_p, + const weight_vector_type &bank_mask_n, + const weight_vector_type &conflicts) + { + const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width); + vector_type s_p = {}, s_n = {}; + + for (unsigned r = 0; r < m; r++) { + s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r])); + s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r])); + } + + return sums(subs(s_p, s_n)); + } + + /** + * Register atom permutation, represented as the start GRF offset each atom + * is mapped into. 
 */
   struct permutation {
      permutation() : v(NULL), size(0) {}

      /* Allocate a zero-initialized mapping for \p n atoms (note the
       * value-initializing "()" on the array new).
       */
      permutation(unsigned n) :
         v(new unsigned[n]()), size(n) {}

      permutation(const permutation &p) :
         v(new unsigned[p.size]), size(p.size)
      {
         memcpy(v, p.v, p.size * sizeof(unsigned));
      }

      ~permutation()
      {
         delete[] v;
      }

      /* Copy-and-swap assignment: \p p is taken by value, so swapping with
       * it both installs the new contents and lets p's destructor release
       * this object's previous storage.  Also makes self-assignment safe.
       */
      permutation &
      operator=(permutation p)
      {
         SWAP(v, p.v);
         SWAP(size, p.size);
         return *this;
      }

      /* v[r] is the start GRF offset atom r is mapped to. */
      unsigned *v;
      unsigned size;
   };

   /**
    * Return an identity permutation of GRF atoms.
    */
   permutation
   identity_reg_permutation(const partitioning &p)
   {
      permutation map(p.num_atoms());

      for (unsigned r = 0; r < map.size; r++)
         map.v[r] = p.reg_of_atom(r);

      return map;
   }

   /**
    * Return the bank index of GRF address \p reg, numbered according to the
    * table:
    *            Even     Odd
    *    Lo       0        1
    *    Hi       2        3
    */
   unsigned
   bank_of(unsigned reg)
   {
      /* Bit 6 of the GRF number selects the Lo/Hi half (contributing 2),
       * bit 0 selects even/odd (contributing 1), per the table above.
       */
      return (reg & 0x40) >> 5 | (reg & 1);
   }

   /**
    * Return bitmasks suitable for use as bank mask arguments for the
    * delta_conflicts() computation.  Note that this is just the (negative)
    * characteristic function of each bank, if you regard it as a set
    * containing all atoms assigned to it according to the \p map array.
    * Caller owns the returned array of four vectors (delete[]).
    */
   weight_vector_type *
   bank_characteristics(const permutation &map)
   {
      weight_vector_type *banks = new weight_vector_type[4];

      for (unsigned b = 0; b < 4; b++) {
         banks[b] = weight_vector_type(2 * map.size);

         for (unsigned j = 0; j < map.size; j++) {
            /* Entry (j, p) is all-ones (-1) iff atom j at parity p belongs
             * to bank b, matching the -d_(p^B_r)_(B_s) formula above.
             */
            for (unsigned p = 0; p < 2; p++)
               set(banks[b], j, p,
                   (b ^ p) == bank_of(map.v[j]) ? -1 : 0);
         }
      }

      return banks;
   }

   /**
    * Return an improved permutation of GRF atoms based on \p map attempting
    * to reduce the total cycle-count cost of bank conflicts greedily.
    *
    * Note that this doesn't attempt to merge multiple atoms into one, which
    * may allow it to do a better job in some cases -- It simply reorders
    * existing atoms in the GRF space without affecting their identity.
 */
   permutation
   optimize_reg_permutation(const partitioning &p,
                            const bool *constrained,
                            const weight_vector_type *conflicts,
                            permutation map)
   {
      const bool *any_conflicts = have_any_conflicts(p, conflicts);
      weight_vector_type *banks = bank_characteristics(map);

      for (unsigned r = 0; r < map.size; r++) {
         const unsigned bank_r = bank_of(map.v[r]);

         if (!constrained[r]) {
            /* Greedily look for the swap partner s whose exchange with r
             * yields the largest positive cycle-count benefit.
             */
            unsigned best_s = r;
            int best_benefit = 0;

            for (unsigned s = 0; s < map.size; s++) {
               const unsigned bank_s = bank_of(map.v[s]);

               /* Only consider swaps between unconstrained, equally sized
                * atoms in different banks where at least one side has any
                * conflict weight at all.
                */
               if (bank_r != bank_s && !constrained[s] &&
                   p.size_of_atom(r) == p.size_of_atom(s) &&
                   (any_conflicts[r] || any_conflicts[s])) {
                  const int benefit =
                     delta_conflicts(banks[bank_r], banks[bank_s], conflicts[r]) +
                     delta_conflicts(banks[bank_s], banks[bank_r], conflicts[s]);

                  if (benefit > best_benefit) {
                     best_s = s;
                     best_benefit = benefit;
                  }
               }
            }

            if (best_s != r) {
               /* Keep the bank characteristic vectors in sync with the swap
                * so later iterations see the updated assignment.
                */
               for (unsigned b = 0; b < 4; b++) {
                  for (unsigned p = 0; p < 2; p++)
                     swap(banks[b], r, p, best_s, p);
               }

               SWAP(map.v[r], map.v[best_s]);
            }
         }
      }

      delete[] banks;
      delete[] any_conflicts;
      return map;
   }

   /**
    * Apply the GRF atom permutation given by \p map to register \p r and
    * return the result.
    */
   fs_reg
   transform(const partitioning &p, const permutation &map, fs_reg r)
   {
      if (r.file == VGRF) {
         /* Rebase the register on the new location of its atom, keeping its
          * offset within the atom; only the sub-REG_SIZE part of the offset
          * survives since the whole-register part moved into nr.
          */
         const unsigned reg = reg_of(r);
         const unsigned s = p.atom_of_reg(reg);
         r.nr = map.v[s] + reg - p.reg_of_atom(s);
         r.offset = r.offset % REG_SIZE;
      }

      return r;
   }
}

bool
fs_visitor::opt_bank_conflicts()
{
   assert(grf_used || !"Must be called after register allocation");

   /* TODO: Re-work this pass for Gfx20+. */
   /* NOTE(review): this check looks vestigial from the shared brw source --
    * confirm whether the elk fork can ever see devinfo->ver >= 20.
    */
   if (devinfo->ver >= 20)
      return false;

   /* No ternary instructions -- No bank conflicts.
 */
   if (devinfo->ver < 6)
      return false;

   const partitioning p = shader_reg_partitioning(this);
   const bool *constrained = shader_reg_constraints(this, p);
   const weight_vector_type *conflicts =
      shader_conflict_weight_matrix(this, p);
   const permutation map =
      optimize_reg_permutation(p, constrained, conflicts,
                               identity_reg_permutation(p));

   /* Rewrite every destination and source register according to the
    * optimized atom permutation.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      inst->dst = transform(p, map, inst->dst);

      for (int i = 0; i < inst->sources; i++)
         inst->src[i] = transform(p, map, inst->src[i]);
   }

   delete[] conflicts;
   delete[] constrained;
   return true;
}

/**
 * Return whether the instruction incurs GRF bank conflict cycles.
 *
 * Note that this is only accurate after register allocation because otherwise
 * we don't know which bank each VGRF is going to end up aligned to.
 */
bool
has_bank_conflict(const struct brw_isa_info *isa, const fs_inst *inst)
{
   /* A conflict requires a ternary instruction whose GRF sources 1 and 2
    * land in the same bank and isn't already optimized out by the hardware.
    */
   return is_3src(isa, inst->opcode) &&
          is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
          bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
          !is_conflict_optimized_out(isa->devinfo, inst);
}
diff --git a/src/intel/compiler/elk/brw_fs_builder.h b/src/intel/compiler/elk/brw_fs_builder.h
new file mode 100644
index 00000000000..63244f0b75b
--- /dev/null
+++ b/src/intel/compiler/elk/brw_fs_builder.h
@@ -0,0 +1,965 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2010-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph)
shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_FS_BUILDER_H +#define BRW_FS_BUILDER_H + +#include "brw_ir_fs.h" +#include "brw_shader.h" +#include "brw_eu.h" +#include "brw_fs.h" + +namespace brw { + /** + * Toolbox to assemble an FS IR program out of individual instructions. + * + * This object is meant to have an interface consistent with + * brw::vec4_builder. They cannot be fully interchangeable because + * brw::fs_builder generates scalar code while brw::vec4_builder generates + * vector code. + */ + class fs_builder { + public: + /** Type used in this IR to represent a source of an instruction. */ + typedef fs_reg src_reg; + + /** Type used in this IR to represent the destination of an instruction. */ + typedef fs_reg dst_reg; + + /** Type used in this IR to represent an instruction. */ + typedef fs_inst instruction; + + /** + * Construct an fs_builder that inserts instructions into \p shader. + * \p dispatch_width gives the native execution width of the program. + */ + fs_builder(fs_visitor *shader, + unsigned dispatch_width) : + shader(shader), block(NULL), cursor(NULL), + _dispatch_width(dispatch_width), + _group(0), + force_writemask_all(false), + annotation() + { + } + + explicit fs_builder(fs_visitor *s) : fs_builder(s, s->dispatch_width) {} + + /** + * Construct an fs_builder that inserts instructions into \p shader + * before instruction \p inst in basic block \p block. 
The default + * execution controls and debug annotation are initialized from the + * instruction passed as argument. + */ + fs_builder(fs_visitor *shader, bblock_t *block, fs_inst *inst) : + shader(shader), block(block), cursor(inst), + _dispatch_width(inst->exec_size), + _group(inst->group), + force_writemask_all(inst->force_writemask_all) + { + annotation.str = inst->annotation; + annotation.ir = inst->ir; + } + + /** + * Construct an fs_builder that inserts instructions before \p cursor in + * basic block \p block, inheriting other code generation parameters + * from this. + */ + fs_builder + at(bblock_t *block, exec_node *cursor) const + { + fs_builder bld = *this; + bld.block = block; + bld.cursor = cursor; + return bld; + } + + /** + * Construct an fs_builder appending instructions at the end of the + * instruction list of the shader, inheriting other code generation + * parameters from this. + */ + fs_builder + at_end() const + { + return at(NULL, (exec_node *)&shader->instructions.tail_sentinel); + } + + /** + * Construct a builder specifying the default SIMD width and group of + * channel enable signals, inheriting other code generation parameters + * from this. + * + * \p n gives the default SIMD width, \p i gives the slot group used for + * predication and control flow masking in multiples of \p n channels. + */ + fs_builder + group(unsigned n, unsigned i) const + { + fs_builder bld = *this; + + if (n <= dispatch_width() && i < dispatch_width() / n) { + bld._group += i * n; + } else { + /* The requested channel group isn't a subset of the channel group + * of this builder, which means that the resulting instructions + * would use (potentially undefined) channel enable signals not + * specified by the parent builder. 
That's only valid if the + * instruction doesn't have per-channel semantics, in which case + * we should clear off the default group index in order to prevent + * emitting instructions with channel group not aligned to their + * own execution size. + */ + assert(force_writemask_all); + bld._group = 0; + } + + bld._dispatch_width = n; + return bld; + } + + /** + * Alias for group() with width equal to eight. + */ + fs_builder + quarter(unsigned i) const + { + return group(8, i); + } + + /** + * Construct a builder with per-channel control flow execution masking + * disabled if \p b is true. If control flow execution masking is + * already disabled this has no effect. + */ + fs_builder + exec_all(bool b = true) const + { + fs_builder bld = *this; + if (b) + bld.force_writemask_all = true; + return bld; + } + + /** + * Construct a builder with the given debug annotation info. + */ + fs_builder + annotate(const char *str, const void *ir = NULL) const + { + fs_builder bld = *this; + bld.annotation.str = str; + bld.annotation.ir = ir; + return bld; + } + + /** + * Get the SIMD width in use. + */ + unsigned + dispatch_width() const + { + return _dispatch_width; + } + + /** + * Get the channel group in use. + */ + unsigned + group() const + { + return _group; + } + + /** + * Allocate a virtual register of natural vector size (one for this IR) + * and SIMD width. \p n gives the amount of space to allocate in + * dispatch_width units (which is just enough space for one logical + * component in this IR). + */ + dst_reg + vgrf(enum brw_reg_type type, unsigned n = 1) const + { + const unsigned unit = reg_unit(shader->devinfo); + assert(dispatch_width() <= 32); + + if (n > 0) + return dst_reg(VGRF, shader->alloc.allocate( + DIV_ROUND_UP(n * type_sz(type) * dispatch_width(), + unit * REG_SIZE) * unit), + type); + else + return retype(null_reg_ud(), type); + } + + /** + * Create a null register of floating type. 
+ */ + dst_reg + null_reg_f() const + { + return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F)); + } + + dst_reg + null_reg_df() const + { + return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF)); + } + + /** + * Create a null register of signed integer type. + */ + dst_reg + null_reg_d() const + { + return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + } + + /** + * Create a null register of unsigned integer type. + */ + dst_reg + null_reg_ud() const + { + return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); + } + + /** + * Insert an instruction into the program. + */ + instruction * + emit(const instruction &inst) const + { + return emit(new(shader->mem_ctx) instruction(inst)); + } + + /** + * Create and insert a nullary control instruction into the program. + */ + instruction * + emit(enum opcode opcode) const + { + return emit(instruction(opcode, dispatch_width())); + } + + /** + * Create and insert a nullary instruction into the program. + */ + instruction * + emit(enum opcode opcode, const dst_reg &dst) const + { + return emit(instruction(opcode, dispatch_width(), dst)); + } + + /** + * Create and insert a unary instruction into the program. + */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const + { + switch (opcode) { + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return emit(instruction(opcode, dispatch_width(), dst, + fix_math_operand(src0))); + + default: + return emit(instruction(opcode, dispatch_width(), dst, src0)); + } + } + + /** + * Create and insert a binary instruction into the program. 
+ */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, + const src_reg &src1) const + { + switch (opcode) { + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + return emit(instruction(opcode, dispatch_width(), dst, + fix_math_operand(src0), + fix_math_operand(src1))); + + default: + return emit(instruction(opcode, dispatch_width(), dst, + src0, src1)); + + } + } + + /** + * Create and insert a ternary instruction into the program. + */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, + const src_reg &src1, const src_reg &src2) const + { + switch (opcode) { + case BRW_OPCODE_BFE: + case BRW_OPCODE_BFI2: + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + return emit(instruction(opcode, dispatch_width(), dst, + fix_3src_operand(src0), + fix_3src_operand(src1), + fix_3src_operand(src2))); + + default: + return emit(instruction(opcode, dispatch_width(), dst, + src0, src1, src2)); + } + } + + /** + * Create and insert an instruction with a variable number of sources + * into the program. + */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[], + unsigned n) const + { + /* Use the emit() methods for specific operand counts to ensure that + * opcode-specific operand fixups occur. + */ + if (n == 2) { + return emit(opcode, dst, srcs[0], srcs[1]); + } else if (n == 3) { + return emit(opcode, dst, srcs[0], srcs[1], srcs[2]); + } else { + return emit(instruction(opcode, dispatch_width(), dst, srcs, n)); + } + } + + /** + * Insert a preallocated instruction into the program. 
+ */ + instruction * + emit(instruction *inst) const + { + assert(inst->exec_size <= 32); + assert(inst->exec_size == dispatch_width() || + force_writemask_all); + + inst->group = _group; + inst->force_writemask_all = force_writemask_all; + inst->annotation = annotation.str; + inst->ir = annotation.ir; + + if (block) + static_cast(cursor)->insert_before(block, inst); + else + cursor->insert_before(inst); + + return inst; + } + + /** + * Select \p src0 if the comparison of both sources with the given + * conditional mod evaluates to true, otherwise select \p src1. + * + * Generally useful to get the minimum or maximum of two values. + */ + instruction * + emit_minmax(const dst_reg &dst, const src_reg &src0, + const src_reg &src1, brw_conditional_mod mod) const + { + assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L); + + /* In some cases we can't have bytes as operand for src1, so use the + * same type for both operand. + */ + return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0), + fix_unsigned_negate(src1))); + } + + /** + * Copy any live channel from \p src to the first channel of the result. + */ + src_reg + emit_uniformize(const src_reg &src) const + { + /* FIXME: We use a vector chan_index and dst to allow constant and + * copy propagration to move result all the way into the consuming + * instruction (typically a surface index or sampler index for a + * send). This uses 1 or 3 extra hw registers in 16 or 32 wide + * dispatch. Once we teach const/copy propagation about scalars we + * should go back to scalar destinations here. 
+ */ + const fs_builder ubld = exec_all(); + const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD); + const dst_reg dst = vgrf(src.type); + + ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index); + ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0)); + + return src_reg(component(dst, 0)); + } + + src_reg + move_to_vgrf(const src_reg &src, unsigned num_components) const + { + src_reg *const src_comps = new src_reg[num_components]; + for (unsigned i = 0; i < num_components; i++) + src_comps[i] = offset(src, dispatch_width(), i); + + const dst_reg dst = vgrf(src.type, num_components); + LOAD_PAYLOAD(dst, src_comps, num_components, 0); + + delete[] src_comps; + + return src_reg(dst); + } + + void + emit_scan_step(enum opcode opcode, brw_conditional_mod mod, + const dst_reg &tmp, + unsigned left_offset, unsigned left_stride, + unsigned right_offset, unsigned right_stride) const + { + dst_reg left, right; + left = horiz_stride(horiz_offset(tmp, left_offset), left_stride); + right = horiz_stride(horiz_offset(tmp, right_offset), right_stride); + if ((tmp.type == BRW_REGISTER_TYPE_Q || + tmp.type == BRW_REGISTER_TYPE_UQ) && + !shader->devinfo->has_64bit_int) { + switch (opcode) { + case BRW_OPCODE_MUL: + /* This will get lowered by integer MUL lowering */ + set_condmod(mod, emit(opcode, right, left, right)); + break; + + case BRW_OPCODE_SEL: { + /* In order for the comparisons to work out right, we need our + * comparisons to be strict. + */ + assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE); + if (mod == BRW_CONDITIONAL_GE) + mod = BRW_CONDITIONAL_G; + + /* We treat the bottom 32 bits as unsigned regardless of + * whether or not the integer as a whole is signed. 
+ */ + dst_reg right_low = subscript(right, BRW_REGISTER_TYPE_UD, 0); + dst_reg left_low = subscript(left, BRW_REGISTER_TYPE_UD, 0); + + /* The upper bits get the same sign as the 64-bit type */ + brw_reg_type type32 = brw_reg_type_from_bit_size(32, tmp.type); + dst_reg right_high = subscript(right, type32, 1); + dst_reg left_high = subscript(left, type32, 1); + + /* Build up our comparison: + * + * l_hi < r_hi || (l_hi == r_hi && l_low < r_low) + */ + CMP(null_reg_ud(), retype(left_low, BRW_REGISTER_TYPE_UD), + retype(right_low, BRW_REGISTER_TYPE_UD), mod); + set_predicate(BRW_PREDICATE_NORMAL, + CMP(null_reg_ud(), left_high, right_high, + BRW_CONDITIONAL_EQ)); + set_predicate_inv(BRW_PREDICATE_NORMAL, true, + CMP(null_reg_ud(), left_high, right_high, mod)); + + /* We could use selects here or we could use predicated MOVs + * because the destination and second source (if it were a SEL) + * are the same. + */ + set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low)); + set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high)); + break; + } + + default: + unreachable("Unsupported 64-bit scan op"); + } + } else { + set_condmod(mod, emit(opcode, right, left, right)); + } + } + + void + emit_scan(enum opcode opcode, const dst_reg &tmp, + unsigned cluster_size, brw_conditional_mod mod) const + { + assert(dispatch_width() >= 8); + + /* The instruction splitting code isn't advanced enough to split + * these so we need to handle that ourselves. 
+ */ + if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) { + const unsigned half_width = dispatch_width() / 2; + const fs_builder ubld = exec_all().group(half_width, 0); + dst_reg left = tmp; + dst_reg right = horiz_offset(tmp, half_width); + ubld.emit_scan(opcode, left, cluster_size, mod); + ubld.emit_scan(opcode, right, cluster_size, mod); + if (cluster_size > half_width) { + ubld.emit_scan_step(opcode, mod, tmp, + half_width - 1, 0, half_width, 1); + } + return; + } + + if (cluster_size > 1) { + const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0); + ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2); + } + + if (cluster_size > 2) { + if (type_sz(tmp.type) <= 4) { + const fs_builder ubld = + exec_all().group(dispatch_width() / 4, 0); + ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4); + ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4); + } else { + /* For 64-bit types, we have to do things differently because + * the code above would land us with destination strides that + * the hardware can't handle. Fortunately, we'll only be + * 8-wide in that case and it's the same number of + * instructions. 
+ */ + const fs_builder ubld = exec_all().group(2, 0); + for (unsigned i = 0; i < dispatch_width(); i += 4) + ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1); + } + } + + for (unsigned i = 4; + i < MIN2(cluster_size, dispatch_width()); + i *= 2) { + const fs_builder ubld = exec_all().group(i, 0); + ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1); + + if (dispatch_width() > i * 2) + ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1); + + if (dispatch_width() > i * 4) { + ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1); + ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1); + } + } + } + + instruction * + emit_undef_for_dst(const instruction *old_inst) const + { + assert(old_inst->dst.file == VGRF); + instruction *inst = emit(SHADER_OPCODE_UNDEF, + retype(old_inst->dst, BRW_REGISTER_TYPE_UD)); + inst->size_written = old_inst->size_written; + + return inst; + } + + /** + * Assorted arithmetic ops. + * @{ + */ +#define ALU1(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0) const \ + { \ + return emit(BRW_OPCODE_##op, dst, src0); \ + } + +#define ALU2(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ + { \ + return emit(BRW_OPCODE_##op, dst, src0, src1); \ + } + +#define ALU2_ACC(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ + { \ + instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \ + inst->writes_accumulator = true; \ + return inst; \ + } + +#define ALU3(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \ + const src_reg &src2) const \ + { \ + return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \ + } + + ALU2(ADD) + ALU3(ADD3) + ALU2_ACC(ADDC) + ALU2(AND) + ALU2(ASR) + ALU2(AVG) + ALU3(BFE) + ALU2(BFI1) + ALU3(BFI2) + ALU1(BFREV) + ALU1(CBIT) + ALU1(DIM) + ALU2(DP2) + ALU2(DP3) + ALU2(DP4) + ALU2(DPH) + ALU1(FBH) + ALU1(FBL) + ALU1(FRC) + 
ALU3(DP4A) + ALU2(LINE) + ALU1(LZD) + ALU2(MAC) + ALU2_ACC(MACH) + ALU3(MAD) + ALU1(MOV) + ALU2(MUL) + ALU1(NOT) + ALU2(OR) + ALU2(PLN) + ALU1(RNDD) + ALU1(RNDE) + ALU1(RNDU) + ALU1(RNDZ) + ALU2(ROL) + ALU2(ROR) + ALU2(SAD2) + ALU2_ACC(SADA2) + ALU2(SEL) + ALU2(SHL) + ALU2(SHR) + ALU2_ACC(SUBB) + ALU2(XOR) + +#undef ALU3 +#undef ALU2_ACC +#undef ALU2 +#undef ALU1 + + instruction * + F32TO16(const dst_reg &dst, const src_reg &src) const + { + assert(dst.type == BRW_REGISTER_TYPE_HF); + assert(src.type == BRW_REGISTER_TYPE_F); + + if (shader->devinfo->ver >= 8) { + return MOV(dst, src); + } else { + assert(shader->devinfo->ver == 7); + return emit(BRW_OPCODE_F32TO16, + retype(dst, BRW_REGISTER_TYPE_W), src); + } + } + + instruction * + F16TO32(const dst_reg &dst, const src_reg &src) const + { + assert(dst.type == BRW_REGISTER_TYPE_F); + assert(src.type == BRW_REGISTER_TYPE_HF); + + if (shader->devinfo->ver >= 8) { + return MOV(dst, src); + } else { + assert(shader->devinfo->ver == 7); + return emit(BRW_OPCODE_F16TO32, + dst, retype(src, BRW_REGISTER_TYPE_W)); + } + } + /** @} */ + + /** + * CMP: Sets the low bit of the destination channels with the result + * of the comparison, while the upper bits are undefined, and updates + * the flag register with the packed 16 bits of the result. + */ + instruction * + CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1, + brw_conditional_mod condition) const + { + /* Take the instruction: + * + * CMP null src0 src1 + * + * Original gfx4 does type conversion to the destination type + * before comparison, producing garbage results for floating + * point comparisons. + * + * The destination type doesn't matter on newer generations, + * so we set the type to match src0 so we can compact the + * instruction. 
+ */ + return set_condmod(condition, + emit(BRW_OPCODE_CMP, retype(dst, src0.type), + fix_unsigned_negate(src0), + fix_unsigned_negate(src1))); + } + + /** + * CMPN: Behaves like CMP, but produces true if src1 is NaN. + */ + instruction * + CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1, + brw_conditional_mod condition) const + { + /* Take the instruction: + * + * CMP null src0 src1 + * + * Original gfx4 does type conversion to the destination type + * before comparison, producing garbage results for floating + * point comparisons. + * + * The destination type doesn't matter on newer generations, + * so we set the type to match src0 so we can compact the + * instruction. + */ + return set_condmod(condition, + emit(BRW_OPCODE_CMPN, retype(dst, src0.type), + fix_unsigned_negate(src0), + fix_unsigned_negate(src1))); + } + + /** + * Gfx4 predicated IF. + */ + instruction * + IF(brw_predicate predicate) const + { + return set_predicate(predicate, emit(BRW_OPCODE_IF)); + } + + /** + * CSEL: dst = src2 0.0f ? src0 : src1 + */ + instruction * + CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1, + const src_reg &src2, brw_conditional_mod condition) const + { + /* CSEL only operates on floats, so we can't do integer =/> + * comparisons. Zero/non-zero (== and !=) comparisons almost work. + * 0x80000000 fails because it is -0.0, and -0.0 == 0.0. + */ + assert(src2.type == BRW_REGISTER_TYPE_F); + + return set_condmod(condition, + emit(BRW_OPCODE_CSEL, + retype(dst, BRW_REGISTER_TYPE_F), + retype(src0, BRW_REGISTER_TYPE_F), + retype(src1, BRW_REGISTER_TYPE_F), + src2)); + } + + /** + * Emit a linear interpolation instruction. + */ + instruction * + LRP(const dst_reg &dst, const src_reg &x, const src_reg &y, + const src_reg &a) const + { + if (shader->devinfo->ver >= 6 && shader->devinfo->ver <= 10) { + /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so + * we need to reorder the operands. 
+ */ + return emit(BRW_OPCODE_LRP, dst, a, y, x); + + } else { + /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */ + const dst_reg y_times_a = vgrf(dst.type); + const dst_reg one_minus_a = vgrf(dst.type); + const dst_reg x_times_one_minus_a = vgrf(dst.type); + + MUL(y_times_a, y, a); + ADD(one_minus_a, negate(a), brw_imm_f(1.0f)); + MUL(x_times_one_minus_a, x, src_reg(one_minus_a)); + return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)); + } + } + + /** + * Collect a number of registers in a contiguous range of registers. + */ + instruction * + LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src, + unsigned sources, unsigned header_size) const + { + instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources); + inst->header_size = header_size; + inst->size_written = header_size * REG_SIZE; + for (unsigned i = header_size; i < sources; i++) { + inst->size_written += dispatch_width() * type_sz(src[i].type) * + dst.stride; + } + + return inst; + } + + instruction * + UNDEF(const dst_reg &dst) const + { + assert(dst.file == VGRF); + assert(dst.offset % REG_SIZE == 0); + instruction *inst = emit(SHADER_OPCODE_UNDEF, + retype(dst, BRW_REGISTER_TYPE_UD)); + inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset; + + return inst; + } + + instruction * + DPAS(const dst_reg &dst, const src_reg &src0, const src_reg &src1, const src_reg &src2, + unsigned sdepth, unsigned rcount) const + { + assert(_dispatch_width == 8); + assert(sdepth == 8); + assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8); + + instruction *inst = emit(BRW_OPCODE_DPAS, dst, src0, src1, src2); + inst->sdepth = sdepth; + inst->rcount = rcount; + + if (dst.type == BRW_REGISTER_TYPE_HF) { + inst->size_written = rcount * REG_SIZE / 2; + } else { + inst->size_written = rcount * REG_SIZE; + } + + return inst; + } + + fs_visitor *shader; + + fs_inst *BREAK() { return emit(BRW_OPCODE_BREAK); } + fs_inst *DO() { return emit(BRW_OPCODE_DO); } + 
      fs_inst *ENDIF() { return emit(BRW_OPCODE_ENDIF); }
      fs_inst *NOP() { return emit(BRW_OPCODE_NOP); }
      fs_inst *WHILE() { return emit(BRW_OPCODE_WHILE); }
      fs_inst *CONTINUE() { return emit(BRW_OPCODE_CONTINUE); }

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         /* Resolve the negation through an explicit MOV into a fresh VGRF
          * so the consuming instruction sees a plain UD source.
          */
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         /* Unsupported region mode: materialize the value in a VGRF first. */
         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gfx6 math, so expand it out.  We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gfx6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
+ * + * Gfx7 relaxes most of the above restrictions, but still can't use IMM + * operands to math + */ + if ((shader->devinfo->ver == 6 && + (src.file == IMM || src.file == UNIFORM || + src.abs || src.negate)) || + (shader->devinfo->ver == 7 && src.file == IMM)) { + const dst_reg tmp = vgrf(src.type); + MOV(tmp, src); + return tmp; + } else { + return src; + } + } + + bblock_t *block; + exec_node *cursor; + + unsigned _dispatch_width; + unsigned _group; + bool force_writemask_all; + + /** Debug annotation info. */ + struct { + const char *str; + const void *ir; + } annotation; + }; +} + +static inline fs_reg +offset(const fs_reg ®, const brw::fs_builder &bld, unsigned delta) +{ + return offset(reg, bld.dispatch_width(), delta); +} + +#endif diff --git a/src/intel/compiler/elk/brw_fs_cmod_propagation.cpp b/src/intel/compiler/elk/brw_fs_cmod_propagation.cpp new file mode 100644 index 00000000000..0fadb402172 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_cmod_propagation.cpp @@ -0,0 +1,568 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_eu.h"
+
+/** @file brw_fs_cmod_propagation.cpp
+ *
+ * Implements a pass that propagates the conditional modifier from a CMP x 0.0
+ * instruction into the instruction that generated x.  For instance, in this
+ * sequence
+ *
+ *    add(8)          g70<1>F    g69<8,8,1>F    4096F
+ *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
+ *
+ * we can do the comparison as part of the ADD instruction directly:
+ *
+ *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
+ *
+ * If there had been a use of the flag register and another CMP using g70
+ *
+ *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
+ *    (+f0) sel(8)    g71        g72<8,8,1>F    g73<8,8,1>F
+ *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
+ *
+ * we can recognize that the CMP is generating the flag value that already
+ * exists and therefore remove the instruction.
+ */
+
+using namespace brw;
+
+/* Try to fold \p inst (a CMP against a non-zero source) into the ADD that
+ * computed the compared value, scanning backwards through the block.  On
+ * success the CMP is removed and true is returned.
+ */
+static bool
+cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block,
+                          fs_inst *inst)
+{
+   /* Set once any intervening instruction reads the flag bits inst writes;
+    * after that, changing an earlier instruction's cmod would be observable.
+    */
+   bool read_flag = false;
+   const unsigned flags_written = inst->flags_written(devinfo);
+
+   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+      if (scan_inst->opcode == BRW_OPCODE_ADD &&
+          !scan_inst->is_partial_write() &&
+          scan_inst->exec_size == inst->exec_size) {
+         bool negate;
+
+         /* A CMP is basically a subtraction.  The result of the
+          * subtraction must be the same as the result of the addition.
+          * This means that one of the operands must be negated.  So (a +
+          * b) vs (a == -b) or (a + -b) vs (a == b).
+          */
+         if ((inst->src[0].equals(scan_inst->src[0]) &&
+              inst->src[1].negative_equals(scan_inst->src[1])) ||
+             (inst->src[0].equals(scan_inst->src[1]) &&
+              inst->src[1].negative_equals(scan_inst->src[0]))) {
+            negate = false;
+         } else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
+                     inst->src[1].equals(scan_inst->src[1])) ||
+                    (inst->src[0].negative_equals(scan_inst->src[1]) &&
+                     inst->src[1].equals(scan_inst->src[0]))) {
+            negate = true;
+         } else {
+            goto not_match;
+         }
+
+         /* If the scan instruction writes a different flag register than the
+          * instruction we're trying to propagate from, bail.
+          *
+          * FINISHME: The second part of the condition may be too strong.
+          * Perhaps (scan_inst->flags_written() & flags_written) !=
+          * flags_written?
+          */
+         if (scan_inst->flags_written(devinfo) != 0 &&
+             scan_inst->flags_written(devinfo) != flags_written)
+            goto not_match;
+
+         /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
+          *
+          *    * Note that the [post condition signal] bits generated at
+          *      the output of a compute are before the .sat.
+          *
+          * Paragraph about post_zero does not mention saturation, but
+          * testing it on actual GPUs shows that conditional modifiers
+          * are applied after saturation.
+          *
+          *    * post_zero bit: This bit reflects whether the final
+          *      result is zero after all the clamping, normalizing,
+          *      or format conversion logic.
+          *
+          * For signed types we don't care about saturation: it won't
+          * change the result of conditional modifier.
+          *
+          * For floating and unsigned types there two special cases,
+          * when we can remove inst even if scan_inst is saturated: G
+          * and LE.  Since conditional modifiers are just comparisons
+          * against zero, saturating positive values to the upper
+          * limit never changes the result of comparison.
+          *
+          * For negative values:
+          * (sat(x) > 0) == (x > 0) --- false
+          * (sat(x) <= 0) == (x <= 0) --- true
+          */
+         const enum brw_conditional_mod cond =
+            negate ? brw_swap_cmod(inst->conditional_mod)
+                   : inst->conditional_mod;
+
+         if (scan_inst->saturate &&
+             (brw_reg_type_is_floating_point(scan_inst->dst.type) ||
+              brw_reg_type_is_unsigned_integer(scan_inst->dst.type)) &&
+             (cond != BRW_CONDITIONAL_G &&
+              cond != BRW_CONDITIONAL_LE))
+            goto not_match;
+
+         /* Otherwise, try propagating the conditional. */
+         if (scan_inst->can_do_cmod() &&
+             ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
+              scan_inst->conditional_mod == cond)) {
+            scan_inst->conditional_mod = cond;
+            scan_inst->flag_subreg = inst->flag_subreg;
+            inst->remove(block, true);
+            return true;
+         }
+         break;
+      }
+
+   not_match:
+      /* Stop at any instruction that clobbers the flag bits we depend on. */
+      if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
+         break;
+
+      read_flag = read_flag ||
+                  (scan_inst->flags_read(devinfo) & flags_written) != 0;
+   }
+
+   return false;
+}
+
+/**
+ * Propagate conditional modifiers from NOT instructions
+ *
+ * Attempt to convert sequences like
+ *
+ *    or(8)           g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
+ *    ...
+ *    not.nz.f0(8)    null            g78<8,8,1>UD
+ *
+ * into
+ *
+ *    or.z.f0(8)      g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
+ */
+static bool
+cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block,
+                   fs_inst *inst)
+{
+   /* The NOT inverts its source, so the cmod propagated into the producer is
+    * the negation of the NOT's own cmod.
+    */
+   const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
+   bool read_flag = false;
+   const unsigned flags_written = inst->flags_written(devinfo);
+
+   /* Only Z/NZ survive logical inversion of the result bits; bail for any
+    * ordering cmod.
+    */
+   if (cond != BRW_CONDITIONAL_Z && cond != BRW_CONDITIONAL_NZ)
+      return false;
+
+   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+      if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+                          inst->src[0], inst->size_read(0))) {
+         if (scan_inst->opcode != BRW_OPCODE_OR &&
+             scan_inst->opcode != BRW_OPCODE_AND)
+            break;
+
+         if (scan_inst->is_partial_write() ||
+             scan_inst->dst.offset != inst->src[0].offset ||
+             scan_inst->exec_size != inst->exec_size)
+            break;
+
+         /* If the scan instruction writes a different flag register than the
+          * instruction we're trying to propagate from, bail.
+          *
+          * FINISHME: The second part of the condition may be too strong.
+          * Perhaps (scan_inst->flags_written() & flags_written) !=
+          * flags_written?
+          */
+         if (scan_inst->flags_written(devinfo) != 0 &&
+             scan_inst->flags_written(devinfo) != flags_written)
+            break;
+
+         if (scan_inst->can_do_cmod() &&
+             ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
+              scan_inst->conditional_mod == cond)) {
+            scan_inst->conditional_mod = cond;
+            scan_inst->flag_subreg = inst->flag_subreg;
+            inst->remove(block, true);
+            return true;
+         }
+         break;
+      }
+
+      if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
+         break;
+
+      read_flag = read_flag ||
+                  (scan_inst->flags_read(devinfo) & flags_written) != 0;
+   }
+
+   return false;
+}
+
+/* Walk \p block backwards, trying to fold each flag-writing, null-destination
+ * AND/CMP/MOV/NOT into the instruction that produced its source.  Returns
+ * true iff at least one instruction was removed.
+ */
+static bool
+opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
+{
+   bool progress = false;
+   UNUSED int ip = block->end_ip + 1;
+
+   foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
+      ip--;
+
+      if ((inst->opcode != BRW_OPCODE_AND &&
+           inst->opcode != BRW_OPCODE_CMP &&
+           inst->opcode != BRW_OPCODE_MOV &&
+           inst->opcode != BRW_OPCODE_NOT) ||
+          inst->predicate != BRW_PREDICATE_NONE ||
+          !inst->dst.is_null() ||
+          (inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
+           inst->src[0].file != UNIFORM))
+         continue;
+
+      /* An ABS source modifier can only be handled when processing a compare
+       * with a value other than zero.
+       */
+      if (inst->src[0].abs &&
+          (inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero()))
+         continue;
+
+      /* Only an AND.NZ can be propagated.  Many AND.Z instructions are
+       * generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code).
+       * Propagating those would require inverting the condition on the CMP.
+       * This changes both the flag value and the register destination of the
+       * CMP.  That result may be used elsewhere, so we can't change its value
+       * on a whim.
+       */
+      if (inst->opcode == BRW_OPCODE_AND &&
+          !(inst->src[1].is_one() &&
+            inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+            !inst->src[0].negate))
+         continue;
+
+      /* A CMP with a second source of zero can match with anything.
A CMP
+       * with a second source that is not zero can only match with an ADD
+       * instruction.
+       *
+       * Only apply this optimization to float-point sources.  It can fail for
+       * integers.  For inputs a = 0x80000000, b = 4, int(0x80000000) < 4, but
+       * int(0x80000000) - 4 overflows and results in 0x7ffffffc.  that's not
+       * less than zero, so the flags get set differently than for (a < b).
+       */
+      if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
+         if (brw_reg_type_is_floating_point(inst->src[0].type) &&
+             cmod_propagate_cmp_to_add(devinfo, block, inst))
+            progress = true;
+
+         continue;
+      }
+
+      if (inst->opcode == BRW_OPCODE_NOT) {
+         progress = cmod_propagate_not(devinfo, block, inst) || progress;
+         continue;
+      }
+
+      /* General case: inst is CMP-with-zero, MOV, or AND.NZ-with-one.  Scan
+       * backwards for the instruction that wrote inst->src[0].
+       */
+      bool read_flag = false;
+      const unsigned flags_written = inst->flags_written(devinfo);
+      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+                             inst->src[0], inst->size_read(0))) {
+            /* If the scan instruction writes a different flag register than
+             * the instruction we're trying to propagate from, bail.
+             *
+             * FINISHME: The second part of the condition may be too strong.
+             * Perhaps (scan_inst->flags_written() & flags_written) !=
+             * flags_written?
+             */
+            if (scan_inst->flags_written(devinfo) != 0 &&
+                scan_inst->flags_written(devinfo) != flags_written)
+               break;
+
+            if (scan_inst->is_partial_write() ||
+                scan_inst->dst.offset != inst->src[0].offset ||
+                scan_inst->exec_size != inst->exec_size)
+               break;
+
+            /* If the write mask is different we can't propagate. */
+            if (scan_inst->force_writemask_all != inst->force_writemask_all)
+               break;
+
+            /* CMP's result is the same regardless of dest type. */
+            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+                scan_inst->opcode == BRW_OPCODE_CMP &&
+                brw_reg_type_is_integer(inst->dst.type)) {
+               inst->remove(block, true);
+               progress = true;
+               break;
+            }
+
+            /* If the AND wasn't handled by the previous case, it isn't safe
+             * to remove it.
+             */
+            if (inst->opcode == BRW_OPCODE_AND)
+               break;
+
+            if (inst->opcode == BRW_OPCODE_MOV) {
+               if (brw_reg_type_is_floating_point(scan_inst->dst.type)) {
+                  /* If the destination type of scan_inst is floating-point,
+                   * then:
+                   *
+                   * - The source of the MOV instruction must be the same
+                   *   type.
+                   *
+                   * - The destination of the MOV instruction must be float
+                   *   point with a size at least as large as the destination
+                   *   of inst.  Size-reducing f2f conversions could cause
+                   *   non-zero values to become zero, etc.
+                   */
+                  if (scan_inst->dst.type != inst->src[0].type)
+                     break;
+
+                  if (!brw_reg_type_is_floating_point(inst->dst.type))
+                     break;
+
+                  if (type_sz(scan_inst->dst.type) > type_sz(inst->dst.type))
+                     break;
+               } else {
+                  /* If the destination type of scan_inst is integer, then:
+                   *
+                   * - The source of the MOV instruction must be integer with
+                   *   the same size.
+                   *
+                   * - If the conditional modifier is Z or NZ, then the
+                   *   destination type of inst must either be floating point
+                   *   (of any size) or integer with a size at least as large
+                   *   as the destination of inst.
+                   *
+                   * - If the conditional modifier is neither Z nor NZ, then the
+                   *   destination type of inst must either be floating point
+                   *   (of any size) or integer with a size at least as large
+                   *   as the destination of inst and the same signedness.
+                   */
+                  if (!brw_reg_type_is_integer(inst->src[0].type) ||
+                      type_sz(scan_inst->dst.type) != type_sz(inst->src[0].type))
+                     break;
+
+                  if (brw_reg_type_is_integer(inst->dst.type)) {
+                     if (type_sz(inst->dst.type) < type_sz(scan_inst->dst.type))
+                        break;
+
+                     if (inst->conditional_mod != BRW_CONDITIONAL_Z &&
+                         inst->conditional_mod != BRW_CONDITIONAL_NZ &&
+                         brw_reg_type_is_unsigned_integer(inst->dst.type) !=
+                         brw_reg_type_is_unsigned_integer(scan_inst->dst.type))
+                        break;
+                  }
+               }
+            } else {
+               /* Not safe to use inequality operators if the types are
+                * different.
+                */
+               if (scan_inst->dst.type != inst->src[0].type &&
+                   inst->conditional_mod != BRW_CONDITIONAL_Z &&
+                   inst->conditional_mod != BRW_CONDITIONAL_NZ)
+                  break;
+
+               /* Comparisons operate differently for ints and floats */
+               if (scan_inst->dst.type != inst->dst.type) {
+                  /* Comparison result may be altered if the bit-size changes
+                   * since that affects range, denorms, etc
+                   */
+                  if (type_sz(scan_inst->dst.type) != type_sz(inst->dst.type))
+                     break;
+
+                  if (brw_reg_type_is_floating_point(scan_inst->dst.type) !=
+                      brw_reg_type_is_floating_point(inst->dst.type))
+                     break;
+               }
+            }
+
+            /* Knowing following:
+             * - CMP writes to flag register the result of
+             *   applying cmod to the `src0 - src1`.
+             *   After that it stores the same value to dst.
+             *   Other instructions first store their result to
+             *   dst, and then store cmod(dst) to the flag
+             *   register.
+             * - inst is either CMP or MOV
+             * - inst->dst is null
+             * - inst->src[0] overlaps with scan_inst->dst
+             * - inst->src[1] is zero
+             * - scan_inst wrote to a flag register
+             *
+             * There can be three possible paths:
+             *
+             * - scan_inst is CMP:
+             *
+             *   Considering that src0 is either 0x0 (false),
+             *   or 0xffffffff (true), and src1 is 0x0:
+             *
+             *   - If inst's cmod is NZ, we can always remove
+             *     scan_inst: NZ is invariant for false and true.  This
+             *     holds even if src0 is NaN: .nz is the only cmod,
+             *     that returns true for NaN.
+             *
+             *   - .g is invariant if src0 has a UD type
+             *
+             *   - .l is invariant if src0 has a D type
+             *
+             * - scan_inst and inst have the same cmod:
+             *
+             *   If scan_inst is anything than CMP, it already
+             *   wrote the appropriate value to the flag register.
+             *
+             * - else:
+             *
+             *   We can change cmod of scan_inst to that of inst,
+             *   and remove inst.  It is valid as long as we make
+             *   sure that no instruction uses the flag register
+             *   between scan_inst and inst.
+             */
+            if (!inst->src[0].negate &&
+                scan_inst->flags_written(devinfo)) {
+               if (scan_inst->opcode == BRW_OPCODE_CMP) {
+                  if ((inst->conditional_mod == BRW_CONDITIONAL_NZ) ||
+                      (inst->conditional_mod == BRW_CONDITIONAL_G &&
+                       inst->src[0].type == BRW_REGISTER_TYPE_UD) ||
+                      (inst->conditional_mod == BRW_CONDITIONAL_L &&
+                       inst->src[0].type == BRW_REGISTER_TYPE_D)) {
+                     inst->remove(block, true);
+                     progress = true;
+                     break;
+                  }
+               } else if (scan_inst->conditional_mod == inst->conditional_mod) {
+                  /* On Gfx4 and Gfx5 sel.cond will dirty the flags, but the
+                   * flags value is not based on the result stored in the
+                   * destination.  On all other platforms sel.cond will not
+                   * write the flags, so execution will not get to this point.
+                   */
+                  if (scan_inst->opcode == BRW_OPCODE_SEL) {
+                     assert(devinfo->ver <= 5);
+                  } else {
+                     inst->remove(block, true);
+                     progress = true;
+                  }
+
+                  break;
+               } else if (!read_flag && scan_inst->can_do_cmod()) {
+                  scan_inst->conditional_mod = inst->conditional_mod;
+                  scan_inst->flag_subreg = inst->flag_subreg;
+                  inst->remove(block, true);
+                  progress = true;
+                  break;
+               }
+            }
+
+            /* The conditional mod of the CMP/CMPN instructions behaves
+             * specially because the flag output is not calculated from the
+             * result of the instruction, but the other way around, which
+             * means that even if the condmod to propagate and the condmod
+             * from the CMP instruction are the same they will in general give
+             * different results because they are evaluated based on different
+             * inputs.
+             */
+            if (scan_inst->opcode == BRW_OPCODE_CMP ||
+                scan_inst->opcode == BRW_OPCODE_CMPN)
+               break;
+
+            /* From the Sky Lake PRM, Vol 2a, "Multiply":
+             *
+             *    "When multiplying integer data types, if one of the sources
+             *    is a DW, the resulting full precision data is stored in
+             *    the accumulator.  However, if the destination data type is
+             *    either W or DW, the low bits of the result are written to
+             *    the destination register and the remaining high bits are
+             *    discarded.  This results in undefined Overflow and Sign
+             *    flags.  Therefore, conditional modifiers and saturation
+             *    (.sat) cannot be used in this case."
+             *
+             * We just disallow cmod propagation on all integer multiplies.
+             */
+            if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
+                scan_inst->opcode == BRW_OPCODE_MUL)
+               break;
+
+            enum brw_conditional_mod cond =
+               inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
+                                   : inst->conditional_mod;
+
+            /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
+             *
+             *    * Note that the [post condition signal] bits generated at
+             *      the output of a compute are before the .sat.
+             *
+             * Paragraph about post_zero does not mention saturation, but
+             * testing it on actual GPUs shows that conditional modifiers are
+             * applied after saturation.
+             *
+             *    * post_zero bit: This bit reflects whether the final
+             *      result is zero after all the clamping, normalizing,
+             *      or format conversion logic.
+             *
+             * For this reason, no additional restrictions are necessary on
+             * instructions with saturate.
+             */
+
+            /* Otherwise, try propagating the conditional. */
+            if (scan_inst->can_do_cmod() &&
+                ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
+                 scan_inst->conditional_mod == cond)) {
+               scan_inst->conditional_mod = cond;
+               scan_inst->flag_subreg = inst->flag_subreg;
+               inst->remove(block, true);
+               progress = true;
+            }
+            break;
+         }
+
+         if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
+            break;
+
+         read_flag = read_flag ||
+                     (scan_inst->flags_read(devinfo) & flags_written) != 0;
+      }
+   }
+
+   /* There is progress if and only if instructions were removed. */
+   assert(progress == (block->end_ip_delta != 0));
+
+   return progress;
+}
+
+/* Run the local pass over every block; fix up block IPs and invalidate the
+ * instruction-dependent analyses only if something was removed.
+ */
+bool
+fs_visitor::opt_cmod_propagation()
+{
+   bool progress = false;
+
+   foreach_block_reverse(block, cfg) {
+      progress = opt_cmod_propagation_local(devinfo, block) || progress;
+   }
+
+   if (progress) {
+      cfg->adjust_block_ips();
+
+      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+   }
+
+   return progress;
+}
diff --git a/src/intel/compiler/elk/brw_fs_combine_constants.cpp b/src/intel/compiler/elk/brw_fs_combine_constants.cpp
new file mode 100644
index 00000000000..ed5176153da
--- /dev/null
+++ b/src/intel/compiler/elk/brw_fs_combine_constants.cpp
@@ -0,0 +1,1858 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_combine_constants.cpp
+ *
+ * This file contains the opt_combine_constants() pass that runs after the
+ * regular optimization loop.  It passes over the instruction list and
+ * selectively promotes immediate values to registers by emitting a mov(1)
+ * instruction.
+ *
+ * This is useful on Gen 7 particularly, because a few instructions can be
+ * coissued (i.e., issued in the same cycle as another thread on the same EU
+ * issues an instruction) under some circumstances, one of which is that they
+ * cannot use immediate values.
+ */
+
+#include "brw_fs.h"
+#include "brw_fs_builder.h"
+#include "brw_cfg.h"
+#include "util/half_float.h"
+
+using namespace brw;
+
+static const bool debug = false;
+
+/* How an instruction is permitted to interpret a candidate constant. */
+enum PACKED interpreted_type {
+   float_only = 0,
+   integer_only,
+   either_type
+};
+
+struct value {
+   /** Raw bit pattern of the value. */
+   nir_const_value value;
+
+   /** Instruction that uses this instance of the value. */
+   unsigned instr_index;
+
+   /** Size, in bits, of the value. */
+   uint8_t bit_size;
+
+   /**
+    * Which source of instr is this value?
+    *
+    * \note This field is not actually used by \c brw_combine_constants, but
+    * it is generally very useful to callers.
+    */
+   uint8_t src;
+
+   /**
+    * In what ways can instr interpret this value?
+    *
+    * Choices are floating-point only, integer only, or either type.
+    */
+   enum interpreted_type type;
+
+   /**
+    * Only try to make a single source non-constant.
+    *
+    * On some architectures, some instructions require that all sources be
+    * non-constant.  For example, the multiply-accumulate instruction on Intel
+    * GPUs up to Gen11 require that all sources be non-constant.  Other
+    * instructions, like the selection instruction, allow one constant source.
+    *
+    * If a single constant source is allowed, set this flag to true.
+    *
+    * If an instruction allows a single constant and it has only a single
+    * constant to begin, it should be included.  Various places in
+    * \c combine_constants will assume that there are multiple constants if
+    * \c ::allow_one_constant is set.  This may even be enforced by in-code
+    * assertions.
+    */
+   bool allow_one_constant;
+
+   /**
+    * Restrict values that can reach this value to not include negations.
+    *
+    * This is useful for instructions that cannot have source modifiers.  For
+    * example, on Intel GPUs the integer source of a shift instruction (e.g.,
+    * SHL) can have a source modifier, but the integer source of the bitfield
+    * insertion instruction (i.e., BFI2) cannot.  A pair of these instructions
+    * might have sources that are negations of each other.  Using this flag
+    * will ensure that the BFI2 does not have a negated source, but the SHL
+    * might.
+    */
+   bool no_negations;
+
+   /**
+    * \name UtilCombineConstantsPrivate
+    * Private data used only by brw_combine_constants
+    *
+    * Any data stored in these fields will be overwritten by the call to
+    * \c brw_combine_constants.  No assumptions should be made about the
+    * state of these fields after that function returns.
+    */
+   /**@{*/
+   /** Mask of negations that can be generated from this value. */
+   uint8_t reachable_mask;
+
+   /** Mask of negations that can generate this value. */
+   uint8_t reaching_mask;
+
+   /**
+    * Value with the next source from the same instruction.
+    *
+    * This pointer may be \c NULL.  If it is not \c NULL, it will form a
+    * singly-linked circular list of values.  The list is unordered.  That is,
+    * as the list is iterated, the \c ::src values will be in arbitrary order.
+    *
+    * \todo Is it even possible for there to be more than two elements in this
+    * list?  This pass does not operate on vecN instructions or intrinsics, so
+    * the theoretical limit should be three.  However, instructions with all
+    * constant sources should have been folded away.
+    */
+   struct value *next_src;
+   /**@}*/
+};
+
+struct combine_constants_value {
+   /** Raw bit pattern of the constant loaded. */
+   nir_const_value value;
+
+   /**
+    * Index of the first user.
+    *
+    * This is the offset into \c combine_constants_result::user_map of the
+    * first user of this value.
+    */
+   unsigned first_user;
+
+   /** Number of users of this value. */
+   unsigned num_users;
+
+   /** Size, in bits, of the value. */
+   uint8_t bit_size;
+};
+
+struct combine_constants_user {
+   /** Index into the array of values passed to brw_combine_constants. */
+   unsigned index;
+
+   /**
+    * Manner in which the value should be interpreted in the instruction.
+    *
+    * This is only useful when ::negate is set.  Unless the corresponding
+    * value::type is \c either_type, this field must have the same value as
+    * value::type.
+    */
+   enum interpreted_type type;
+
+   /** Should this value be negated to generate the original value? */
+   bool negate;
+};
+
+/* Owns the output of the combining pass: the deduplicated values to emit and
+ * the mapping from each emitted value back to its users.  \p success reports
+ * allocation failure since a constructor cannot return an error.
+ */
+class combine_constants_result {
+public:
+   combine_constants_result(unsigned num_candidates, bool &success)
+      : num_values_to_emit(0), user_map(NULL)
+   {
+      user_map = (struct combine_constants_user *) calloc(num_candidates,
+                                                          sizeof(user_map[0]));
+
+      /* In the worst case, the number of output values will be equal to the
+       * number of input values.  Allocate a buffer that is known to be large
+       * enough now, and it can be reduced later.
+       */
+      values_to_emit =
+         (struct combine_constants_value *) calloc(num_candidates,
+                                                   sizeof(values_to_emit[0]));
+
+      success = (user_map != NULL && values_to_emit != NULL);
+   }
+
+   ~combine_constants_result()
+   {
+      free(values_to_emit);
+      free(user_map);
+   }
+
+   void append_value(const nir_const_value &value, unsigned bit_size)
+   {
+      values_to_emit[num_values_to_emit].value = value;
+      values_to_emit[num_values_to_emit].first_user = 0;
+      values_to_emit[num_values_to_emit].num_users = 0;
+      values_to_emit[num_values_to_emit].bit_size = bit_size;
+      num_values_to_emit++;
+   }
+
+   unsigned num_values_to_emit;
+   struct combine_constants_value *values_to_emit;
+
+   struct combine_constants_user *user_map;
+};
+
+/* Slot indices into the reachable_values array, and the matching bit masks
+ * used in reachable_mask / reaching_mask.
+ */
+#define VALUE_INDEX       0
+#define FLOAT_NEG_INDEX   1
+#define INT_NEG_INDEX     2
+#define MAX_NUM_REACHABLE 3
+
+#define VALUE_EXISTS      (1 << VALUE_INDEX)
+#define FLOAT_NEG_EXISTS  (1 << FLOAT_NEG_INDEX)
+#define INT_NEG_EXISTS    (1 << INT_NEG_INDEX)
+
+/* Does \p v have a representable negation under the given interpretation?
+ * Zero and the most-negative integer are excluded (the latter has no
+ * two's-complement negation), as are NaNs under float interpretation.
+ */
+static bool
+negation_exists(nir_const_value v, unsigned bit_size,
+                enum interpreted_type base_type)
+{
+   /* either_type does not make sense in this context. */
+   assert(base_type == float_only || base_type == integer_only);
+
+   switch (bit_size) {
+   case 8:
+      if (base_type == float_only)
+         return false;
+      else
+         return v.i8 != 0 && v.i8 != INT8_MIN;
+
+   case 16:
+      if (base_type == float_only)
+         return !util_is_half_nan(v.i16);
+      else
+         return v.i16 != 0 && v.i16 != INT16_MIN;
+
+   case 32:
+      if (base_type == float_only)
+         return !isnan(v.f32);
+      else
+         return v.i32 != 0 && v.i32 != INT32_MIN;
+
+   case 64:
+      if (base_type == float_only)
+         return !isnan(v.f64);
+      else
+         return v.i64 != 0 && v.i64 != INT64_MIN;
+
+   default:
+      unreachable("unsupported bit-size should have already been filtered.");
+   }
+}
+
+/* Negate \p v: sign-bit flip for float interpretation, two's-complement
+ * negation for integer interpretation.
+ */
+static nir_const_value
+negate(nir_const_value v, unsigned bit_size, enum interpreted_type base_type)
+{
+   /* either_type does not make sense in this context.
*/
+   assert(base_type == float_only || base_type == integer_only);
+
+   nir_const_value ret = { 0, };
+
+   switch (bit_size) {
+   case 8:
+      assert(base_type == integer_only);
+      ret.i8 = -v.i8;
+      break;
+
+   case 16:
+      if (base_type == float_only)
+         ret.u16 = v.u16 ^ INT16_MIN;
+      else
+         ret.i16 = -v.i16;
+      break;
+
+   case 32:
+      if (base_type == float_only)
+         ret.u32 = v.u32 ^ INT32_MIN;
+      else
+         ret.i32 = -v.i32;
+      break;
+
+   case 64:
+      if (base_type == float_only)
+         ret.u64 = v.u64 ^ INT64_MIN;
+      else
+         ret.i64 = -v.i64;
+      break;
+
+   default:
+      unreachable("unsupported bit-size should have already been filtered.");
+   }
+
+   return ret;
+}
+
+/* Absolute value of \p v under the given interpretation: clear the sign bit
+ * for floats, integer abs otherwise.
+ */
+static nir_const_value
+absolute(nir_const_value v, unsigned bit_size, enum interpreted_type base_type)
+{
+   /* either_type does not make sense in this context. */
+   assert(base_type == float_only || base_type == integer_only);
+
+   nir_const_value ret = { 0, };
+
+   switch (bit_size) {
+   case 8:
+      assert(base_type == integer_only);
+      ret.i8 = abs(v.i8);
+      break;
+
+   case 16:
+      if (base_type == float_only)
+         ret.u16 = v.u16 & 0x7fff;
+      else
+         ret.i16 = abs(v.i16);
+      break;
+
+   case 32:
+      if (base_type == float_only)
+         ret.f32 = fabs(v.f32);
+      else
+         ret.i32 = abs(v.i32);
+      break;
+
+   case 64:
+      if (base_type == float_only)
+         ret.f64 = fabs(v.f64);
+      else {
+         /* labs vs. llabs depends on the platform's width of long. */
+         if (sizeof(v.i64) == sizeof(long int)) {
+            ret.i64 = labs((long int) v.i64);
+         } else {
+            assert(sizeof(v.i64) == sizeof(long long int));
+            ret.i64 = llabs((long long int) v.i64);
+         }
+      }
+      break;
+
+   default:
+      unreachable("unsupported bit-size should have already been filtered.");
+   }
+
+   return ret;
+}
+
+/* Compute which negated forms of \p v exist (reachable_mask) and which
+ * negated forms could produce \p v (reaching_mask) for the allowed
+ * interpretations.
+ */
+static void
+calculate_masks(nir_const_value v, enum interpreted_type type,
+                unsigned bit_size, uint8_t *reachable_mask,
+                uint8_t *reaching_mask)
+{
+   *reachable_mask = 0;
+   *reaching_mask = 0;
+
+   /* Calculate the extended reachable mask. */
+   if (type == float_only || type == either_type) {
+      if (negation_exists(v, bit_size, float_only))
+         *reachable_mask |= FLOAT_NEG_EXISTS;
+   }
+
+   if (type == integer_only || type == either_type) {
+      if (negation_exists(v, bit_size, integer_only))
+         *reachable_mask |= INT_NEG_EXISTS;
+   }
+
+   /* Calculate the extended reaching mask.  All of the "is this negation
+    * possible" was already determined for the reachable_mask, so reuse that
+    * data.
+    */
+   if (type == float_only || type == either_type) {
+      if (*reachable_mask & FLOAT_NEG_EXISTS)
+         *reaching_mask |= FLOAT_NEG_EXISTS;
+   }
+
+   if (type == integer_only || type == either_type) {
+      if (*reachable_mask & INT_NEG_EXISTS)
+         *reaching_mask |= INT_NEG_EXISTS;
+   }
+}
+
+/* Fill \p reachable_values with v itself plus whichever negations the mask
+ * says exist; unused slots are zeroed.
+ */
+static void
+calculate_reachable_values(nir_const_value v,
+                           unsigned bit_size,
+                           unsigned reachable_mask,
+                           nir_const_value *reachable_values)
+{
+   memset(reachable_values, 0, MAX_NUM_REACHABLE * sizeof(reachable_values[0]));
+
+   reachable_values[VALUE_INDEX] = v;
+
+   if (reachable_mask & INT_NEG_EXISTS) {
+      const nir_const_value neg = negate(v, bit_size, integer_only);
+
+      reachable_values[INT_NEG_INDEX] = neg;
+   }
+
+   if (reachable_mask & FLOAT_NEG_EXISTS) {
+      const nir_const_value neg = negate(v, bit_size, float_only);
+
+      reachable_values[FLOAT_NEG_INDEX] = neg;
+   }
+}
+
+/* Bit-exact equality at the given width. */
+static bool
+value_equal(nir_const_value a, nir_const_value b, unsigned bit_size)
+{
+   switch (bit_size) {
+   case 8:
+      return a.u8 == b.u8;
+   case 16:
+      return a.u16 == b.u16;
+   case 32:
+      return a.u32 == b.u32;
+   case 64:
+      return a.u64 == b.u64;
+   default:
+      unreachable("unsupported bit-size should have already been filtered.");
+   }
+}
+
+/** Can these values be the same with one level of negation? */
+static bool
+value_can_equal(const nir_const_value *from, uint8_t reachable_mask,
+                nir_const_value to, uint8_t reaching_mask,
+                unsigned bit_size)
+{
+   const uint8_t combined_mask = reachable_mask & reaching_mask;
+
+   return value_equal(from[VALUE_INDEX], to, bit_size) ||
+          ((combined_mask & INT_NEG_EXISTS) &&
+           value_equal(from[INT_NEG_INDEX], to, bit_size)) ||
+          ((combined_mask & FLOAT_NEG_EXISTS) &&
+           value_equal(from[FLOAT_NEG_INDEX], to, bit_size));
+}
+
+/* Compute per-candidate negation masks and link candidates that belong to the
+ * same instruction into circular next_src lists.
+ *
+ * NOTE(review): `num_candidates - 1` underflows (unsigned) when
+ * num_candidates == 0 — presumably callers guarantee at least one candidate;
+ * TODO confirm.
+ */
+static void
+preprocess_candidates(struct value *candidates, unsigned num_candidates)
+{
+   /* Calculate the reaching_mask and reachable_mask for each candidate. */
+   for (unsigned i = 0; i < num_candidates; i++) {
+      calculate_masks(candidates[i].value,
+                      candidates[i].type,
+                      candidates[i].bit_size,
+                      &candidates[i].reachable_mask,
+                      &candidates[i].reaching_mask);
+
+      /* If negations are not allowed, then only the original value is
+       * reaching.
+       */
+      if (candidates[i].no_negations)
+         candidates[i].reaching_mask = 0;
+   }
+
+   for (unsigned i = 0; i < num_candidates; i++)
+      candidates[i].next_src = NULL;
+
+   for (unsigned i = 0; i < num_candidates - 1; i++) {
+      if (candidates[i].next_src != NULL)
+         continue;
+
+      struct value *prev = &candidates[i];
+
+      for (unsigned j = i + 1; j < num_candidates; j++) {
+         if (candidates[i].instr_index == candidates[j].instr_index) {
+            prev->next_src = &candidates[j];
+            prev = prev->next_src;
+         }
+      }
+
+      /* Close the cycle. */
+      if (prev != &candidates[i])
+         prev->next_src = &candidates[i];
+   }
+}
+
+/* Is some value already in \p values that candidate \p c (or one of its
+ * permitted negations) could reuse?
+ */
+static bool
+reaching_value_exists(const struct value *c,
+                      const struct combine_constants_value *values,
+                      unsigned num_values)
+{
+   nir_const_value reachable_values[MAX_NUM_REACHABLE];
+
+   calculate_reachable_values(c->value, c->bit_size, c->reaching_mask,
+                              reachable_values);
+
+   /* Check to see if the value is already in the result set.
*/ + for (unsigned j = 0; j < num_values; j++) { + if (c->bit_size == values[j].bit_size && + value_can_equal(reachable_values, c->reaching_mask, + values[j].value, c->reaching_mask, + c->bit_size)) { + return true; + } + } + + return false; +} + +static combine_constants_result * +combine_constants_greedy(struct value *candidates, unsigned num_candidates) +{ + bool success; + combine_constants_result *result = + new combine_constants_result(num_candidates, success); + if (result == NULL || !success) { + delete result; + return NULL; + } + + BITSET_WORD *remain = + (BITSET_WORD *) calloc(BITSET_WORDS(num_candidates), sizeof(remain[0])); + + if (remain == NULL) { + delete result; + return NULL; + } + + memset(remain, 0xff, BITSET_WORDS(num_candidates) * sizeof(remain[0])); + + /* Operate in three passes. The first pass handles all values that must be + * emitted and for which a negation cannot exist. + */ + unsigned i; + for (i = 0; i < num_candidates; i++) { + if (candidates[i].allow_one_constant || + (candidates[i].reaching_mask & (FLOAT_NEG_EXISTS | INT_NEG_EXISTS))) { + continue; + } + + /* Check to see if the value is already in the result set. */ + bool found = false; + const unsigned num_values = result->num_values_to_emit; + for (unsigned j = 0; j < num_values; j++) { + if (candidates[i].bit_size == result->values_to_emit[j].bit_size && + value_equal(candidates[i].value, + result->values_to_emit[j].value, + candidates[i].bit_size)) { + found = true; + break; + } + } + + if (!found) + result->append_value(candidates[i].value, candidates[i].bit_size); + + BITSET_CLEAR(remain, i); + } + + /* The second pass handles all values that must be emitted and for which a + * negation can exist. 
+ */ + BITSET_FOREACH_SET(i, remain, num_candidates) { + if (candidates[i].allow_one_constant) + continue; + + assert(candidates[i].reaching_mask & (FLOAT_NEG_EXISTS | INT_NEG_EXISTS)); + + if (!reaching_value_exists(&candidates[i], result->values_to_emit, + result->num_values_to_emit)) { + result->append_value(absolute(candidates[i].value, + candidates[i].bit_size, + candidates[i].type), + candidates[i].bit_size); + } + + BITSET_CLEAR(remain, i); + } + + /* The third pass handles all of the values that may not have to be + * emitted. These are the values where allow_one_constant is set. + */ + BITSET_FOREACH_SET(i, remain, num_candidates) { + assert(candidates[i].allow_one_constant); + + /* The BITSET_FOREACH_SET macro does not detect changes to the bitset + * that occur within the current word. Since code in this loop may + * clear bits from the set, re-test here. + */ + if (!BITSET_TEST(remain, i)) + continue; + + assert(candidates[i].next_src != NULL); + + const struct value *const other_candidate = candidates[i].next_src; + const unsigned j = other_candidate - candidates; + + if (!reaching_value_exists(&candidates[i], result->values_to_emit, + result->num_values_to_emit)) { + /* Before emitting a value, see if a match for the other source of + * the instruction exists. + */ + if (!reaching_value_exists(&candidates[j], result->values_to_emit, + result->num_values_to_emit)) { + result->append_value(candidates[i].value, candidates[i].bit_size); + } + } + + /* Mark both sources as handled. */ + BITSET_CLEAR(remain, i); + BITSET_CLEAR(remain, j); + } + + /* As noted above, there will never be more values in the output than in + * the input. If there are fewer values, reduce the size of the + * allocation. 
+ */ + if (result->num_values_to_emit < num_candidates) { + result->values_to_emit = (struct combine_constants_value *) + realloc(result->values_to_emit, sizeof(result->values_to_emit[0]) * + result->num_values_to_emit); + + /* Is it even possible for a reducing realloc to fail? */ + assert(result->values_to_emit != NULL); + } + + /* Create the mapping from "combined" constants to list of candidates + * passed in by the caller. + */ + memset(remain, 0xff, BITSET_WORDS(num_candidates) * sizeof(remain[0])); + + unsigned total_users = 0; + + const unsigned num_values = result->num_values_to_emit; + for (unsigned value_idx = 0; value_idx < num_values; value_idx++) { + result->values_to_emit[value_idx].first_user = total_users; + + uint8_t reachable_mask; + uint8_t unused_mask; + + calculate_masks(result->values_to_emit[value_idx].value, either_type, + result->values_to_emit[value_idx].bit_size, + &reachable_mask, &unused_mask); + + nir_const_value reachable_values[MAX_NUM_REACHABLE]; + + calculate_reachable_values(result->values_to_emit[value_idx].value, + result->values_to_emit[value_idx].bit_size, + reachable_mask, reachable_values); + + for (unsigned i = 0; i < num_candidates; i++) { + bool matched = false; + + if (!BITSET_TEST(remain, i)) + continue; + + if (candidates[i].bit_size != result->values_to_emit[value_idx].bit_size) + continue; + + if (value_equal(candidates[i].value, result->values_to_emit[value_idx].value, + result->values_to_emit[value_idx].bit_size)) { + result->user_map[total_users].index = i; + result->user_map[total_users].type = candidates[i].type; + result->user_map[total_users].negate = false; + total_users++; + + matched = true; + BITSET_CLEAR(remain, i); + } else { + const uint8_t combined_mask = reachable_mask & + candidates[i].reaching_mask; + + enum interpreted_type type = either_type; + + if ((combined_mask & INT_NEG_EXISTS) && + value_equal(candidates[i].value, + reachable_values[INT_NEG_INDEX], + candidates[i].bit_size)) { + type = 
integer_only; + } + + if (type == either_type && + (combined_mask & FLOAT_NEG_EXISTS) && + value_equal(candidates[i].value, + reachable_values[FLOAT_NEG_INDEX], + candidates[i].bit_size)) { + type = float_only; + } + + if (type != either_type) { + /* Finding a match on this path implies that the user must + * allow source negations. + */ + assert(!candidates[i].no_negations); + + result->user_map[total_users].index = i; + result->user_map[total_users].type = type; + result->user_map[total_users].negate = true; + total_users++; + + matched = true; + BITSET_CLEAR(remain, i); + } + } + + /* Mark the other source of instructions that can have a constant + * source. Selection is the prime example of this, and we want to + * avoid generating sequences like bcsel(a, fneg(b), ineg(c)). + * + * This also makes sure that the assertion (below) that *all* values + * were processed holds even when some values may be allowed to + * remain as constants. + * + * FINISHME: There may be value in only doing this when type == + * either_type. If both sources are loaded, a register allocator may + * be able to make a better choice about which value to "spill" + * (i.e., replace with an immediate) under heavy register pressure. + */ + if (matched && candidates[i].allow_one_constant) { + const struct value *const other_src = candidates[i].next_src; + const unsigned idx = other_src - candidates; + + assert(idx < num_candidates); + BITSET_CLEAR(remain, idx); + } + } + + assert(total_users > result->values_to_emit[value_idx].first_user); + result->values_to_emit[value_idx].num_users = + total_users - result->values_to_emit[value_idx].first_user; + } + + /* Verify that all of the values were emitted by the loop above. If any + * bits are still set in remain, then some value was not emitted. The use + * of memset to populate remain prevents the use of a more performant loop. 
+ */ +#ifndef NDEBUG + bool pass = true; + + BITSET_FOREACH_SET(i, remain, num_candidates) { + fprintf(stderr, "candidate %d was not processed: { " + ".b = %s, " + ".f32 = %f, .f64 = %g, " + ".i8 = %d, .u8 = 0x%02x, " + ".i16 = %d, .u16 = 0x%04x, " + ".i32 = %d, .u32 = 0x%08x, " + ".i64 = %" PRId64 ", .u64 = 0x%016" PRIx64 " }\n", + i, + candidates[i].value.b ? "true" : "false", + candidates[i].value.f32, candidates[i].value.f64, + candidates[i].value.i8, candidates[i].value.u8, + candidates[i].value.i16, candidates[i].value.u16, + candidates[i].value.i32, candidates[i].value.u32, + candidates[i].value.i64, candidates[i].value.u64); + pass = false; + } + + assert(pass && "All values should have been processed."); +#endif + + free(remain); + + return result; +} + +static combine_constants_result * +brw_combine_constants(struct value *candidates, unsigned num_candidates) +{ + preprocess_candidates(candidates, num_candidates); + + return combine_constants_greedy(candidates, num_candidates); +} + +/* Returns whether an instruction could co-issue if its immediate source were + * replaced with a GRF source. + */ +static bool +could_coissue(const struct intel_device_info *devinfo, const fs_inst *inst) +{ + assert(inst->opcode == BRW_OPCODE_MOV || + inst->opcode == BRW_OPCODE_CMP || + inst->opcode == BRW_OPCODE_ADD || + inst->opcode == BRW_OPCODE_MUL); + + if (devinfo->ver != 7) + return false; + + /* Only float instructions can coissue. We don't have a great + * understanding of whether or not something like float(int(a) + int(b)) + * would be considered float (based on the destination type) or integer + * (based on the source types), so we take the conservative choice of + * only promoting when both destination and source are float. 
+ */ + return inst->dst.type == BRW_REGISTER_TYPE_F && + inst->src[0].type == BRW_REGISTER_TYPE_F; +} + +/** + * Box for storing fs_inst and some other necessary data + * + * \sa box_instruction + */ +struct fs_inst_box { + fs_inst *inst; + unsigned ip; + bblock_t *block; + bool must_promote; +}; + +/** A box for putting fs_regs in a linked list. */ +struct reg_link { + DECLARE_RALLOC_CXX_OPERATORS(reg_link) + + reg_link(fs_inst *inst, unsigned src, bool negate, enum interpreted_type type) + : inst(inst), src(src), negate(negate), type(type) {} + + struct exec_node link; + fs_inst *inst; + uint8_t src; + bool negate; + enum interpreted_type type; +}; + +static struct exec_node * +link(void *mem_ctx, fs_inst *inst, unsigned src, bool negate, + enum interpreted_type type) +{ + reg_link *l = new(mem_ctx) reg_link(inst, src, negate, type); + return &l->link; +} + +/** + * Information about an immediate value. + */ +struct imm { + /** The common ancestor of all blocks using this immediate value. */ + bblock_t *block; + + /** + * The instruction generating the immediate value, if all uses are contained + * within a single basic block. Otherwise, NULL. + */ + fs_inst *inst; + + /** + * A list of fs_regs that refer to this immediate. If we promote it, we'll + * have to patch these up to refer to the new GRF. + */ + exec_list *uses; + + /** The immediate value */ + union { + char bytes[8]; + double df; + int64_t d64; + float f; + int32_t d; + int16_t w; + }; + uint8_t size; + + /** When promoting half-float we need to account for certain restrictions */ + bool is_half_float; + + /** + * The GRF register and subregister number where we've decided to store the + * constant value. + */ + uint8_t subreg_offset; + uint16_t nr; + + /** The number of coissuable instructions using this immediate. */ + uint16_t uses_by_coissue; + + /** + * Whether this constant is used by an instruction that can't handle an + * immediate source (and already has to be promoted to a GRF). 
+ */ + bool must_promote; + + /** Is the value used only in a single basic block? */ + bool used_in_single_block; + + uint16_t first_use_ip; + uint16_t last_use_ip; +}; + +/** The working set of information about immediates. */ +struct table { + struct value *values; + int size; + int num_values; + + struct imm *imm; + int len; + + struct fs_inst_box *boxes; + unsigned num_boxes; + unsigned size_boxes; +}; + +static struct value * +new_value(struct table *table, void *mem_ctx) +{ + if (table->num_values == table->size) { + table->size *= 2; + table->values = reralloc(mem_ctx, table->values, struct value, table->size); + } + return &table->values[table->num_values++]; +} + +/** + * Store an instruction with some other data in a table. + * + * \returns the index into the dynamic array of boxes for the instruction. + */ +static unsigned +box_instruction(struct table *table, void *mem_ctx, fs_inst *inst, + unsigned ip, bblock_t *block, bool must_promote) +{ + /* It is common for box_instruction to be called consecutively for each + * source of an instruction. As a result, the most common case for finding + * an instruction in the table is when that instruction was the last one + * added. Search the list back to front. + */ + for (unsigned i = table->num_boxes; i > 0; /* empty */) { + i--; + + if (table->boxes[i].inst == inst) + return i; + } + + if (table->num_boxes == table->size_boxes) { + table->size_boxes *= 2; + table->boxes = reralloc(mem_ctx, table->boxes, fs_inst_box, + table->size_boxes); + } + + assert(table->num_boxes < table->size_boxes); + + const unsigned idx = table->num_boxes++; + fs_inst_box *ib = &table->boxes[idx]; + + ib->inst = inst; + ib->block = block; + ib->ip = ip; + ib->must_promote = must_promote; + + return idx; +} + +/** + * Comparator used for sorting an array of imm structures. + * + * We sort by basic block number, then last use IP, then first use IP (least + * to greatest). 
This sorting causes immediates live in the same area to be + * allocated to the same register in the hopes that all values will be dead + * about the same time and the register can be reused. + */ +static int +compare(const void *_a, const void *_b) +{ + const struct imm *a = (const struct imm *)_a, + *b = (const struct imm *)_b; + + int block_diff = a->block->num - b->block->num; + if (block_diff) + return block_diff; + + int end_diff = a->last_use_ip - b->last_use_ip; + if (end_diff) + return end_diff; + + return a->first_use_ip - b->first_use_ip; +} + +static struct brw_reg +build_imm_reg_for_copy(struct imm *imm) +{ + switch (imm->size) { + case 8: + return brw_imm_d(imm->d64); + case 4: + return brw_imm_d(imm->d); + case 2: + return brw_imm_w(imm->w); + default: + unreachable("not implemented"); + } +} + +static inline uint32_t +get_alignment_for_imm(const struct imm *imm) +{ + if (imm->is_half_float) + return 4; /* At least MAD seems to require this */ + else + return imm->size; +} + +static bool +representable_as_hf(float f, uint16_t *hf) +{ + union fi u; + uint16_t h = _mesa_float_to_half(f); + u.f = _mesa_half_to_float(h); + + if (u.f == f) { + *hf = h; + return true; + } + + return false; +} + +static bool +representable_as_w(int d, int16_t *w) +{ + int res = ((d & 0xffff8000) + 0x8000) & 0xffff7fff; + if (!res) { + *w = d; + return true; + } + + return false; +} + +static bool +representable_as_uw(unsigned ud, uint16_t *uw) +{ + if (!(ud & 0xffff0000)) { + *uw = ud; + return true; + } + + return false; +} + +static bool +supports_src_as_imm(const struct intel_device_info *devinfo, const fs_inst *inst) +{ + if (devinfo->ver < 12) + return false; + + switch (inst->opcode) { + case BRW_OPCODE_ADD3: + /* ADD3 only exists on Gfx12.5+. */ + return true; + + case BRW_OPCODE_MAD: + /* Integer types can always mix sizes. Floating point types can mix + * sizes on Gfx12. On Gfx12.5, floating point sources must all be HF or + * all be F. 
+ */ + return devinfo->verx10 < 125 || inst->src[0].type != BRW_REGISTER_TYPE_F; + + default: + return false; + } +} + +static bool +can_promote_src_as_imm(const struct intel_device_info *devinfo, fs_inst *inst, + unsigned src_idx) +{ + bool can_promote = false; + + /* Experiment shows that we can only support src0 as immediate for MAD on + * Gfx12. ADD3 can use src0 or src2 in Gfx12.5, but constant propagation + * only propagates into src0. It's possible that src2 works for W or UW MAD + * on Gfx12.5. + */ + if (src_idx != 0) + return false; + + if (!supports_src_as_imm(devinfo, inst)) + return false; + + /* TODO - Fix the codepath below to use a bfloat16 immediate on XeHP, + * since HF/F mixed mode has been removed from the hardware. + */ + switch (inst->src[src_idx].type) { + case BRW_REGISTER_TYPE_F: { + uint16_t hf; + if (representable_as_hf(inst->src[src_idx].f, &hf)) { + inst->src[src_idx] = retype(brw_imm_uw(hf), BRW_REGISTER_TYPE_HF); + can_promote = true; + } + break; + } + case BRW_REGISTER_TYPE_D: { + int16_t w; + if (representable_as_w(inst->src[src_idx].d, &w)) { + inst->src[src_idx] = brw_imm_w(w); + can_promote = true; + } + break; + } + case BRW_REGISTER_TYPE_UD: { + uint16_t uw; + if (representable_as_uw(inst->src[src_idx].ud, &uw)) { + inst->src[src_idx] = brw_imm_uw(uw); + can_promote = true; + } + break; + } + case BRW_REGISTER_TYPE_W: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_HF: + can_promote = true; + break; + default: + break; + } + + return can_promote; +} + +static void +add_candidate_immediate(struct table *table, fs_inst *inst, unsigned ip, + unsigned i, + bool must_promote, + bool allow_one_constant, + bblock_t *block, + const struct intel_device_info *devinfo, + void *const_ctx) +{ + struct value *v = new_value(table, const_ctx); + + unsigned box_idx = box_instruction(table, const_ctx, inst, ip, block, + must_promote); + + v->value.u64 = inst->src[i].d64; + v->bit_size = 8 * type_sz(inst->src[i].type); + v->instr_index = 
box_idx; + v->src = i; + v->allow_one_constant = allow_one_constant; + + /* Right-shift instructions are special. They can have source modifiers, + * but changing the type can change the semantic of the instruction. Only + * allow negations on a right shift if the source type is already signed. + */ + v->no_negations = !inst->can_do_source_mods(devinfo) || + ((inst->opcode == BRW_OPCODE_SHR || + inst->opcode == BRW_OPCODE_ASR) && + brw_reg_type_is_unsigned_integer(inst->src[i].type)); + + switch (inst->src[i].type) { + case BRW_REGISTER_TYPE_DF: + case BRW_REGISTER_TYPE_NF: + case BRW_REGISTER_TYPE_F: + case BRW_REGISTER_TYPE_HF: + v->type = float_only; + break; + + case BRW_REGISTER_TYPE_UQ: + case BRW_REGISTER_TYPE_Q: + case BRW_REGISTER_TYPE_UD: + case BRW_REGISTER_TYPE_D: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_W: + v->type = integer_only; + break; + + case BRW_REGISTER_TYPE_VF: + case BRW_REGISTER_TYPE_UV: + case BRW_REGISTER_TYPE_V: + case BRW_REGISTER_TYPE_UB: + case BRW_REGISTER_TYPE_B: + default: + unreachable("not reached"); + } + + /* It is safe to change the type of the operands of a select instruction + * that has no conditional modifier, no source modifiers, and no saturate + * modifer. + */ + if (inst->opcode == BRW_OPCODE_SEL && + inst->conditional_mod == BRW_CONDITIONAL_NONE && + !inst->src[0].negate && !inst->src[0].abs && + !inst->src[1].negate && !inst->src[1].abs && + !inst->saturate) { + v->type = either_type; + } +} + +struct register_allocation { + /** VGRF for storing values. */ + unsigned nr; + + /** + * Mask of currently available slots in this register. + * + * Each register is 16, 16-bit slots. Allocations require 1, 2, or 4 slots + * for word, double-word, or quad-word values, respectively. 
+ */ + uint16_t avail; +}; + +static fs_reg +allocate_slots(struct register_allocation *regs, unsigned num_regs, + unsigned bytes, unsigned align_bytes, + brw::simple_allocator &alloc) +{ + assert(bytes == 2 || bytes == 4 || bytes == 8); + assert(align_bytes == 2 || align_bytes == 4 || align_bytes == 8); + + const unsigned words = bytes / 2; + const unsigned align_words = align_bytes / 2; + const uint16_t mask = (1U << words) - 1; + + for (unsigned i = 0; i < num_regs; i++) { + for (unsigned j = 0; j <= (16 - words); j += align_words) { + const uint16_t x = regs[i].avail >> j; + + if ((x & mask) == mask) { + if (regs[i].nr == UINT_MAX) + regs[i].nr = alloc.allocate(1); + + regs[i].avail &= ~(mask << j); + + fs_reg reg(VGRF, regs[i].nr); + reg.offset = j * 2; + + return reg; + } + } + } + + unreachable("No free slots found."); +} + +static void +deallocate_slots(struct register_allocation *regs, unsigned num_regs, + unsigned reg_nr, unsigned subreg_offset, unsigned bytes) +{ + assert(bytes == 2 || bytes == 4 || bytes == 8); + assert(subreg_offset % 2 == 0); + assert(subreg_offset + bytes <= 32); + + const unsigned words = bytes / 2; + const unsigned offset = subreg_offset / 2; + const uint16_t mask = ((1U << words) - 1) << offset; + + for (unsigned i = 0; i < num_regs; i++) { + if (regs[i].nr == reg_nr) { + regs[i].avail |= mask; + return; + } + } + + unreachable("No such register found."); +} + +static void +parcel_out_registers(struct imm *imm, unsigned len, const bblock_t *cur_block, + struct register_allocation *regs, unsigned num_regs, + brw::simple_allocator &alloc, unsigned ver) +{ + /* Each basic block has two distinct set of constants. There is the set of + * constants that only have uses in that block, and there is the set of + * constants that have uses after that block. + * + * Allocation proceeds in three passes. + * + * 1. Allocate space for the values that are used outside this block. + * + * 2. 
Allocate space for the values that are used only in this block. + * + * 3. Deallocate the space for the values that are used only in this block. + */ + + for (unsigned pass = 0; pass < 2; pass++) { + const bool used_in_single_block = pass != 0; + + for (unsigned i = 0; i < len; i++) { + if (imm[i].block == cur_block && + imm[i].used_in_single_block == used_in_single_block) { + /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions: + * + * "In Align16 mode, the channel selects and channel enables apply + * to a pair of half-floats, because these parameters are defined + * for DWord elements ONLY. This is applicable when both source + * and destination are half-floats." + * + * This means that Align16 instructions that use promoted HF + * immediates and use a <0,1,0>:HF region would read 2 HF slots + * instead of replicating the single one we want. To avoid this, we + * always populate both HF slots within a DWord with the constant. + */ + const unsigned width = ver == 8 && imm[i].is_half_float ? 2 : 1; + + const fs_reg reg = allocate_slots(regs, num_regs, + imm[i].size * width, + get_alignment_for_imm(&imm[i]), + alloc); + + imm[i].nr = reg.nr; + imm[i].subreg_offset = reg.offset; + } + } + } + + for (unsigned i = 0; i < len; i++) { + if (imm[i].block == cur_block && imm[i].used_in_single_block) { + const unsigned width = ver == 8 && imm[i].is_half_float ? 2 : 1; + + deallocate_slots(regs, num_regs, imm[i].nr, imm[i].subreg_offset, + imm[i].size * width); + } + } +} + +bool +fs_visitor::opt_combine_constants() +{ + void *const_ctx = ralloc_context(NULL); + + struct table table; + + /* For each of the dynamic arrays in the table, allocate about a page of + * memory. On LP64 systems, this gives 126 value objects 169 fs_inst_box + * objects. Even larger shaders that have been obverved rarely need more + * than 20 or 30 values. Most smaller shaders, which is most shaders, need + * at most a couple dozen fs_inst_box. 
+ */ + table.size = (4096 - (5 * sizeof(void *))) / sizeof(struct value); + table.num_values = 0; + table.values = ralloc_array(const_ctx, struct value, table.size); + + table.size_boxes = (4096 - (5 * sizeof(void *))) / sizeof(struct fs_inst_box); + table.num_boxes = 0; + table.boxes = ralloc_array(const_ctx, fs_inst_box, table.size_boxes); + + const brw::idom_tree &idom = idom_analysis.require(); + unsigned ip = -1; + + /* Make a pass through all instructions and count the number of times each + * constant is used by coissueable instructions or instructions that cannot + * take immediate arguments. + */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + ip++; + + switch (inst->opcode) { + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + case SHADER_OPCODE_POW: + if (inst->src[0].file == IMM) { + assert(inst->opcode != SHADER_OPCODE_POW); + + add_candidate_immediate(&table, inst, ip, 0, true, false, block, + devinfo, const_ctx); + } + + if (inst->src[1].file == IMM && devinfo->ver < 8) { + add_candidate_immediate(&table, inst, ip, 1, true, false, block, + devinfo, const_ctx); + } + + break; + + case BRW_OPCODE_ADD3: + case BRW_OPCODE_MAD: { + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file != IMM) + continue; + + if (can_promote_src_as_imm(devinfo, inst, i)) + continue; + + add_candidate_immediate(&table, inst, ip, i, true, false, block, + devinfo, const_ctx); + } + + break; + } + + case BRW_OPCODE_BFE: + case BRW_OPCODE_BFI2: + case BRW_OPCODE_LRP: + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file != IMM) + continue; + + add_candidate_immediate(&table, inst, ip, i, true, false, block, + devinfo, const_ctx); + } + + break; + + case BRW_OPCODE_SEL: + if (inst->src[0].file == IMM) { + /* It is possible to have src0 be immediate but src1 not be + * immediate for the non-commutative conditional modifiers (e.g., + * G). 
+ */ + if (inst->conditional_mod == BRW_CONDITIONAL_NONE || + /* Only GE and L are commutative. */ + inst->conditional_mod == BRW_CONDITIONAL_GE || + inst->conditional_mod == BRW_CONDITIONAL_L) { + assert(inst->src[1].file == IMM); + + add_candidate_immediate(&table, inst, ip, 0, true, true, block, + devinfo, const_ctx); + add_candidate_immediate(&table, inst, ip, 1, true, true, block, + devinfo, const_ctx); + } else { + add_candidate_immediate(&table, inst, ip, 0, true, false, block, + devinfo, const_ctx); + } + } + break; + + case BRW_OPCODE_ASR: + case BRW_OPCODE_BFI1: + case BRW_OPCODE_ROL: + case BRW_OPCODE_ROR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_SHR: + if (inst->src[0].file == IMM) { + add_candidate_immediate(&table, inst, ip, 0, true, false, block, + devinfo, const_ctx); + } + break; + + case BRW_OPCODE_MOV: + if (could_coissue(devinfo, inst) && inst->src[0].file == IMM) { + add_candidate_immediate(&table, inst, ip, 0, false, false, block, + devinfo, const_ctx); + } + break; + + case BRW_OPCODE_CMP: + case BRW_OPCODE_ADD: + case BRW_OPCODE_MUL: + assert(inst->src[0].file != IMM); + + if (could_coissue(devinfo, inst) && inst->src[1].file == IMM) { + add_candidate_immediate(&table, inst, ip, 1, false, false, block, + devinfo, const_ctx); + } + break; + + default: + break; + } + } + + if (table.num_values == 0) { + ralloc_free(const_ctx); + return false; + } + + combine_constants_result *result = + brw_combine_constants(table.values, table.num_values); + + table.imm = ralloc_array(const_ctx, struct imm, result->num_values_to_emit); + table.len = 0; + + for (unsigned i = 0; i < result->num_values_to_emit; i++) { + struct imm *imm = &table.imm[table.len]; + + imm->block = NULL; + imm->inst = NULL; + imm->d64 = result->values_to_emit[i].value.u64; + imm->size = result->values_to_emit[i].bit_size / 8; + + imm->uses_by_coissue = 0; + imm->must_promote = false; + imm->is_half_float = false; + + imm->first_use_ip = UINT16_MAX; + imm->last_use_ip = 0; + + 
imm->uses = new(const_ctx) exec_list; + + const unsigned first_user = result->values_to_emit[i].first_user; + const unsigned last_user = first_user + + result->values_to_emit[i].num_users; + + for (unsigned j = first_user; j < last_user; j++) { + const unsigned idx = table.values[result->user_map[j].index].instr_index; + fs_inst_box *const ib = &table.boxes[idx]; + + const unsigned src = table.values[result->user_map[j].index].src; + + imm->uses->push_tail(link(const_ctx, ib->inst, src, + result->user_map[j].negate, + result->user_map[j].type)); + + if (ib->must_promote) + imm->must_promote = true; + else + imm->uses_by_coissue++; + + if (imm->block == NULL) { + /* Block should only be NULL on the first pass. On the first + * pass, inst should also be NULL. + */ + assert(imm->inst == NULL); + + imm->inst = ib->inst; + imm->block = ib->block; + imm->first_use_ip = ib->ip; + imm->last_use_ip = ib->ip; + imm->used_in_single_block = true; + } else { + bblock_t *intersection = idom.intersect(ib->block, + imm->block); + + if (ib->block != imm->block) + imm->used_in_single_block = false; + + if (imm->first_use_ip > ib->ip) { + imm->first_use_ip = ib->ip; + + /* If the first-use instruction is to be tracked, block must be + * the block that contains it. The old block was read in the + * idom.intersect call above, so it is safe to overwrite it + * here. + */ + imm->inst = ib->inst; + imm->block = ib->block; + } + + if (imm->last_use_ip < ib->ip) + imm->last_use_ip = ib->ip; + + /* The common dominator is not the block that contains the + * first-use instruction, so don't track that instruction. The + * load instruction will be added in the common dominator block + * instead of before the first-use instruction. 
+ */ + if (intersection != imm->block) + imm->inst = NULL; + + imm->block = intersection; + } + + if (ib->inst->src[src].type == BRW_REGISTER_TYPE_HF) + imm->is_half_float = true; + } + + /* Remove constants from the table that don't have enough uses to make + * them profitable to store in a register. + */ + if (imm->must_promote || imm->uses_by_coissue >= 4) + table.len++; + } + + delete result; + + if (table.len == 0) { + ralloc_free(const_ctx); + return false; + } + if (cfg->num_blocks != 1) + qsort(table.imm, table.len, sizeof(struct imm), compare); + + if (devinfo->ver > 7) { + struct register_allocation *regs = + (struct register_allocation *) calloc(table.len, sizeof(regs[0])); + + for (int i = 0; i < table.len; i++) { + regs[i].nr = UINT_MAX; + regs[i].avail = 0xffff; + } + + foreach_block(block, cfg) { + parcel_out_registers(table.imm, table.len, block, regs, table.len, + alloc, devinfo->ver); + } + + free(regs); + } else { + fs_reg reg(VGRF, alloc.allocate(1)); + reg.stride = 0; + + for (int i = 0; i < table.len; i++) { + struct imm *imm = &table.imm[i]; + + /* Put the immediate in an offset aligned to its size. Some + * instructions seem to have additional alignment requirements, so + * account for that too. + */ + reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm)); + + /* Ensure we have enough space in the register to copy the immediate */ + if (reg.offset + imm->size > REG_SIZE) { + reg.nr = alloc.allocate(1); + reg.offset = 0; + } + + imm->nr = reg.nr; + imm->subreg_offset = reg.offset; + + reg.offset += imm->size; + } + } + + bool rebuild_cfg = false; + + /* Insert MOVs to load the constant values into GRFs. */ + for (int i = 0; i < table.len; i++) { + struct imm *imm = &table.imm[i]; + + /* Insert it either before the instruction that generated the immediate + * or after the last non-control flow instruction of the common ancestor. 
+ */ + exec_node *n; + bblock_t *insert_block; + if (imm->inst != nullptr) { + n = imm->inst; + insert_block = imm->block; + } else { + if (imm->block->start()->opcode == BRW_OPCODE_DO) { + /* DO blocks are weird. They can contain only the single DO + * instruction. As a result, MOV instructions cannot be added to + * the DO block. + */ + bblock_t *next_block = imm->block->next(); + if (next_block->starts_with_control_flow()) { + /* This is the difficult case. This occurs for code like + * + * do { + * do { + * ... + * } while (...); + * } while (...); + * + * when the MOV instructions need to be inserted between the + * two DO instructions. + * + * To properly handle this scenario, a new block would need to + * be inserted. Doing so would require modifying arbitrary many + * CONTINUE, BREAK, and WHILE instructions to point to the new + * block. + * + * It is unlikely that this would ever be correct. Instead, + * insert the MOV instructions in the known wrong place and + * rebuild the CFG at the end of the pass. + */ + insert_block = imm->block; + n = insert_block->last_non_control_flow_inst()->next; + + rebuild_cfg = true; + } else { + insert_block = next_block; + n = insert_block->start(); + } + } else { + insert_block = imm->block; + n = insert_block->last_non_control_flow_inst()->next; + } + } + + /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions: + * + * "In Align16 mode, the channel selects and channel enables apply to a + * pair of half-floats, because these parameters are defined for DWord + * elements ONLY. This is applicable when both source and destination + * are half-floats." + * + * This means that Align16 instructions that use promoted HF immediates + * and use a <0,1,0>:HF region would read 2 HF slots instead of + * replicating the single one we want. To avoid this, we always populate + * both HF slots within a DWord with the constant. + */ + const uint32_t width = devinfo->ver == 8 && imm->is_half_float ? 
2 : 1; + const fs_builder ibld = fs_builder(this, width).at(insert_block, n).exec_all(); + + fs_reg reg(VGRF, imm->nr); + reg.offset = imm->subreg_offset; + reg.stride = 0; + + /* Put the immediate in an offset aligned to its size. Some instructions + * seem to have additional alignment requirements, so account for that + * too. + */ + assert(reg.offset == ALIGN(reg.offset, get_alignment_for_imm(imm))); + + struct brw_reg imm_reg = build_imm_reg_for_copy(imm); + + /* Ensure we have enough space in the register to copy the immediate */ + assert(reg.offset + type_sz(imm_reg.type) * width <= REG_SIZE); + + ibld.MOV(retype(reg, imm_reg.type), imm_reg); + } + shader_stats.promoted_constants = table.len; + + /* Rewrite the immediate sources to refer to the new GRFs. */ + for (int i = 0; i < table.len; i++) { + foreach_list_typed(reg_link, link, link, table.imm[i].uses) { + fs_reg *reg = &link->inst->src[link->src]; + + if (link->inst->opcode == BRW_OPCODE_SEL) { + if (link->type == either_type) { + /* Do not change the register type. 
*/ + } else if (link->type == integer_only) { + reg->type = brw_int_type(type_sz(reg->type), true); + } else { + assert(link->type == float_only); + + switch (type_sz(reg->type)) { + case 2: + reg->type = BRW_REGISTER_TYPE_HF; + break; + case 4: + reg->type = BRW_REGISTER_TYPE_F; + break; + case 8: + reg->type = BRW_REGISTER_TYPE_DF; + break; + default: + unreachable("Bad type size"); + } + } + } else if ((link->inst->opcode == BRW_OPCODE_SHL || + link->inst->opcode == BRW_OPCODE_ASR) && + link->negate) { + reg->type = brw_int_type(type_sz(reg->type), true); + } + +#ifdef DEBUG + switch (reg->type) { + case BRW_REGISTER_TYPE_DF: + assert((isnan(reg->df) && isnan(table.imm[i].df)) || + (fabs(reg->df) == fabs(table.imm[i].df))); + break; + case BRW_REGISTER_TYPE_F: + assert((isnan(reg->f) && isnan(table.imm[i].f)) || + (fabsf(reg->f) == fabsf(table.imm[i].f))); + break; + case BRW_REGISTER_TYPE_HF: + assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) && + isnan(_mesa_half_to_float(table.imm[i].w))) || + (fabsf(_mesa_half_to_float(reg->d & 0xffffu)) == + fabsf(_mesa_half_to_float(table.imm[i].w)))); + break; + case BRW_REGISTER_TYPE_Q: + assert(abs(reg->d64) == abs(table.imm[i].d64)); + break; + case BRW_REGISTER_TYPE_UQ: + assert(!link->negate); + assert(reg->d64 == table.imm[i].d64); + break; + case BRW_REGISTER_TYPE_D: + assert(abs(reg->d) == abs(table.imm[i].d)); + break; + case BRW_REGISTER_TYPE_UD: + assert(!link->negate); + assert(reg->d == table.imm[i].d); + break; + case BRW_REGISTER_TYPE_W: + assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w); + break; + case BRW_REGISTER_TYPE_UW: + assert(!link->negate); + assert((reg->ud & 0xffffu) == (uint16_t) table.imm[i].w); + break; + default: + break; + } +#endif + + assert(link->inst->can_do_source_mods(devinfo) || !link->negate); + + reg->file = VGRF; + reg->offset = table.imm[i].subreg_offset; + reg->stride = 0; + reg->negate = link->negate; + reg->nr = table.imm[i].nr; + } + } + + /* Fixup any SEL 
instructions that have src0 still as an immediate. Fixup + * the types of any SEL instruction that have a negation on one of the + * sources. Adding the negation may have changed the type of that source, + * so the other source (and destination) must be changed to match. + */ + for (unsigned i = 0; i < table.num_boxes; i++) { + fs_inst *inst = table.boxes[i].inst; + + if (inst->opcode != BRW_OPCODE_SEL) + continue; + + /* If both sources have negation, the types had better be the same! */ + assert(!inst->src[0].negate || !inst->src[1].negate || + inst->src[0].type == inst->src[1].type); + + /* If either source has a negation, force the type of the other source + * and the type of the result to be the same. + */ + if (inst->src[0].negate) { + inst->src[1].type = inst->src[0].type; + inst->dst.type = inst->src[0].type; + } + + if (inst->src[1].negate) { + inst->src[0].type = inst->src[1].type; + inst->dst.type = inst->src[1].type; + } + + if (inst->src[0].file != IMM) + continue; + + assert(inst->src[1].file != IMM); + assert(inst->conditional_mod == BRW_CONDITIONAL_NONE || + inst->conditional_mod == BRW_CONDITIONAL_GE || + inst->conditional_mod == BRW_CONDITIONAL_L); + + fs_reg temp = inst->src[0]; + inst->src[0] = inst->src[1]; + inst->src[1] = temp; + + /* If this was predicated, flipping operands means we also need to flip + * the predicate. 
+ */ + if (inst->conditional_mod == BRW_CONDITIONAL_NONE) + inst->predicate_inverse = !inst->predicate_inverse; + } + + if (debug) { + for (int i = 0; i < table.len; i++) { + struct imm *imm = &table.imm[i]; + + fprintf(stderr, + "0x%016" PRIx64 " - block %3d, reg %3d sub %2d, " + "Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n", + (uint64_t)(imm->d & BITFIELD64_MASK(imm->size * 8)), + imm->block->num, + imm->nr, + imm->subreg_offset, + imm->must_promote, + imm->uses_by_coissue, + imm->first_use_ip, + imm->last_use_ip, + imm->last_use_ip - imm->first_use_ip); + } + } + + if (rebuild_cfg) { + /* When the CFG is initially built, the instructions are removed from + * the list of instructions stored in fs_visitor -- the same exec_node + * is used for membership in that list and in a block list. So we need + * to pull them back before rebuilding the CFG. + */ + assert(exec_list_length(&instructions) == 0); + foreach_block(block, cfg) { + exec_list_append(&instructions, &block->instructions); + } + + delete cfg; + cfg = NULL; + calculate_cfg(); + } + + ralloc_free(const_ctx); + + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES | + (rebuild_cfg ? 
DEPENDENCY_BLOCKS : DEPENDENCY_NOTHING)); + + return true; +} diff --git a/src/intel/compiler/elk/brw_fs_copy_propagation.cpp b/src/intel/compiler/elk/brw_fs_copy_propagation.cpp new file mode 100644 index 00000000000..62c16be4e64 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_copy_propagation.cpp @@ -0,0 +1,1468 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_copy_propagation.cpp + * + * Support for global copy propagation in two passes: A local pass that does + * intra-block copy (and constant) propagation, and a global pass that uses + * dataflow analysis on the copies available at the end of each block to re-do + * local copy propagation with more copies available. + * + * See Muchnick's Advanced Compiler Design and Implementation, section + * 12.5 (p356). 
 */

#include "util/bitset.h"
#include "util/u_math.h"
#include "util/rb_tree.h"
#include "brw_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_cfg.h"
#include "brw_eu.h"

using namespace brw;

namespace { /* avoid conflict with opt_copy_propagation_elements */
/**
 * One available-copy record: \c dst was most recently written as a copy of
 * \c src by an instruction with the given \c opcode.
 *
 * Each entry is linked into two red-black trees simultaneously (one keyed
 * by dst.nr, one by src.nr) through the embedded rb_node members, so the
 * same allocation can be found from either direction.
 */
struct acp_entry {
   struct rb_node by_dst;
   struct rb_node by_src;
   fs_reg dst;
   fs_reg src;
   /* Index of this entry in fs_copy_prop_dataflow::acp; assigned during
    * global dataflow setup and used to address the per-block bitsets.
    */
   unsigned global_idx;
   unsigned size_written;
   unsigned size_read;
   enum opcode opcode;
   bool is_partial_write;
   bool force_writemask_all;
};

/**
 * Compare two acp_entry::dst.nr
 *
 * This is intended to be used as the comparison function for rb_tree.
 */
static int
cmp_entry_dst_entry_dst(const struct rb_node *a_node, const struct rb_node *b_node)
{
   const struct acp_entry *a_entry =
      rb_node_data(struct acp_entry, a_node, by_dst);

   const struct acp_entry *b_entry =
      rb_node_data(struct acp_entry, b_node, by_dst);

   return a_entry->dst.nr - b_entry->dst.nr;
}

/**
 * Compare an acp_entry::dst.nr with a raw register number key.
 *
 * This is intended to be used as the search function for rb_tree.
 */
static int
cmp_entry_dst_nr(const struct rb_node *a_node, const void *b_key)
{
   const struct acp_entry *a_entry =
      rb_node_data(struct acp_entry, a_node, by_dst);

   return a_entry->dst.nr - (uintptr_t) b_key;
}

/**
 * Compare two acp_entry::src.nr
 *
 * This is intended to be used as the comparison function for rb_tree.
 */
static int
cmp_entry_src_entry_src(const struct rb_node *a_node, const struct rb_node *b_node)
{
   const struct acp_entry *a_entry =
      rb_node_data(struct acp_entry, a_node, by_src);

   const struct acp_entry *b_entry =
      rb_node_data(struct acp_entry, b_node, by_src);

   return a_entry->src.nr - b_entry->src.nr;
}

/**
 * Compare an acp_entry::src.nr with a raw nr.
 *
 * This is intended to be used as the comparison function for rb_tree.
 */
static int
cmp_entry_src_nr(const struct rb_node *a_node, const void *b_key)
{
   const struct acp_entry *a_entry =
      rb_node_data(struct acp_entry, a_node, by_src);

   return a_entry->src.nr - (uintptr_t) b_key;
}

/**
 * In-order iterator over one of the two rb_trees in struct acp.
 *
 * The next pointer is prefetched in the constructor and in operator++ so
 * that the entry currently pointed at may be removed from the tree while
 * iterating without invalidating the iterator.
 */
class acp_forward_iterator {
public:
   acp_forward_iterator(struct rb_node *n, unsigned offset)
      : curr(n), next(nullptr), offset(offset)
   {
      next = rb_node_next_or_null(curr);
   }

   acp_forward_iterator &operator++()
   {
      curr = next;
      next = rb_node_next_or_null(curr);

      return *this;
   }

   bool operator!=(const acp_forward_iterator &other) const
   {
      return curr != other.curr;
   }

   struct acp_entry *operator*() const
   {
      /* This open-codes part of rb_node_data. */
      return curr != NULL ? (struct acp_entry *)(((char *)curr) - offset)
                          : NULL;
   }

private:
   struct rb_node *curr;
   struct rb_node *next;
   unsigned offset;
};

/**
 * Set of available copies, indexed both by destination and by source
 * register number so kills can be found from either side.
 */
struct acp {
   struct rb_tree by_dst;
   struct rb_tree by_src;

   acp()
   {
      rb_tree_init(&by_dst);
      rb_tree_init(&by_src);
   }

   acp_forward_iterator begin()
   {
      return acp_forward_iterator(rb_tree_first(&by_src),
                                  rb_tree_offsetof(struct acp_entry, by_src, 0));
   }

   const acp_forward_iterator end() const
   {
      return acp_forward_iterator(nullptr, 0);
   }

   /* O(n) walk of the by_src tree; used only to size the global ACP table. */
   unsigned length()
   {
      unsigned l = 0;

      for (rb_node *iter = rb_tree_first(&by_src);
           iter != NULL; iter = rb_node_next(iter))
         l++;

      return l;
   }

   void add(acp_entry *entry)
   {
      rb_tree_insert(&by_dst, &entry->by_dst, cmp_entry_dst_entry_dst);
      rb_tree_insert(&by_src, &entry->by_src, cmp_entry_src_entry_src);
   }

   void remove(acp_entry *entry)
   {
      rb_tree_remove(&by_dst, &entry->by_dst);
      rb_tree_remove(&by_src, &entry->by_src);
   }

   /* Returns an iterator positioned at the first entry whose src.nr matches
    * nr (or end()); callers keep iterating while src.nr still matches.
    */
   acp_forward_iterator find_by_src(unsigned nr)
   {
      struct rb_node *rbn = rb_tree_search(&by_src,
                                           (void *)(uintptr_t) nr,
                                           cmp_entry_src_nr);

      return acp_forward_iterator(rbn, rb_tree_offsetof(struct acp_entry,
                                                        by_src, rbn));
   }

   /* Same as find_by_src, but keyed on dst.nr. */
   acp_forward_iterator find_by_dst(unsigned nr)
   {
      struct rb_node *rbn = rb_tree_search(&by_dst,
                                           (void *)(uintptr_t) nr,
                                           cmp_entry_dst_nr);

      return acp_forward_iterator(rbn, rb_tree_offsetof(struct acp_entry,
                                                        by_dst, rbn));
   }
};

/** Per-basic-block dataflow sets over the global ACP entry table. */
struct block_data {
   /**
    * Which entries in the fs_copy_prop_dataflow acp table are live at the
    * start of this block.  This is the useful output of the analysis, since
    * it lets us plug those into the local copy propagation on the second
    * pass.
    */
   BITSET_WORD *livein;

   /**
    * Which entries in the fs_copy_prop_dataflow acp table are live at the end
    * of this block.  This is done in initial setup from the per-block acps
    * returned by the first local copy prop pass.
    */
   BITSET_WORD *liveout;

   /**
    * Which entries in the fs_copy_prop_dataflow acp table are generated by
    * instructions in this block which reach the end of the block without
    * being killed.
    */
   BITSET_WORD *copy;

   /**
    * Which entries in the fs_copy_prop_dataflow acp table are killed over the
    * course of this block.
    */
   BITSET_WORD *kill;

   /**
    * Which entries in the fs_copy_prop_dataflow acp table are guaranteed to
    * have a fully uninitialized destination at the end of this block.
    */
   BITSET_WORD *undef;

   /**
    * Which entries in the fs_copy_prop_dataflow acp table can the
    * start of this block be reached from.  Note that this is a weaker
    * condition than livein.
    */
   BITSET_WORD *reachin;

   /**
    * Which entries in the fs_copy_prop_dataflow acp table are
    * overwritten by an instruction with channel masks inconsistent
    * with the copy instruction (e.g. due to force_writemask_all).
    * Such an overwrite can cause the copy entry to become invalid
    * even if the copy instruction is subsequently re-executed for any
    * given channel i, since the execution of the overwrite for
    * channel i may corrupt other channels j!=i inactive for the
    * subsequent copy.
 */
   BITSET_WORD *exec_mismatch;
};

/**
 * Global dataflow driver: collects every copy surviving its defining block
 * into one table, then solves for the set of copies available at the start
 * of each block (livein) so the second local pass can use them.
 */
class fs_copy_prop_dataflow
{
public:
   fs_copy_prop_dataflow(linear_ctx *lin_ctx, cfg_t *cfg,
                         const fs_live_variables &live,
                         struct acp *out_acp);

   void setup_initial_values();
   void run();

   void dump_block_data() const UNUSED;

   cfg_t *cfg;
   const fs_live_variables &live;

   /* Flat table of all ACP entries, indexed by acp_entry::global_idx. */
   acp_entry **acp;
   int num_acp;
   int bitset_words;

   struct block_data *bd;
};
} /* anonymous namespace */

/**
 * Gather the per-block ACPs produced by the first local pass into a single
 * global table, allocate the per-block bitsets, then compute the fixed
 * point.
 *
 * \param out_acp  one struct acp per block, holding the copies still live
 *                 at the end of that block.
 */
fs_copy_prop_dataflow::fs_copy_prop_dataflow(linear_ctx *lin_ctx, cfg_t *cfg,
                                             const fs_live_variables &live,
                                             struct acp *out_acp)
   : cfg(cfg), live(live)
{
   bd = linear_zalloc_array(lin_ctx, struct block_data, cfg->num_blocks);

   num_acp = 0;
   foreach_block (block, cfg)
      num_acp += out_acp[block->num].length();

   bitset_words = BITSET_WORDS(num_acp);

   foreach_block (block, cfg) {
      bd[block->num].livein = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      bd[block->num].liveout = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      bd[block->num].copy = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      bd[block->num].kill = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      bd[block->num].undef = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      bd[block->num].reachin = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      bd[block->num].exec_mismatch = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
   }

   acp = linear_zalloc_array(lin_ctx, struct acp_entry *, num_acp);

   /* Assign each entry a stable global index used by all the bitsets. */
   int next_acp = 0;
   foreach_block (block, cfg) {
      for (auto iter = out_acp[block->num].begin();
           iter != out_acp[block->num].end(); ++iter) {
         acp[next_acp] = *iter;

         (*iter)->global_idx = next_acp;

         /* opt_copy_propagation_local populates out_acp with copies created
          * in a block which are still live at the end of the block.  This
          * is exactly what we want in the COPY set.
          */
         BITSET_SET(bd[block->num].copy, next_acp);

         next_acp++;
      }
   }

   assert(next_acp == num_acp);

   setup_initial_values();
   run();
}

/**
 * Like reg_offset, but register must be VGRF or FIXED_GRF.
 */
static inline unsigned
grf_reg_offset(const fs_reg &r)
{
   return (r.file == VGRF ? 0 : r.nr) * REG_SIZE +
          r.offset +
          (r.file == FIXED_GRF ? r.subnr : 0);
}

/**
 * Like regions_overlap, but register must be VGRF or FIXED_GRF.
 */
static inline bool
grf_regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
{
   return reg_space(r) == reg_space(s) &&
          !(grf_reg_offset(r) + dr <= grf_reg_offset(s) ||
            grf_reg_offset(s) + ds <= grf_reg_offset(r));
}

/**
 * Set up initial values for each of the data flow sets, prior to running
 * the fixed-point algorithm.
 */
void
fs_copy_prop_dataflow::setup_initial_values()
{
   /* Initialize the COPY and KILL sets. */
   {
      struct acp acp_table;

      /* First, get all the KILLs for instructions which overwrite ACP
       * destinations.
 */
      for (int i = 0; i < num_acp; i++)
         acp_table.add(acp[i]);

      foreach_block (block, cfg) {
         foreach_inst_in_block(fs_inst, inst, block) {
            if (inst->dst.file != VGRF &&
                inst->dst.file != FIXED_GRF)
               continue;

            /* An overwrite of a copy's *source* invalidates the copy: the
             * copied value no longer matches the register it came from.
             */
            for (auto iter = acp_table.find_by_src(inst->dst.nr);
                 iter != acp_table.end() && (*iter)->src.nr == inst->dst.nr;
                 ++iter) {
               if (grf_regions_overlap(inst->dst, inst->size_written,
                                       (*iter)->src, (*iter)->size_read)) {
                  BITSET_SET(bd[block->num].kill, (*iter)->global_idx);
                  if (inst->force_writemask_all && !(*iter)->force_writemask_all)
                     BITSET_SET(bd[block->num].exec_mismatch, (*iter)->global_idx);
               }
            }

            if (inst->dst.file != VGRF)
               continue;

            /* An overwrite of a copy's *destination* likewise kills it. */
            for (auto iter = acp_table.find_by_dst(inst->dst.nr);
                 iter != acp_table.end() && (*iter)->dst.nr == inst->dst.nr;
                 ++iter) {
               if (grf_regions_overlap(inst->dst, inst->size_written,
                                       (*iter)->dst, (*iter)->size_written)) {
                  BITSET_SET(bd[block->num].kill, (*iter)->global_idx);
                  if (inst->force_writemask_all && !(*iter)->force_writemask_all)
                     BITSET_SET(bd[block->num].exec_mismatch, (*iter)->global_idx);
               }
            }
         }
      }
   }

   /* Populate the initial values for the livein and liveout sets.  For the
    * block at the start of the program, livein = 0 and liveout = copy.
    * For the others, set liveout and livein to ~0 (the universal set).
    */
   foreach_block (block, cfg) {
      if (block->parents.is_empty()) {
         for (int i = 0; i < bitset_words; i++) {
            bd[block->num].livein[i] = 0u;
            bd[block->num].liveout[i] = bd[block->num].copy[i];
         }
      } else {
         for (int i = 0; i < bitset_words; i++) {
            bd[block->num].liveout[i] = ~0u;
            bd[block->num].livein[i] = ~0u;
         }
      }
   }

   /* Initialize the undef set. */
   foreach_block (block, cfg) {
      for (int i = 0; i < num_acp; i++) {
         BITSET_SET(bd[block->num].undef, i);
         for (unsigned off = 0; off < acp[i]->size_written; off += REG_SIZE) {
            if (BITSET_TEST(live.block_data[block->num].defout,
                            live.var_from_reg(byte_offset(acp[i]->dst, off))))
               BITSET_CLEAR(bd[block->num].undef, i);
         }
      }
   }
}

/**
 * Solve the dataflow equations to a fixed point: propagate the livein,
 * liveout and reachin sets across block boundaries until nothing changes,
 * then do a second fixed-point pass for exec_mismatch (which consumes the
 * converged reachin sets).
 */
void
fs_copy_prop_dataflow::run()
{
   bool progress;

   do {
      progress = false;

      foreach_block (block, cfg) {
         /* Entry blocks were fully initialized in setup_initial_values(). */
         if (block->parents.is_empty())
            continue;

         for (int i = 0; i < bitset_words; i++) {
            const BITSET_WORD old_liveout = bd[block->num].liveout[i];
            const BITSET_WORD old_reachin = bd[block->num].reachin[i];
            BITSET_WORD livein_from_any_block = 0;

            /* Update livein for this block.  If a copy is live out of all
             * parent blocks, it's live coming in to this block.
             */
            bd[block->num].livein[i] = ~0u;
            foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
               bblock_t *parent = parent_link->block;
               /* Consider ACP entries with a known-undefined destination to
                * be available from the parent.  This is valid because we're
                * free to set the undefined variable equal to the source of
                * the ACP entry without breaking the application's
                * expectations, since the variable is undefined.
                */
               bd[block->num].livein[i] &= (bd[parent->num].liveout[i] |
                                            bd[parent->num].undef[i]);
               livein_from_any_block |= bd[parent->num].liveout[i];

               /* Update reachin for this block.  If the end of any
                * parent block is reachable from the copy, the start
                * of this block is reachable from it as well.
 */
               bd[block->num].reachin[i] |= (bd[parent->num].reachin[i] |
                                             bd[parent->num].copy[i]);
            }

            /* Limit to the set of ACP entries that can possibly be available
             * at the start of the block, since propagating from a variable
             * which is guaranteed to be undefined (rather than potentially
             * undefined for some dynamic control-flow paths) doesn't seem
             * particularly useful.
             */
            bd[block->num].livein[i] &= livein_from_any_block;

            /* Update liveout for this block. */
            bd[block->num].liveout[i] =
               bd[block->num].copy[i] | (bd[block->num].livein[i] &
                                         ~bd[block->num].kill[i]);

            if (old_liveout != bd[block->num].liveout[i] ||
                old_reachin != bd[block->num].reachin[i])
               progress = true;
         }
      }
   } while (progress);

   /* Perform a second fixed-point pass in order to propagate the
    * exec_mismatch bitsets.  Note that this requires an accurate
    * value of the reachin bitsets as input, which isn't available
    * until the end of the first propagation pass, so this loop cannot
    * be folded into the previous one.
    */
   do {
      progress = false;

      foreach_block (block, cfg) {
         for (int i = 0; i < bitset_words; i++) {
            const BITSET_WORD old_exec_mismatch = bd[block->num].exec_mismatch[i];

            /* Update exec_mismatch for this block.  If the end of a
             * parent block is reachable by an overwrite with
             * inconsistent execution masking, the start of this block
             * is reachable by such an overwrite as well.
             */
            foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
               bblock_t *parent = parent_link->block;
               bd[block->num].exec_mismatch[i] |= (bd[parent->num].exec_mismatch[i] &
                                                   bd[parent->num].reachin[i]);
            }

            /* Only consider overwrites with inconsistent execution
             * masking if they are reachable from the copy, since
             * overwrites unreachable from a copy are harmless to that
             * copy.
             */
            bd[block->num].exec_mismatch[i] &= bd[block->num].reachin[i];
            if (old_exec_mismatch != bd[block->num].exec_mismatch[i])
               progress = true;
         }
      }
   } while (progress);
}

/** Debug helper: print every block's dataflow bitsets to stderr. */
void
fs_copy_prop_dataflow::dump_block_data() const
{
   foreach_block (block, cfg) {
      fprintf(stderr, "Block %d [%d, %d] (parents ", block->num,
              block->start_ip, block->end_ip);
      foreach_list_typed(bblock_link, link, link, &block->parents) {
         bblock_t *parent = link->block;
         fprintf(stderr, "%d ", parent->num);
      }
      fprintf(stderr, "):\n");
      fprintf(stderr, " livein = 0x");
      for (int i = 0; i < bitset_words; i++)
         fprintf(stderr, "%08x", bd[block->num].livein[i]);
      fprintf(stderr, ", liveout = 0x");
      for (int i = 0; i < bitset_words; i++)
         fprintf(stderr, "%08x", bd[block->num].liveout[i]);
      fprintf(stderr, ",\n copy = 0x");
      for (int i = 0; i < bitset_words; i++)
         fprintf(stderr, "%08x", bd[block->num].copy[i]);
      fprintf(stderr, ", kill = 0x");
      for (int i = 0; i < bitset_words; i++)
         fprintf(stderr, "%08x", bd[block->num].kill[i]);
      fprintf(stderr, "\n");
   }
}

/* True for the bitwise-logic opcodes, which (on Gfx8+) don't accept source
 * modifiers and don't interpret immediates arithmetically.
 */
static bool
is_logic_op(enum opcode opcode)
{
   return (opcode == BRW_OPCODE_AND ||
           opcode == BRW_OPCODE_OR ||
           opcode == BRW_OPCODE_XOR ||
           opcode == BRW_OPCODE_NOT);
}

/* Whether source \p arg of \p inst could legally use horizontal stride
 * \p stride, given the destination type and per-generation regioning rules.
 */
static bool
can_take_stride(fs_inst *inst, brw_reg_type dst_type,
                unsigned arg, unsigned stride,
                const struct brw_compiler *compiler)
{
   const struct intel_device_info *devinfo = compiler->devinfo;

   if (stride > 4)
      return false;

   /* Bail if the channels of the source need to be aligned to the byte offset
    * of the corresponding channel of the destination, and the provided stride
    * would break this restriction.
    */
   if (has_dst_aligned_region_restriction(devinfo, inst, dst_type) &&
       !(type_sz(inst->src[arg].type) * stride ==
         type_sz(dst_type) * inst->dst.stride ||
         stride == 0))
      return false;

   /* 3-source instructions can only be Align16, which restricts what strides
    * they can take.
They can only take a stride of 1 (the usual case), or 0
    * with a special "repctrl" bit.  But the repctrl bit doesn't work for
    * 64-bit datatypes, so if the source type is 64-bit then only a stride of
    * 1 is allowed.  From the Broadwell PRM, Volume 7 "3D Media GPGPU", page
    * 944:
    *
    *    This is applicable to 32b datatypes and 16b datatype.  64b datatypes
    *    cannot use the replicate control.
    */
   if (inst->is_3src(compiler)) {
      if (type_sz(inst->src[arg].type) > 4)
         return stride == 1;
      else
         return stride == 1 || stride == 0;
   }

   /* From the Broadwell PRM, Volume 2a "Command Reference - Instructions",
    * page 391 ("Extended Math Function"):
    *
    *     The following restrictions apply for align1 mode: Scalar source is
    *     supported.  Source and destination horizontal stride must be the
    *     same.
    *
    * From the Haswell PRM Volume 2b "Command Reference - Instructions", page
    * 134 ("Extended Math Function"):
    *
    *    Scalar source is supported.  Source and destination horizontal stride
    *    must be 1.
    *
    * and similar language exists for IVB and SNB.  Pre-SNB, math instructions
    * are sends, so the sources are moved to MRF's and there are no
    * restrictions.
    */
   if (inst->is_math()) {
      if (devinfo->ver == 6 || devinfo->ver == 7) {
         assert(inst->dst.stride == 1);
         return stride == 1 || stride == 0;
      } else if (devinfo->ver >= 8) {
         return stride == inst->dst.stride || stride == 0;
      }
   }

   return true;
}

/* Instructions whose generator-backend implementation assumes packed
 * (stride-1) operands, so strided regions must not be propagated into them.
 */
static bool
instruction_requires_packed_data(fs_inst *inst)
{
   switch (inst->opcode) {
   case FS_OPCODE_DDX_FINE:
   case FS_OPCODE_DDX_COARSE:
   case FS_OPCODE_DDY_FINE:
   case FS_OPCODE_DDY_COARSE:
   case SHADER_OPCODE_QUAD_SWIZZLE:
      return true;
   default:
      return false;
   }
}

/* Try to replace source \p arg of \p inst with the source of the copy
 * recorded in \p entry.  Returns true and rewrites the source region in
 * place on success; returns false (leaving inst untouched) if any hardware
 * regioning or semantic restriction would be violated.
 */
static bool
try_copy_propagate(const brw_compiler *compiler, fs_inst *inst,
                   acp_entry *entry, int arg,
                   const brw::simple_allocator &alloc,
                   uint8_t max_polygons)
{
   if (inst->src[arg].file != VGRF)
      return false;

   const struct intel_device_info *devinfo = compiler->devinfo;

   assert(entry->src.file == VGRF || entry->src.file == UNIFORM ||
          entry->src.file == ATTR || entry->src.file == FIXED_GRF);

   /* Avoid propagating a LOAD_PAYLOAD instruction into another if there is a
    * good chance that we'll be able to eliminate the latter through register
    * coalescing.  If only part of the sources of the second LOAD_PAYLOAD can
    * be simplified through copy propagation we would be making register
    * coalescing impossible, ending up with unnecessary copies in the program.
    * This is also the case for is_multi_copy_payload() copies that can only
    * be coalesced when the instruction is lowered into a sequence of MOVs.
    *
    * Worse -- In cases where the ACP entry was the result of CSE combining
    * multiple LOAD_PAYLOAD subexpressions, propagating the first LOAD_PAYLOAD
    * into the second would undo the work of CSE, leading to an infinite
    * optimization loop.  Avoid this by detecting LOAD_PAYLOAD copies from CSE
    * temporaries which should match is_coalescing_payload().
 */
   if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
       (is_coalescing_payload(alloc, inst) || is_multi_copy_payload(inst)))
      return false;

   assert(entry->dst.file == VGRF);
   if (inst->src[arg].nr != entry->dst.nr)
      return false;

   /* Bail if inst is reading a range that isn't contained in the range
    * that entry is writing.
    */
   if (!region_contained_in(inst->src[arg], inst->size_read(arg),
                            entry->dst, entry->size_written))
      return false;

   /* Send messages with EOT set are restricted to use g112-g127 (and we
    * sometimes need g127 for other purposes), so avoid copy propagating
    * anything that would make it impossible to satisfy that restriction.
    */
   if (inst->eot) {
      /* Avoid propagating a FIXED_GRF register, as that's already pinned. */
      if (entry->src.file == FIXED_GRF)
         return false;

      /* We might be propagating from a large register, while the SEND only
       * is reading a portion of it (say the .A channel in an RGBA value).
       * We need to pin both split SEND sources in g112-g126/127, so only
       * allow this if the registers aren't too large.
       */
      if (inst->opcode == SHADER_OPCODE_SEND && entry->src.file == VGRF) {
         int other_src = arg == 2 ? 3 : 2;
         unsigned other_size = inst->src[other_src].file == VGRF ?
                               alloc.sizes[inst->src[other_src].nr] :
                               inst->size_read(other_src);
         unsigned prop_src_size = alloc.sizes[entry->src.nr];
         if (other_size + prop_src_size > 15)
            return false;
      }
   }

   /* Avoid propagating odd-numbered FIXED_GRF registers into the first source
    * of a LINTERP instruction on platforms where the PLN instruction has
    * register alignment restrictions.
    */
   if (devinfo->has_pln && devinfo->ver <= 6 &&
       entry->src.file == FIXED_GRF && (entry->src.nr & 1) &&
       inst->opcode == FS_OPCODE_LINTERP && arg == 0)
      return false;

   /* we can't generally copy-propagate UD negations because we
    * can end up accessing the resulting values as signed integers
    * instead.  See also resolve_ud_negate() and comment in
    * fs_generator::generate_code.
    */
   if (entry->src.type == BRW_REGISTER_TYPE_UD &&
       entry->src.negate)
      return false;

   bool has_source_modifiers = entry->src.abs || entry->src.negate;

   if (has_source_modifiers && !inst->can_do_source_mods(devinfo))
      return false;

   /* Reject cases that would violate register regioning restrictions. */
   if ((entry->src.file == UNIFORM || !entry->src.is_contiguous()) &&
       ((devinfo->ver == 6 && inst->is_math()) ||
        inst->is_send_from_grf() ||
        inst->uses_indirect_addressing())) {
      return false;
   }

   if (has_source_modifiers &&
       inst->opcode == SHADER_OPCODE_GFX4_SCRATCH_WRITE)
      return false;

   /* Some instructions implemented in the generator backend, such as
    * derivatives, assume that their operands are packed so we can't
    * generally propagate strided regions to them.
    */
   const unsigned entry_stride = (entry->src.file == FIXED_GRF ? 1 :
                                  entry->src.stride);
   if (instruction_requires_packed_data(inst) && entry_stride != 1)
      return false;

   const brw_reg_type dst_type = (has_source_modifiers &&
                                  entry->dst.type != inst->src[arg].type) ?
      entry->dst.type : inst->dst.type;

   /* Bail if the result of composing both strides would exceed the
    * hardware limit.
    */
   if (!can_take_stride(inst, dst_type, arg,
                        entry_stride * inst->src[arg].stride,
                        compiler))
      return false;

   /* From the Cherry Trail/Braswell PRMs, Volume 7: 3D Media GPGPU:
    *    EU Overview
    *       Register Region Restrictions
    *          Special Requirements for Handling Double Precision Data Types :
    *
    *   "When source or destination datatype is 64b or operation is integer
    *    DWord multiply, regioning in Align1 must follow these rules:
    *
    *    1. Source and Destination horizontal stride must be aligned to the
    *       same qword.
    *    2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
    *    3. Source and Destination offset must be the same, except the case
    *       of scalar source."
    *
    * Most of this is already checked in can_take_stride(), we're only left
    * with checking 3.
    */
   if (has_dst_aligned_region_restriction(devinfo, inst, dst_type) &&
       entry_stride != 0 &&
       (reg_offset(inst->dst) % REG_SIZE) != (reg_offset(entry->src) % REG_SIZE))
      return false;

   /* The <8;8,0> regions used for FS attributes in multipolygon
    * dispatch mode could violate regioning restrictions, don't copy
    * propagate them in such cases.
    */
   if (entry->src.file == ATTR && max_polygons > 1 &&
       (has_dst_aligned_region_restriction(devinfo, inst, dst_type) ||
        instruction_requires_packed_data(inst) ||
        (inst->is_3src(compiler) && arg == 2) ||
        entry->dst.type != inst->src[arg].type))
      return false;

   /* Bail if the source FIXED_GRF region of the copy cannot be trivially
    * composed with the source region of the instruction -- E.g. because the
    * copy uses some extended stride greater than 4 not supported natively by
    * the hardware as a horizontal stride, or because instruction compression
    * could require us to use a vertical stride shorter than a GRF.
    */
   if (entry->src.file == FIXED_GRF &&
       (inst->src[arg].stride > 4 ||
        inst->dst.component_size(inst->exec_size) >
        inst->src[arg].component_size(inst->exec_size)))
      return false;

   /* Bail if the instruction type is larger than the execution type of the
    * copy, what implies that each channel is reading multiple channels of the
    * destination of the copy, and simply replacing the sources would give a
    * program with different semantics.
    */
   if ((type_sz(entry->dst.type) < type_sz(inst->src[arg].type) ||
        entry->is_partial_write) &&
       inst->opcode != BRW_OPCODE_MOV) {
      return false;
   }

   /* Bail if the result of composing both strides cannot be expressed
    * as another stride. This avoids, for example, trying to transform
    * this:
    *
    *     MOV (8) rX<1>UD rY<0;1,0>UD
    *     FOO (8) ...     rX<8;8,1>UW
    *
    * into this:
    *
    *     FOO (8) ...     rY<0;1,0>UW
    *
    * Which would have different semantics.
    */
   if (entry_stride != 1 &&
       (inst->src[arg].stride *
        type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0)
      return false;

   /* Since semantics of source modifiers are type-dependent we need to
    * ensure that the meaning of the instruction remains the same if we
    * change the type. If the sizes of the types are different the new
    * instruction will read a different amount of data than the original
    * and the semantics will always be different.
    */
   if (has_source_modifiers &&
       entry->dst.type != inst->src[arg].type &&
       (!inst->can_change_types() ||
        type_sz(entry->dst.type) != type_sz(inst->src[arg].type)))
      return false;

   if (devinfo->ver >= 8 && (entry->src.negate || entry->src.abs) &&
       is_logic_op(inst->opcode)) {
      return false;
   }

   /* All the legality checks passed; everything below mutates inst. */

   /* Save the offset of inst->src[arg] relative to entry->dst for it to be
    * applied later.
    */
   const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset;

   /* Fold the copy into the instruction consuming it. */
   inst->src[arg].file = entry->src.file;
   inst->src[arg].nr = entry->src.nr;
   inst->src[arg].subnr = entry->src.subnr;
   inst->src[arg].offset = entry->src.offset;

   /* Compose the strides of both regions. */
   if (entry->src.file == FIXED_GRF) {
      if (inst->src[arg].stride) {
         const unsigned orig_width = 1 << entry->src.width;
         const unsigned reg_width = REG_SIZE / (type_sz(inst->src[arg].type) *
                                                inst->src[arg].stride);
         inst->src[arg].width = cvt(MIN2(orig_width, reg_width)) - 1;
         inst->src[arg].hstride = cvt(inst->src[arg].stride);
         inst->src[arg].vstride = inst->src[arg].hstride + inst->src[arg].width;
      } else {
         inst->src[arg].vstride = inst->src[arg].hstride =
            inst->src[arg].width = 0;
      }

      inst->src[arg].stride = 1;

      /* Hopefully no Align16 around here... */
      assert(entry->src.swizzle == BRW_SWIZZLE_XYZW);
      inst->src[arg].swizzle = entry->src.swizzle;
   } else {
      inst->src[arg].stride *= entry->src.stride;
   }

   /* Compute the first component of the copy that the instruction is
    * reading, and the base byte offset within that component.
    */
   assert((entry->dst.offset % REG_SIZE == 0 || inst->opcode == BRW_OPCODE_MOV) &&
          entry->dst.stride == 1);
   const unsigned component = rel_offset / type_sz(entry->dst.type);
   const unsigned suboffset = rel_offset % type_sz(entry->dst.type);

   /* Calculate the byte offset at the origin of the copy of the given
    * component and suboffset.
    */
   inst->src[arg] = byte_offset(inst->src[arg],
      component * entry_stride * type_sz(entry->src.type) + suboffset);

   if (has_source_modifiers) {
      if (entry->dst.type != inst->src[arg].type) {
         /* We are propagating source modifiers from a MOV with a different
          * type.  If we got here, then we can just change the source and
          * destination types of the instruction and keep going.
          */
         for (int i = 0; i < inst->sources; i++) {
            inst->src[i].type = entry->dst.type;
         }
         inst->dst.type = entry->dst.type;
      }

      if (!inst->src[arg].abs) {
         inst->src[arg].abs = entry->src.abs;
         inst->src[arg].negate ^= entry->src.negate;
      }
   }

   return true;
}


/* Try to replace source \p arg of \p inst with the immediate value recorded
 * in \p entry, commuting operands or bailing where the opcode or hardware
 * disallows an immediate in that position.
 */
static bool
try_constant_propagate(const brw_compiler *compiler, fs_inst *inst,
                       acp_entry *entry, int arg)
{
   const struct intel_device_info *devinfo = compiler->devinfo;
   bool progress = false;

   if (type_sz(entry->src.type) > 4)
      return false;

   if (inst->src[arg].file != VGRF)
      return false;

   assert(entry->dst.file == VGRF);
   if (inst->src[arg].nr != entry->dst.nr)
      return false;

   /* Bail if inst is reading a range that isn't contained in the range
    * that entry is writing.
+ */ + if (!region_contained_in(inst->src[arg], inst->size_read(arg), + entry->dst, entry->size_written)) + return false; + + /* If the size of the use type is larger than the size of the entry + * type, the entry doesn't contain all of the data that the user is + * trying to use. + */ + if (type_sz(inst->src[arg].type) > type_sz(entry->dst.type)) + return false; + + fs_reg val = entry->src; + + /* If the size of the use type is smaller than the size of the entry, + * clamp the value to the range of the use type. This enables constant + * copy propagation in cases like + * + * + * mov(8) g12<1>UD 0x0000000cUD + * ... + * mul(8) g47<1>D g86<8,8,1>D g12<16,8,2>W + */ + if (type_sz(inst->src[arg].type) < type_sz(entry->dst.type)) { + if (type_sz(inst->src[arg].type) != 2 || type_sz(entry->dst.type) != 4) + return false; + + assert(inst->src[arg].subnr == 0 || inst->src[arg].subnr == 2); + + /* When subnr is 0, we want the lower 16-bits, and when it's 2, we + * want the upper 16-bits. No other values of subnr are valid for a + * UD source. + */ + const uint16_t v = inst->src[arg].subnr == 2 ? val.ud >> 16 : val.ud; + + val.ud = v | (uint32_t(v) << 16); + } + + val.type = inst->src[arg].type; + + if (inst->src[arg].abs) { + if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) || + !brw_abs_immediate(val.type, &val.as_brw_reg())) { + return false; + } + } + + if (inst->src[arg].negate) { + if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) || + !brw_negate_immediate(val.type, &val.as_brw_reg())) { + return false; + } + } + + switch (inst->opcode) { + case BRW_OPCODE_MOV: + case SHADER_OPCODE_LOAD_PAYLOAD: + case FS_OPCODE_PACK: + inst->src[arg] = val; + progress = true; + break; + + case SHADER_OPCODE_POW: + /* Allow constant propagation into src1 (except on Gen 6 which + * doesn't support scalar source math), and let constant combining + * promote the constant on Gen < 8. 
+ */ + if (devinfo->ver == 6) + break; + + if (arg == 1) { + inst->src[arg] = val; + progress = true; + } + break; + + case BRW_OPCODE_SUBB: + if (arg == 1) { + inst->src[arg] = val; + progress = true; + } + break; + + case BRW_OPCODE_MACH: + case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: + case BRW_OPCODE_ADD: + case BRW_OPCODE_XOR: + case BRW_OPCODE_ADDC: + if (arg == 1) { + inst->src[arg] = val; + progress = true; + } else if (arg == 0 && inst->src[1].file != IMM) { + /* Don't copy propagate the constant in situations like + * + * mov(8) g8<1>D 0x7fffffffD + * mul(8) g16<1>D g8<8,8,1>D g15<16,8,2>W + * + * On platforms that only have a 32x16 multiplier, this will + * result in lowering the multiply to + * + * mul(8) g15<1>D g14<8,8,1>D 0xffffUW + * mul(8) g16<1>D g14<8,8,1>D 0x7fffUW + * add(8) g15.1<2>UW g15.1<16,8,2>UW g16<16,8,2>UW + * + * On Gfx8 and Gfx9, which have the full 32x32 multiplier, it + * results in + * + * mul(8) g16<1>D g15<16,8,2>W 0x7fffffffD + * + * Volume 2a of the Skylake PRM says: + * + * When multiplying a DW and any lower precision integer, the + * DW operand must on src0. + */ + if (inst->opcode == BRW_OPCODE_MUL && + type_sz(inst->src[1].type) < 4 && + type_sz(val.type) == 4) + break; + + /* Fit this constant in by commuting the operands. + * Exception: we can't do this for 32-bit integer MUL/MACH + * because it's asymmetric. + * + * The BSpec says for Broadwell that + * + * "When multiplying DW x DW, the dst cannot be accumulator." + * + * Integer MUL with a non-accumulator destination will be lowered + * by lower_integer_multiplication(), so don't restrict it. + */ + if (((inst->opcode == BRW_OPCODE_MUL && + inst->dst.is_accumulator()) || + inst->opcode == BRW_OPCODE_MACH) && + (inst->src[1].type == BRW_REGISTER_TYPE_D || + inst->src[1].type == BRW_REGISTER_TYPE_UD)) + break; + inst->src[0] = inst->src[1]; + inst->src[1] = val; + progress = true; + } + break; + + case BRW_OPCODE_ADD3: + /* add3 can have a single imm16 source. 
Proceed if the source type is + * already W or UW or the value can be coerced to one of those types. + */ + if (val.type == BRW_REGISTER_TYPE_W || val.type == BRW_REGISTER_TYPE_UW) + ; /* Nothing to do. */ + else if (val.ud <= 0xffff) + val = brw_imm_uw(val.ud); + else if (val.d >= -0x8000 && val.d <= 0x7fff) + val = brw_imm_w(val.d); + else + break; + + if (arg == 2) { + inst->src[arg] = val; + progress = true; + } else if (inst->src[2].file != IMM) { + inst->src[arg] = inst->src[2]; + inst->src[2] = val; + progress = true; + } + + break; + + case BRW_OPCODE_CMP: + case BRW_OPCODE_IF: + if (arg == 1) { + inst->src[arg] = val; + progress = true; + } else if (arg == 0 && inst->src[1].file != IMM) { + enum brw_conditional_mod new_cmod; + + new_cmod = brw_swap_cmod(inst->conditional_mod); + if (new_cmod != BRW_CONDITIONAL_NONE) { + /* Fit this constant in by swapping the operands and + * flipping the test + */ + inst->src[0] = inst->src[1]; + inst->src[1] = val; + inst->conditional_mod = new_cmod; + progress = true; + } + } + break; + + case BRW_OPCODE_SEL: + if (arg == 1) { + inst->src[arg] = val; + progress = true; + } else if (arg == 0) { + if (inst->src[1].file != IMM && + (inst->conditional_mod == BRW_CONDITIONAL_NONE || + /* Only GE and L are commutative. */ + inst->conditional_mod == BRW_CONDITIONAL_GE || + inst->conditional_mod == BRW_CONDITIONAL_L)) { + inst->src[0] = inst->src[1]; + inst->src[1] = val; + + /* If this was predicated, flipping operands means + * we also need to flip the predicate. + */ + if (inst->conditional_mod == BRW_CONDITIONAL_NONE) { + inst->predicate_inverse = + !inst->predicate_inverse; + } + } else { + inst->src[0] = val; + } + + progress = true; + } + break; + + case FS_OPCODE_FB_WRITE_LOGICAL: + /* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are + * bit-cast using a strided region so they cannot be immediates. 
+ */ + if (arg != FB_WRITE_LOGICAL_SRC_SRC_STENCIL && + arg != FB_WRITE_LOGICAL_SRC_OMASK) { + inst->src[arg] = val; + progress = true; + } + break; + + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + /* Allow constant propagation into either source (except on Gen 6 + * which doesn't support scalar source math). Constant combining + * promote the src1 constant on Gen < 8, and it will promote the src0 + * constant on all platforms. + */ + if (devinfo->ver == 6) + break; + + FALLTHROUGH; + case BRW_OPCODE_AND: + case BRW_OPCODE_ASR: + case BRW_OPCODE_BFE: + case BRW_OPCODE_BFI1: + case BRW_OPCODE_BFI2: + case BRW_OPCODE_ROL: + case BRW_OPCODE_ROR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_SHR: + case BRW_OPCODE_OR: + case SHADER_OPCODE_TEX_LOGICAL: + case SHADER_OPCODE_TXD_LOGICAL: + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXL_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: + case SHADER_OPCODE_TXF_UMS_LOGICAL: + case SHADER_OPCODE_TXF_MCS_LOGICAL: + case SHADER_OPCODE_LOD_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: + case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case SHADER_OPCODE_BROADCAST: + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + case SHADER_OPCODE_SHUFFLE: + inst->src[arg] = val; + progress = true; + break; + + default: + break; + } + 
+ return progress; +} + +static bool +can_propagate_from(fs_inst *inst) +{ + return (inst->opcode == BRW_OPCODE_MOV && + inst->dst.file == VGRF && + ((inst->src[0].file == VGRF && + !grf_regions_overlap(inst->dst, inst->size_written, + inst->src[0], inst->size_read(0))) || + inst->src[0].file == ATTR || + inst->src[0].file == UNIFORM || + inst->src[0].file == IMM || + (inst->src[0].file == FIXED_GRF && + inst->src[0].is_contiguous())) && + inst->src[0].type == inst->dst.type && + !inst->saturate && + /* Subset of !is_partial_write() conditions. */ + !inst->predicate && inst->dst.is_contiguous()) || + is_identity_payload(FIXED_GRF, inst); +} + +/* Walks a basic block and does copy propagation on it using the acp + * list. + */ +static bool +opt_copy_propagation_local(const brw_compiler *compiler, linear_ctx *lin_ctx, + bblock_t *block, struct acp &acp, + const brw::simple_allocator &alloc, + uint8_t max_polygons) +{ + bool progress = false; + + foreach_inst_in_block(fs_inst, inst, block) { + /* Try propagating into this instruction. */ + bool instruction_progress = false; + for (int i = inst->sources - 1; i >= 0; i--) { + if (inst->src[i].file != VGRF) + continue; + + for (auto iter = acp.find_by_dst(inst->src[i].nr); + iter != acp.end() && (*iter)->dst.nr == inst->src[i].nr; + ++iter) { + if ((*iter)->src.file == IMM) { + if (try_constant_propagate(compiler, inst, *iter, i)) { + instruction_progress = true; + break; + } + } else { + if (try_copy_propagate(compiler, inst, *iter, i, alloc, + max_polygons)) { + instruction_progress = true; + break; + } + } + } + } + + if (instruction_progress) { + progress = true; + + /* ADD3 can only have the immediate as src0. */ + if (inst->opcode == BRW_OPCODE_ADD3) { + if (inst->src[2].file == IMM) { + const auto src0 = inst->src[0]; + inst->src[0] = inst->src[2]; + inst->src[2] = src0; + } + } + + /* If only one of the sources of a 2-source, commutative instruction (e.g., + * AND) is immediate, it must be src1. 
If both are immediate, opt_algebraic + * should fold it away. + */ + if (inst->sources == 2 && inst->is_commutative() && + inst->src[0].file == IMM && inst->src[1].file != IMM) { + const auto src1 = inst->src[1]; + inst->src[1] = inst->src[0]; + inst->src[0] = src1; + } + } + + /* kill the destination from the ACP */ + if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) { + for (auto iter = acp.find_by_dst(inst->dst.nr); + iter != acp.end() && (*iter)->dst.nr == inst->dst.nr; + ++iter) { + if (grf_regions_overlap((*iter)->dst, (*iter)->size_written, + inst->dst, inst->size_written)) + acp.remove(*iter); + } + + for (auto iter = acp.find_by_src(inst->dst.nr); + iter != acp.end() && (*iter)->src.nr == inst->dst.nr; + ++iter) { + /* Make sure we kill the entry if this instruction overwrites + * _any_ of the registers that it reads + */ + if (grf_regions_overlap((*iter)->src, (*iter)->size_read, + inst->dst, inst->size_written)) + acp.remove(*iter); + } + } + + /* If this instruction's source could potentially be folded into the + * operand of another instruction, add it to the ACP. + */ + if (can_propagate_from(inst)) { + acp_entry *entry = linear_zalloc(lin_ctx, acp_entry); + entry->dst = inst->dst; + entry->src = inst->src[0]; + entry->size_written = inst->size_written; + for (unsigned i = 0; i < inst->sources; i++) + entry->size_read += inst->size_read(i); + entry->opcode = inst->opcode; + entry->is_partial_write = inst->is_partial_write(); + entry->force_writemask_all = inst->force_writemask_all; + acp.add(entry); + } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD && + inst->dst.file == VGRF) { + int offset = 0; + for (int i = 0; i < inst->sources; i++) { + int effective_width = i < inst->header_size ? 
8 : inst->exec_size; + const unsigned size_written = effective_width * + type_sz(inst->src[i].type); + if (inst->src[i].file == VGRF || + (inst->src[i].file == FIXED_GRF && + inst->src[i].is_contiguous())) { + const brw_reg_type t = i < inst->header_size ? + BRW_REGISTER_TYPE_UD : inst->src[i].type; + fs_reg dst = byte_offset(retype(inst->dst, t), offset); + if (!dst.equals(inst->src[i])) { + acp_entry *entry = linear_zalloc(lin_ctx, acp_entry); + entry->dst = dst; + entry->src = retype(inst->src[i], t); + entry->size_written = size_written; + entry->size_read = inst->size_read(i); + entry->opcode = inst->opcode; + entry->force_writemask_all = inst->force_writemask_all; + acp.add(entry); + } + } + offset += size_written; + } + } + } + + return progress; +} + +bool +fs_visitor::opt_copy_propagation() +{ + bool progress = false; + void *copy_prop_ctx = ralloc_context(NULL); + linear_ctx *lin_ctx = linear_context(copy_prop_ctx); + struct acp out_acp[cfg->num_blocks]; + + const fs_live_variables &live = live_analysis.require(); + + /* First, walk through each block doing local copy propagation and getting + * the set of copies available at the end of the block. + */ + foreach_block (block, cfg) { + progress = opt_copy_propagation_local(compiler, lin_ctx, block, + out_acp[block->num], alloc, + max_polygons) || progress; + + /* If the destination of an ACP entry exists only within this block, + * then there's no need to keep it for dataflow analysis. We can delete + * it from the out_acp table and avoid growing the bitsets any bigger + * than we absolutely have to. + * + * Because nothing in opt_copy_propagation_local touches the block + * start/end IPs and opt_copy_propagation_local is incapable of + * extending the live range of an ACP destination beyond the block, + * it's safe to use the liveness information in this way. 
+ */ + for (auto iter = out_acp[block->num].begin(); + iter != out_acp[block->num].end(); ++iter) { + assert((*iter)->dst.file == VGRF); + if (block->start_ip <= live.vgrf_start[(*iter)->dst.nr] && + live.vgrf_end[(*iter)->dst.nr] <= block->end_ip) { + out_acp[block->num].remove(*iter); + } + } + } + + /* Do dataflow analysis for those available copies. */ + fs_copy_prop_dataflow dataflow(lin_ctx, cfg, live, out_acp); + + /* Next, re-run local copy propagation, this time with the set of copies + * provided by the dataflow analysis available at the start of a block. + */ + foreach_block (block, cfg) { + struct acp in_acp; + + for (int i = 0; i < dataflow.num_acp; i++) { + if (BITSET_TEST(dataflow.bd[block->num].livein, i) && + !BITSET_TEST(dataflow.bd[block->num].exec_mismatch, i)) { + struct acp_entry *entry = dataflow.acp[i]; + in_acp.add(entry); + } + } + + progress = opt_copy_propagation_local(compiler, lin_ctx, block, + in_acp, alloc, max_polygons) || + progress; + } + + ralloc_free(copy_prop_ctx); + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW | + DEPENDENCY_INSTRUCTION_DETAIL); + + return progress; +} diff --git a/src/intel/compiler/elk/brw_fs_cse.cpp b/src/intel/compiler/elk/brw_fs_cse.cpp new file mode 100644 index 00000000000..8fa1d281b06 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_cse.cpp @@ -0,0 +1,396 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all 
copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_fs.h" +#include "brw_fs_builder.h" +#include "brw_cfg.h" + +/** @file brw_fs_cse.cpp + * + * Support for local common subexpression elimination. + * + * See Muchnick's Advanced Compiler Design and Implementation, section + * 13.1 (p378). + */ + +using namespace brw; + +namespace { +struct aeb_entry : public exec_node { + /** The instruction that generates the expression value. */ + fs_inst *generator; + + /** The temporary where the value is stored. 
*/ + fs_reg tmp; +}; +} + +static bool +is_expression(const fs_visitor *v, const fs_inst *const inst) +{ + switch (inst->opcode) { + case BRW_OPCODE_MOV: + case BRW_OPCODE_SEL: + case BRW_OPCODE_NOT: + case BRW_OPCODE_AND: + case BRW_OPCODE_OR: + case BRW_OPCODE_XOR: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_ASR: + case BRW_OPCODE_CMP: + case BRW_OPCODE_CMPN: + case BRW_OPCODE_ADD: + case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: + case BRW_OPCODE_FRC: + case BRW_OPCODE_RNDU: + case BRW_OPCODE_RNDD: + case BRW_OPCODE_RNDE: + case BRW_OPCODE_RNDZ: + case BRW_OPCODE_LINE: + case BRW_OPCODE_PLN: + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + case FS_OPCODE_FB_READ_LOGICAL: + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: + case FS_OPCODE_LINTERP: + case SHADER_OPCODE_FIND_LIVE_CHANNEL: + case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: + case FS_OPCODE_LOAD_LIVE_CHANNELS: + case SHADER_OPCODE_BROADCAST: + case SHADER_OPCODE_MOV_INDIRECT: + case SHADER_OPCODE_TEX_LOGICAL: + case SHADER_OPCODE_TXD_LOGICAL: + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXL_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_UMS_LOGICAL: + case SHADER_OPCODE_TXF_MCS_LOGICAL: + case SHADER_OPCODE_LOD_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + case FS_OPCODE_PACK: + return true; + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return inst->mlen < 2; + case SHADER_OPCODE_LOAD_PAYLOAD: + return !is_coalescing_payload(v->alloc, inst); + default: + return inst->is_send_from_grf() && !inst->has_side_effects() && + 
!inst->is_volatile(); + } +} + +static bool +operands_match(const fs_inst *a, const fs_inst *b, bool *negate) +{ + fs_reg *xs = a->src; + fs_reg *ys = b->src; + + if (a->opcode == BRW_OPCODE_MAD) { + return xs[0].equals(ys[0]) && + ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) || + (xs[2].equals(ys[1]) && xs[1].equals(ys[2]))); + } else if (a->opcode == BRW_OPCODE_MUL && a->dst.type == BRW_REGISTER_TYPE_F) { + bool xs0_negate = xs[0].negate; + bool xs1_negate = xs[1].file == IMM ? xs[1].f < 0.0f + : xs[1].negate; + bool ys0_negate = ys[0].negate; + bool ys1_negate = ys[1].file == IMM ? ys[1].f < 0.0f + : ys[1].negate; + float xs1_imm = xs[1].f; + float ys1_imm = ys[1].f; + + xs[0].negate = false; + xs[1].negate = false; + ys[0].negate = false; + ys[1].negate = false; + xs[1].f = fabsf(xs[1].f); + ys[1].f = fabsf(ys[1].f); + + bool ret = (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) || + (xs[1].equals(ys[0]) && xs[0].equals(ys[1])); + + xs[0].negate = xs0_negate; + xs[1].negate = xs[1].file == IMM ? false : xs1_negate; + ys[0].negate = ys0_negate; + ys[1].negate = ys[1].file == IMM ? 
false : ys1_negate; + xs[1].f = xs1_imm; + ys[1].f = ys1_imm; + + *negate = (xs0_negate != xs1_negate) != (ys0_negate != ys1_negate); + if (*negate && (a->saturate || b->saturate)) + return false; + return ret; + } else if (!a->is_commutative()) { + bool match = true; + for (int i = 0; i < a->sources; i++) { + if (!xs[i].equals(ys[i])) { + match = false; + break; + } + } + return match; + } else { + return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) || + (xs[1].equals(ys[0]) && xs[0].equals(ys[1])); + } +} + +static bool +instructions_match(fs_inst *a, fs_inst *b, bool *negate) +{ + return a->opcode == b->opcode && + a->force_writemask_all == b->force_writemask_all && + a->exec_size == b->exec_size && + a->group == b->group && + a->saturate == b->saturate && + a->predicate == b->predicate && + a->predicate_inverse == b->predicate_inverse && + a->conditional_mod == b->conditional_mod && + a->flag_subreg == b->flag_subreg && + a->dst.type == b->dst.type && + a->offset == b->offset && + a->mlen == b->mlen && + a->ex_mlen == b->ex_mlen && + a->sfid == b->sfid && + a->desc == b->desc && + a->size_written == b->size_written && + a->base_mrf == b->base_mrf && + a->check_tdr == b->check_tdr && + a->send_has_side_effects == b->send_has_side_effects && + a->eot == b->eot && + a->header_size == b->header_size && + a->shadow_compare == b->shadow_compare && + a->pi_noperspective == b->pi_noperspective && + a->target == b->target && + a->sources == b->sources && + operands_match(a, b, negate); +} + +static void +create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate) +{ + unsigned written = regs_written(inst); + unsigned dst_width = + DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE); + fs_inst *copy; + + if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { + assert(src.file == VGRF); + fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg, + inst->sources); + for (int i = 0; i < inst->header_size; i++) { + payload[i] = src; + 
src.offset += REG_SIZE; + } + for (int i = inst->header_size; i < inst->sources; i++) { + src.type = inst->src[i].type; + payload[i] = src; + src = offset(src, bld, 1); + } + copy = bld.LOAD_PAYLOAD(inst->dst, payload, inst->sources, + inst->header_size); + } else if (written != dst_width) { + assert(src.file == VGRF); + assert(written % dst_width == 0); + const int sources = written / dst_width; + fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources); + for (int i = 0; i < sources; i++) { + payload[i] = src; + src = offset(src, bld, 1); + } + copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, 0); + } else { + copy = bld.MOV(inst->dst, src); + copy->group = inst->group; + copy->force_writemask_all = inst->force_writemask_all; + copy->src[0].negate = negate; + } + assert(regs_written(copy) == written); +} + +bool +fs_visitor::opt_cse_local(const fs_live_variables &live, bblock_t *block, int &ip) +{ + bool progress = false; + exec_list aeb; + + void *cse_ctx = ralloc_context(NULL); + + foreach_inst_in_block(fs_inst, inst, block) { + /* Skip some cases. */ + if (is_expression(this, inst) && !inst->is_partial_write() && + ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) || + inst->dst.is_null())) + { + bool found = false; + bool negate = false; + + foreach_in_list_use_after(aeb_entry, entry, &aeb) { + /* Match current instruction's expression against those in AEB. */ + if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) && + instructions_match(inst, entry->generator, &negate)) { + found = true; + progress = true; + break; + } + } + + if (!found) { + if (inst->opcode != BRW_OPCODE_MOV || + (inst->opcode == BRW_OPCODE_MOV && + inst->src[0].file == IMM && + inst->src[0].type == BRW_REGISTER_TYPE_VF)) { + /* Our first sighting of this expression. Create an entry. 
*/ + aeb_entry *entry = ralloc(cse_ctx, aeb_entry); + entry->tmp = reg_undef; + entry->generator = inst; + aeb.push_tail(entry); + } + } else { + /* This is at least our second sighting of this expression. + * If we don't have a temporary already, make one. + */ + bool no_existing_temp = entry->tmp.file == BAD_FILE; + if (no_existing_temp && !entry->generator->dst.is_null()) { + const fs_builder ibld = fs_builder(this, block, entry->generator) + .at(block, entry->generator->next); + int written = regs_written(entry->generator); + + entry->tmp = fs_reg(VGRF, alloc.allocate(written), + entry->generator->dst.type); + + create_copy_instr(ibld, entry->generator, entry->tmp, false); + + entry->generator->dst = entry->tmp; + } + + /* dest <- temp */ + if (!inst->dst.is_null()) { + assert(inst->size_written == entry->generator->size_written); + assert(inst->dst.type == entry->tmp.type); + const fs_builder ibld(this, block, inst); + + create_copy_instr(ibld, inst, entry->tmp, negate); + } + + /* Set our iterator so that next time through the loop inst->next + * will get the instruction in the basic block after the one we've + * removed. + */ + fs_inst *prev = (fs_inst *)inst->prev; + + inst->remove(block); + inst = prev; + } + } + + /* Discard jumps aren't represented in the CFG unfortunately, so we need + * to make sure that they behave as a CSE barrier, since we lack global + * dataflow information. This is particularly likely to cause problems + * with instructions dependent on the current execution mask like + * SHADER_OPCODE_FIND_LIVE_CHANNEL. + */ + if (inst->opcode == BRW_OPCODE_HALT || + inst->opcode == SHADER_OPCODE_HALT_TARGET) + aeb.make_empty(); + + foreach_in_list_safe(aeb_entry, entry, &aeb) { + /* Kill all AEB entries that write a different value to or read from + * the flag register if we just wrote it. 
+ */ + if (inst->flags_written(devinfo)) { + bool negate; /* dummy */ + if (entry->generator->flags_read(devinfo) || + (entry->generator->flags_written(devinfo) && + !instructions_match(inst, entry->generator, &negate))) { + entry->remove(); + ralloc_free(entry); + continue; + } + } + + for (int i = 0; i < entry->generator->sources; i++) { + fs_reg *src_reg = &entry->generator->src[i]; + + /* Kill all AEB entries that use the destination we just + * overwrote. + */ + if (regions_overlap(inst->dst, inst->size_written, + entry->generator->src[i], + entry->generator->size_read(i))) { + entry->remove(); + ralloc_free(entry); + break; + } + + /* Kill any AEB entries using registers that don't get reused any + * more -- a sure sign they'll fail operands_match(). + */ + if (src_reg->file == VGRF && live.vgrf_end[src_reg->nr] < ip) { + entry->remove(); + ralloc_free(entry); + break; + } + } + } + + ip++; + } + + ralloc_free(cse_ctx); + + return progress; +} + +bool +fs_visitor::opt_cse() +{ + const fs_live_variables &live = live_analysis.require(); + bool progress = false; + int ip = 0; + + foreach_block (block, cfg) { + progress = opt_cse_local(live, block, ip) || progress; + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} diff --git a/src/intel/compiler/elk/brw_fs_dead_code_eliminate.cpp b/src/intel/compiler/elk/brw_fs_dead_code_eliminate.cpp new file mode 100644 index 00000000000..51e1bd549cd --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_dead_code_eliminate.cpp @@ -0,0 +1,152 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom 
the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_fs.h" +#include "brw_fs_live_variables.h" +#include "brw_cfg.h" + +/** @file brw_fs_dead_code_eliminate.cpp + * + * Dataflow-aware dead code elimination. + * + * Walks the instruction list from the bottom, removing instructions that + * have results that both aren't used in later blocks and haven't been read + * yet in the tail end of this block. + */ + +using namespace brw; + +/** + * Is it safe to eliminate the instruction? + */ +static bool +can_eliminate(const intel_device_info *devinfo, const fs_inst *inst, + BITSET_WORD *flag_live) +{ + return !inst->is_control_flow() && + !inst->has_side_effects() && + !(flag_live[0] & inst->flags_written(devinfo)) && + !inst->writes_accumulator; +} + +/** + * Is it safe to omit the write, making the destination ARF null? + */ +static bool +can_omit_write(const fs_inst *inst) +{ + switch (inst->opcode) { + case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + return true; + default: + /* We can eliminate the destination write for ordinary instructions, + * but not most SENDs. 
+ */ + if (inst->opcode < 128 && inst->mlen == 0) + return true; + + /* It might not be safe for other virtual opcodes. */ + return false; + } +} + +bool +fs_visitor::dead_code_eliminate() +{ + bool progress = false; + + const fs_live_variables &live_vars = live_analysis.require(); + int num_vars = live_vars.num_vars; + BITSET_WORD *live = rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(num_vars)); + BITSET_WORD *flag_live = rzalloc_array(NULL, BITSET_WORD, 1); + + foreach_block_reverse_safe(block, cfg) { + memcpy(live, live_vars.block_data[block->num].liveout, + sizeof(BITSET_WORD) * BITSET_WORDS(num_vars)); + memcpy(flag_live, live_vars.block_data[block->num].flag_liveout, + sizeof(BITSET_WORD)); + + foreach_inst_in_block_reverse_safe(fs_inst, inst, block) { + if (inst->dst.file == VGRF) { + const unsigned var = live_vars.var_from_reg(inst->dst); + bool result_live = false; + + for (unsigned i = 0; i < regs_written(inst); i++) + result_live |= BITSET_TEST(live, var + i); + + if (!result_live && + (can_omit_write(inst) || can_eliminate(devinfo, inst, flag_live))) { + inst->dst = fs_reg(spread(retype(brw_null_reg(), inst->dst.type), + inst->dst.stride)); + progress = true; + } + } + + if (inst->dst.is_null() && can_eliminate(devinfo, inst, flag_live)) { + inst->opcode = BRW_OPCODE_NOP; + progress = true; + } + + if (inst->dst.file == VGRF) { + if (!inst->is_partial_write()) { + const unsigned var = live_vars.var_from_reg(inst->dst); + for (unsigned i = 0; i < regs_written(inst); i++) { + BITSET_CLEAR(live, var + i); + } + } + } + + if (!inst->predicate && inst->exec_size >= 8) + flag_live[0] &= ~inst->flags_written(devinfo); + + if (inst->opcode == BRW_OPCODE_NOP) { + inst->remove(block, true); + continue; + } + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + int var = live_vars.var_from_reg(inst->src[i]); + + for (unsigned j = 0; j < regs_read(inst, i); j++) { + BITSET_SET(live, var + j); + } + } + } + + flag_live[0] |= 
inst->flags_read(devinfo); + } + } + + cfg->adjust_block_ips(); + + ralloc_free(live); + ralloc_free(flag_live); + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} diff --git a/src/intel/compiler/elk/brw_fs_generator.cpp b/src/intel/compiler/elk/brw_fs_generator.cpp new file mode 100644 index 00000000000..2525c415ce5 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_generator.cpp @@ -0,0 +1,2544 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_generator.cpp + * + * This file supports generating code from the FS LIR to the actual + * native instructions. 
+ */ + +#include "brw_eu.h" +#include "brw_disasm_info.h" +#include "brw_fs.h" +#include "brw_cfg.h" +#include "dev/intel_debug.h" +#include "util/mesa-sha1.h" +#include "util/half_float.h" + +static enum brw_reg_file +brw_file_from_reg(fs_reg *reg) +{ + switch (reg->file) { + case ARF: + return BRW_ARCHITECTURE_REGISTER_FILE; + case FIXED_GRF: + case VGRF: + return BRW_GENERAL_REGISTER_FILE; + case MRF: + return BRW_MESSAGE_REGISTER_FILE; + case IMM: + return BRW_IMMEDIATE_VALUE; + case BAD_FILE: + case ATTR: + case UNIFORM: + unreachable("not reached"); + } + return BRW_ARCHITECTURE_REGISTER_FILE; +} + +static struct brw_reg +brw_reg_from_fs_reg(const struct intel_device_info *devinfo, fs_inst *inst, + fs_reg *reg, bool compressed) +{ + struct brw_reg brw_reg; + + switch (reg->file) { + case MRF: + assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver)); + FALLTHROUGH; + case VGRF: + if (reg->stride == 0) { + brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0); + } else { + /* From the Haswell PRM: + * + * "VertStride must be used to cross GRF register boundaries. This + * rule implies that elements within a 'Width' cannot cross GRF + * boundaries." + * + * The maximum width value that could satisfy this restriction is: + */ + const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type)); + + /* Because the hardware can only split source regions at a whole + * multiple of width during decompression (i.e. vertically), clamp + * the value obtained above to the physical execution size of a + * single decompressed chunk of the instruction: + */ + const unsigned phys_width = compressed ? 
inst->exec_size / 2 : + inst->exec_size; + + const unsigned max_hw_width = 16; + + /* XXX - The equation above is strictly speaking not correct on + * hardware that supports unbalanced GRF writes -- On Gfx9+ + * each decompressed chunk of the instruction may have a + * different execution size when the number of components + * written to each destination GRF is not the same. + */ + if (reg->stride > 4) { + assert(reg != &inst->dst); + assert(reg->stride * type_sz(reg->type) <= REG_SIZE); + brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0); + brw_reg = stride(brw_reg, reg->stride, 1, 0); + } else { + const unsigned width = MIN3(reg_width, phys_width, max_hw_width); + brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0); + brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride); + } + + if (devinfo->verx10 == 70) { + /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13): + * "Each DF (Double Float) operand uses an element size of 4 rather + * than 8 and all regioning parameters are twice what the values + * would be based on the true element size: ExecSize, Width, + * HorzStride, and VertStride. Each DF operand uses a pair of + * channels and all masking and swizzing should be adjusted + * appropriately." + * + * From the IvyBridge PRM (Special Requirements for Handling Double + * Precision Data Types, page 71): + * "In Align1 mode, all regioning parameters like stride, execution + * size, and width must use the syntax of a pair of packed + * floats. The offsets for these data types must be 64-bit + * aligned. The execution size and regioning parameters are in terms + * of floats." + * + * Summarized: when handling DF-typed arguments, ExecSize, + * VertStride, and Width must be doubled. + * + * It applies to BayTrail too. 
+ */ + if (type_sz(reg->type) == 8) { + brw_reg.width++; + if (brw_reg.vstride > 0) + brw_reg.vstride++; + assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1); + } + + /* When converting from DF->F, we set the destination stride to 2 + * because each d2f conversion implicitly writes 2 floats, being + * the first one the converted value. IVB/BYT actually writes two + * F components per SIMD channel, and every other component is + * filled with garbage. + */ + if (reg == &inst->dst && get_exec_type_size(inst) == 8 && + type_sz(inst->dst.type) < 8) { + assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1); + brw_reg.hstride--; + } + } + } + + brw_reg = retype(brw_reg, reg->type); + brw_reg = byte_offset(brw_reg, reg->offset); + brw_reg.abs = reg->abs; + brw_reg.negate = reg->negate; + break; + case ARF: + case FIXED_GRF: + case IMM: + assert(reg->offset == 0); + brw_reg = reg->as_brw_reg(); + break; + case BAD_FILE: + /* Probably unused. */ + brw_reg = brw_null_reg(); + break; + case ATTR: + case UNIFORM: + unreachable("not reached"); + } + + /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0> + * region, but on IVB and BYT DF regions must be programmed in terms of + * floats. A <0,2,1> region accomplishes this. 
+ */ + if (devinfo->verx10 == 70 && + type_sz(reg->type) == 8 && + brw_reg.vstride == BRW_VERTICAL_STRIDE_0 && + brw_reg.width == BRW_WIDTH_1 && + brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) { + brw_reg.width = BRW_WIDTH_2; + brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1; + } + + return brw_reg; +} + +fs_generator::fs_generator(const struct brw_compiler *compiler, + const struct brw_compile_params *params, + struct brw_stage_prog_data *prog_data, + bool runtime_check_aads_emit, + gl_shader_stage stage) + + : compiler(compiler), params(params), + devinfo(compiler->devinfo), + prog_data(prog_data), dispatch_width(0), + runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false), + shader_name(NULL), stage(stage), mem_ctx(params->mem_ctx) +{ + p = rzalloc(mem_ctx, struct brw_codegen); + brw_init_codegen(&compiler->isa, p, mem_ctx); + + /* In the FS code generator, we are very careful to ensure that we always + * set the right execution size so we don't need the EU code to "help" us + * by trying to infer it. Sometimes, it infers the wrong thing. + */ + p->automatic_exec_sizes = false; +} + +fs_generator::~fs_generator() +{ +} + +class ip_record : public exec_node { +public: + DECLARE_RALLOC_CXX_OPERATORS(ip_record) + + ip_record(int ip) + { + this->ip = ip; + } + + int ip; +}; + +bool +fs_generator::patch_halt_jumps() +{ + if (this->discard_halt_patches.is_empty()) + return false; + + int scale = brw_jump_scale(p->devinfo); + + if (devinfo->ver >= 6) { + /* There is a somewhat strange undocumented requirement of using + * HALT, according to the simulator. If some channel has HALTed to + * a particular UIP, then by the end of the program, every channel + * must have HALTed to that UIP. Furthermore, the tracking is a + * stack, so you can't do the final halt of a UIP after starting + * halting to a new UIP. + * + * Symptoms of not emitting this instruction on actual hardware + * included GPU hangs and sparkly rendering on the piglit discard + * tests. 
+ */ + brw_inst *last_halt = brw_HALT(p); + brw_inst_set_uip(p->devinfo, last_halt, 1 * scale); + brw_inst_set_jip(p->devinfo, last_halt, 1 * scale); + } + + int ip = p->nr_insn; + + foreach_in_list(ip_record, patch_ip, &discard_halt_patches) { + brw_inst *patch = &p->store[patch_ip->ip]; + + assert(brw_inst_opcode(p->isa, patch) == BRW_OPCODE_HALT); + if (devinfo->ver >= 6) { + /* HALT takes a half-instruction distance from the pre-incremented IP. */ + brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale); + } else { + brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale)); + } + } + + this->discard_halt_patches.make_empty(); + + if (devinfo->ver < 6) { + /* From the g965 PRM: + * + * "As DMask is not automatically reloaded into AMask upon completion + * of this instruction, software has to manually restore AMask upon + * completion." + * + * DMask lives in the bottom 16 bits of sr0.1. + */ + brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK), + retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW)); + brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1); + brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE); + brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE); + brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH); + } + + if (devinfo->ver == 4 && devinfo->platform != INTEL_PLATFORM_G4X) { + /* From the g965 PRM: + * + * "[DevBW, DevCL] Erratum: The subfields in mask stack register are + * reset to zero during graphics reset, however, they are not + * initialized at thread dispatch. These subfields will retain the + * values from the previous thread. Software should make sure the + * mask stack is empty (reset to zero) before terminating the thread. + * In case that this is not practical, software may have to reset the + * mask stack at the beginning of each kernel, which will impact the + * performance." 
+ * + * Luckily we can rely on: + * + * "[DevBW, DevCL] This register access restriction is not + * applicable, hardware does ensure execution pipeline coherency, + * when a mask stack register is used as an explicit source and/or + * destination." + */ + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + + brw_set_default_exec_size(p, BRW_EXECUTE_2); + brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0)); + + brw_set_default_exec_size(p, BRW_EXECUTE_16); + /* Reset the if stack. */ + brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW), + brw_imm_uw(0)); + + brw_pop_insn_state(p); + } + + return true; +} + +void +fs_generator::generate_send(fs_inst *inst, + struct brw_reg dst, + struct brw_reg desc, + struct brw_reg ex_desc, + struct brw_reg payload, + struct brw_reg payload2) +{ + const bool dst_is_null = dst.file == BRW_ARCHITECTURE_REGISTER_FILE && + dst.nr == BRW_ARF_NULL; + const unsigned rlen = dst_is_null ? 0 : inst->size_written / REG_SIZE; + + uint32_t desc_imm = inst->desc | + brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size); + + uint32_t ex_desc_imm = inst->ex_desc | + brw_message_ex_desc(devinfo, inst->ex_mlen); + + if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm || + inst->send_ex_desc_scratch) { + /* If we have any sort of extended descriptor, then we need SENDS. This + * also covers the dual-payload case because ex_mlen goes in ex_desc. + */ + brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2, + desc, desc_imm, ex_desc, ex_desc_imm, + inst->send_ex_desc_scratch, + inst->send_ex_bso, inst->eot); + if (inst->check_tdr) + brw_inst_set_opcode(p->isa, brw_last_inst, + devinfo->ver >= 12 ? 
BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC); + } else { + brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm, + inst->eot); + if (inst->check_tdr) + brw_inst_set_opcode(p->isa, brw_last_inst, BRW_OPCODE_SENDC); + } +} + +void +fs_generator::fire_fb_write(fs_inst *inst, + struct brw_reg payload, + struct brw_reg implied_header, + GLuint nr) +{ + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + + if (devinfo->ver < 6) { + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, offset(retype(payload, BRW_REGISTER_TYPE_UD), 1), + offset(retype(implied_header, BRW_REGISTER_TYPE_UD), 1)); + brw_pop_insn_state(p); + } + + uint32_t msg_control = brw_fb_write_msg_control(inst, prog_data); + + /* We assume render targets start at 0, because headerless FB write + * messages set "Render Target Index" to 0. Using a different binding + * table index would make it impossible to use headerless messages. + */ + const uint32_t surf_index = inst->target; + + brw_inst *insn = brw_fb_WRITE(p, + payload, + retype(implied_header, BRW_REGISTER_TYPE_UW), + msg_control, + surf_index, + nr, + 0, + inst->eot, + inst->last_rt, + inst->header_size != 0); + + if (devinfo->ver >= 6) + brw_inst_set_rt_slot_group(devinfo, insn, inst->group / 16); +} + +void +fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload) +{ + assert(devinfo->ver < 7); + + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + + const struct brw_reg implied_header = + devinfo->ver < 6 ? 
payload : brw_null_reg(); + + if (inst->base_mrf >= 0) + payload = brw_message_reg(inst->base_mrf); + + if (!runtime_check_aads_emit) { + fire_fb_write(inst, payload, implied_header, inst->mlen); + } else { + /* This can only happen in gen < 6 */ + assert(devinfo->ver < 6); + + struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); + + /* Check runtime bit to detect if we have to send AA data or not */ + brw_push_insn_state(p); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_AND(p, + v1_null_ud, + retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD), + brw_imm_ud(1<<26)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + + int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store; + brw_pop_insn_state(p); + { + /* Don't send AA data */ + fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1); + } + brw_land_fwd_jump(p, jmp); + fire_fb_write(inst, payload, implied_header, inst->mlen); + } +} + +void +fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst, + struct brw_reg payload) +{ + assert(inst->size_written % REG_SIZE == 0); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + /* We assume that render targets start at binding table index 0. 
 */
   const unsigned surf_index = inst->target;

   gfx9_fb_READ(p, dst, payload, surf_index,
                inst->header_size, inst->size_written / REG_SIZE,
                prog_data->persample_dispatch);
}

/**
 * Generate code for an indirect MOV: read from \p reg offset by the
 * per-channel byte offsets in \p indirect_byte_offset into \p dst.
 *
 * A constant (immediate) offset is folded directly into the register
 * number/subnumber; a variable offset uses VxH indirect addressing
 * through the address register (clobbering a0.0-a0.7).
 */
void
fs_generator::generate_mov_indirect(fs_inst *inst,
                                    struct brw_reg dst,
                                    struct brw_reg reg,
                                    struct brw_reg indirect_byte_offset)
{
   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
   assert(!reg.abs && !reg.negate);

   /* Gen12.5 adds the following region restriction:
    *
    *    "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float
    *    and Quad-Word data must not be used."
    *
    * We require the source and destination types to match so stomp to an
    * unsigned integer type.
    */
   assert(reg.type == dst.type);
   reg.type = dst.type = brw_reg_type_from_bit_size(type_sz(reg.type) * 8,
                                                    BRW_REGISTER_TYPE_UD);

   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;

   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
      /* Constant offset: fold it into the register address directly; no
       * indirect addressing is needed at all.
       */
      imm_byte_offset += indirect_byte_offset.ud;

      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = imm_byte_offset % REG_SIZE;
      if (type_sz(reg.type) > 4 && !devinfo->has_64bit_float) {
         /* No 64-bit float support: move the value as two 32-bit halves. */
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                 subscript(reg, BRW_REGISTER_TYPE_D, 0));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                 subscript(reg, BRW_REGISTER_TYPE_D, 1));
      } else {
         brw_MOV(p, dst, reg);
      }
   } else {
      /* Prior to Broadwell, there are only 8 address registers. */
      assert(inst->exec_size <= 8 || devinfo->ver >= 8);

      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
      struct brw_reg addr = vec8(brw_address_reg(0));

      /* Whether we can use destination dependency control without running the
       * risk of a hang if an instruction gets shot down.
       */
      const bool use_dep_ctrl = !inst->predicate &&
                                inst->exec_size == dispatch_width;
      brw_inst *insn;

      /* The destination stride of an instruction (in bytes) must be greater
       * than or equal to the size of the rest of the instruction. Since the
       * address register is of type UW, we can't use a D-type instruction.
       * In order to get around this, we retype to UW and use a stride.
       */
      indirect_byte_offset =
         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);

      /* There are a number of reasons why we don't use the base offset here.
       * One reason is that the field is only 9 bits which means we can only
       * use it to access the first 16 GRFs. Also, from the Haswell PRM
       * section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address. The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Since the indirect may cause us to cross a register boundary, this
       * makes the base offset almost useless. We could try and do something
       * clever where we use an actual base offset if base_offset % 32 == 0 but
       * that would mean we were generating different code depending on the
       * base offset. Instead, for the sake of consistency, we'll just do the
       * add ourselves. This restriction is only listed in the Haswell PRM
       * but empirical testing indicates that it applies on all older
       * generations and is lifted on Broadwell.
       *
       * In the end, while base_offset is nice to look at in the generated
       * code, using it saves us 0 instructions and would require quite a bit
       * of case-by-case work. It's just not worth it.
       *
       * Due to a hardware bug some platforms (particularly Gfx11+) seem to
       * require the address components of all channels to be valid whether or
       * not they're active, which causes issues if we use VxH addressing
       * under non-uniform control-flow. We can easily work around that by
       * initializing the whole address register with a pipelined NoMask MOV
       * instruction.
       */
      if (devinfo->ver >= 7) {
         insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset));
         brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
         brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
         if (devinfo->ver >= 12)
            brw_set_default_swsb(p, tgl_swsb_null());
         else
            brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);
      }

      insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
      if (devinfo->ver >= 12)
         brw_set_default_swsb(p, tgl_swsb_regdist(1));
      else if (devinfo->ver >= 7)
         brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);

      if (type_sz(reg.type) > 4 &&
          ((devinfo->verx10 == 70) ||
           devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo) ||
           !devinfo->has_64bit_float || devinfo->verx10 >= 125)) {
         /* IVB has an issue (which we found empirically) where it reads two
          * address register components per channel for indirectly addressed
          * 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *    integer DWord multiply, indirect addressing must not be used."
          *
          * To work around both of these, we do two integer MOVs instead of one
          * 64-bit MOV. Because no double value should ever cross a register
          * boundary, it's safe to use the immediate offset in the indirect
          * here to handle adding 4 bytes to the offset and avoid the extra
          * ADD to the register file.
          */
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                 retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                 retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
      } else {
         struct brw_reg ind_src = brw_VxH_indirect(0, 0);

         brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));

         if (devinfo->ver == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
             !inst->get_next()->is_tail_sentinel() &&
             ((fs_inst *)inst->get_next())->mlen > 0) {
            /* From the Sandybridge PRM:
             *
             *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
             *    instruction that “indexed/indirect” source AND is followed
             *    by a send, the instruction requires a “Switch”. This is to
             *    avoid race condition where send may dispatch before MRF is
             *    updated."
             */
            brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
         }
      }
   }
}

/**
 * Generate code to shuffle the channels of \p src into \p dst according to
 * the per-channel indices in \p idx, using VxH indirect addressing
 * (clobbers a0.0-a0.7).
 */
void
fs_generator::generate_shuffle(fs_inst *inst,
                               struct brw_reg dst,
                               struct brw_reg src,
                               struct brw_reg idx)
{
   assert(src.file == BRW_GENERAL_REGISTER_FILE);
   assert(!src.abs && !src.negate);

   /* Ivy bridge has some strange behavior that makes this a real pain to
    * implement for 64-bit values so we just don't bother.
    */
   assert((devinfo->verx10 >= 75 && devinfo->has_64bit_float) ||
          type_sz(src.type) <= 4);

   /* Gen12.5 adds the following region restriction:
    *
    *    "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float
    *    and Quad-Word data must not be used."
    *
    * We require the source and destination types to match so stomp to an
    * unsigned integer type.
    */
   assert(src.type == dst.type);
   src.type = dst.type = brw_reg_type_from_bit_size(type_sz(src.type) * 8,
                                                    BRW_REGISTER_TYPE_UD);

   /* Because we're using the address register, we're limited to 8-wide
    * execution on gfx7. On gfx8, we're limited to 16-wide by the address
    * register file and 8-wide for 64-bit types.
We could try and make this
    * instruction splittable higher up in the compiler but that gets weird
    * because it reads all of the channels regardless of execution size. It's
    * easier just to split it here.
    */
   const unsigned lower_width =
      devinfo->ver <= 7 || element_sz(src) > 4 || element_sz(dst) > 4 ? 8 :
      MIN2(16, inst->exec_size);

   brw_set_default_exec_size(p, cvt(lower_width) - 1);
   for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
      brw_set_default_group(p, group);

      if ((src.vstride == 0 && src.hstride == 0) ||
          idx.file == BRW_IMMEDIATE_VALUE) {
         /* Trivial, the source is already uniform or the index is a constant.
          * We will typically not get here if the optimizer is doing its job,
          * but asserting would be mean.
          */
         const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
         struct brw_reg group_src = stride(suboffset(src, i), 0, 1, 0);
         struct brw_reg group_dst = suboffset(dst, group << (dst.hstride - 1));
         brw_MOV(p, group_dst, group_src);
      } else {
         /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
         struct brw_reg addr = vec8(brw_address_reg(0));

         struct brw_reg group_idx = suboffset(idx, group);

         if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
            /* Things get grumpy if the register is too wide. */
            group_idx.width--;
            group_idx.vstride--;
         }

         assert(type_sz(group_idx.type) <= 4);
         if (type_sz(group_idx.type) == 4) {
            /* The destination stride of an instruction (in bytes) must be
             * greater than or equal to the size of the rest of the
             * instruction. Since the address register is of type UW, we
             * can't use a D-type instruction. In order to get around this,
             * we retype to UW and use a stride.
             */
            group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W);
         }

         uint32_t src_start_offset = src.nr * REG_SIZE + src.subnr;

         /* From the Haswell PRM:
          *
          *    "When a sequence of NoDDChk and NoDDClr are used, the last
          *    instruction that completes the scoreboard clear must have a
          *    non-zero execution mask. This means, if any kind of predication
          *    can change the execution mask or channel enable of the last
          *    instruction, the optimization must be avoided. This is to
          *    avoid instructions being shot down the pipeline when no writes
          *    are required."
          *
          * Whenever predication is enabled or the instructions being emitted
          * aren't the full width, it's possible that it will be run with zero
          * channels enabled so we can't use dependency control without
          * running the risk of a hang if an instruction gets shot down.
          */
         const bool use_dep_ctrl = !inst->predicate &&
                                   lower_width == dispatch_width;
         brw_inst *insn;

         /* Due to a hardware bug some platforms (particularly Gfx11+) seem
          * to require the address components of all channels to be valid
          * whether or not they're active, which causes issues if we use VxH
          * addressing under non-uniform control-flow. We can easily work
          * around that by initializing the whole address register with a
          * pipelined NoMask MOV instruction.
          */
         insn = brw_MOV(p, addr, brw_imm_uw(src_start_offset));
         brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
         brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
         if (devinfo->ver >= 12)
            brw_set_default_swsb(p, tgl_swsb_null());
         else
            brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);

         /* Take into account the component size and horizontal stride.
          */
         assert(src.vstride == src.hstride + src.width);
         insn = brw_SHL(p, addr, group_idx,
                        brw_imm_uw(util_logbase2(type_sz(src.type)) +
                                   src.hstride - 1));
         if (devinfo->ver >= 12)
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
         else
            brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);

         /* Add on the register start offset */
         brw_ADD(p, addr, addr, brw_imm_uw(src_start_offset));
         brw_MOV(p, suboffset(dst, group << (dst.hstride - 1)),
                 retype(brw_VxH_indirect(0, 0), src.type));
      }

      brw_set_default_swsb(p, tgl_swsb_null());
   }
}

/**
 * Generate code for a quad (4-channel) swizzle of \p src into \p dst,
 * picking the cheapest strategy: a plain MOV for uniform sources, an
 * Align16-swizzled MOV where supported, a strided MOV for the regular
 * swizzle patterns, or per-component MOVs as a fallback.
 */
void
fs_generator::generate_quad_swizzle(const fs_inst *inst,
                                    struct brw_reg dst, struct brw_reg src,
                                    unsigned swiz)
{
   /* Requires a quad. */
   assert(inst->exec_size >= 4);

   if (src.file == BRW_IMMEDIATE_VALUE ||
       has_scalar_region(src)) {
      /* The value is uniform across all channels */
      brw_MOV(p, dst, src);

   } else if (devinfo->ver < 11 && type_sz(src.type) == 4) {
      /* This only works on 8-wide 32-bit values */
      assert(inst->exec_size == 8);
      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src.vstride == src.width + 1);
      brw_set_default_access_mode(p, BRW_ALIGN_16);
      struct brw_reg swiz_src = stride(src, 4, 4, 1);
      swiz_src.swizzle = swiz;
      brw_MOV(p, dst, swiz_src);

   } else {
      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src.vstride == src.width + 1);
      const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0));

      switch (swiz) {
      case BRW_SWIZZLE_XXXX:
      case BRW_SWIZZLE_YYYY:
      case BRW_SWIZZLE_ZZZZ:
      case BRW_SWIZZLE_WWWW:
         /* Replicate one component across the quad. */
         brw_MOV(p, dst, stride(src_0, 4, 4, 0));
         break;

      case BRW_SWIZZLE_XXZZ:
      case BRW_SWIZZLE_YYWW:
         brw_MOV(p, dst, stride(src_0, 2, 2, 0));
         break;

      case BRW_SWIZZLE_XYXY:
      case BRW_SWIZZLE_ZWZW:
         assert(inst->exec_size == 4);
         brw_MOV(p, dst, stride(src_0, 0, 2, 1));
         break;

      default:
         assert(inst->force_writemask_all);
         brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);

         for (unsigned c = 0; c < 4;
c++) { + brw_inst *insn = brw_MOV( + p, stride(suboffset(dst, c), + 4 * inst->dst.stride, 1, 4 * inst->dst.stride), + stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0)); + + if (devinfo->ver < 12) { + brw_inst_set_no_dd_clear(devinfo, insn, c < 3); + brw_inst_set_no_dd_check(devinfo, insn, c > 0); + } + + brw_set_default_swsb(p, tgl_swsb_null()); + } + + break; + } + } +} + +void +fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload) +{ + struct brw_inst *insn; + + insn = brw_next_insn(p, BRW_OPCODE_SEND); + + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW)); + brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW)); + if (devinfo->ver < 12) + brw_set_src1(p, insn, brw_imm_ud(0u)); + + /* For XeHP and newer send a message to the message gateway to terminate a + * compute shader. For older devices, a message is sent to the thread + * spawner. + */ + if (devinfo->verx10 >= 125) + brw_inst_set_sfid(devinfo, insn, BRW_SFID_MESSAGE_GATEWAY); + else + brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER); + brw_inst_set_mlen(devinfo, insn, 1); + brw_inst_set_rlen(devinfo, insn, 0); + brw_inst_set_eot(devinfo, insn, inst->eot); + brw_inst_set_header_present(devinfo, insn, false); + + brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */ + + if (devinfo->ver < 11) { + brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */ + + /* Note that even though the thread has a URB resource associated with it, + * we set the "do not dereference URB" bit, because the URB resource is + * managed by the fixed-function unit, so it will free it automatically. 
+ */ + brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */ + } + + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); +} + +void +fs_generator::generate_barrier(fs_inst *, struct brw_reg src) +{ + brw_barrier(p, src); + if (devinfo->ver >= 12) { + brw_set_default_swsb(p, tgl_swsb_null()); + brw_SYNC(p, TGL_SYNC_BAR); + } else { + brw_WAIT(p); + } +} + +bool +fs_generator::generate_linterp(fs_inst *inst, + struct brw_reg dst, struct brw_reg *src) +{ + /* PLN reads: + * / in SIMD16 \ + * ----------------------------------- + * | src1+0 | src1+1 | src1+2 | src1+3 | + * |-----------------------------------| + * |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)| + * ----------------------------------- + * + * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys: + * + * ----------------------------------- + * | src1+0 | src1+1 | src1+2 | src1+3 | + * |-----------------------------------| + * |(x0, x1)|(y0, y1)| | | in SIMD8 + * |-----------------------------------| + * |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16 + * ----------------------------------- + * + * See also: emit_interpolation_setup_gfx4(). + */ + struct brw_reg delta_x = src[0]; + struct brw_reg delta_y = offset(src[0], inst->exec_size / 8); + struct brw_reg interp = src[1]; + brw_inst *i[2]; + + /* nir_lower_interpolation() will do the lowering to MAD instructions for + * us on gfx11+ + */ + assert(devinfo->ver < 11); + + if (devinfo->has_pln) { + if (devinfo->ver <= 6 && (delta_x.nr & 1) != 0) { + /* From the Sandy Bridge PRM Vol. 4, Pt. 2, Section 8.3.53, "Plane": + * + * "[DevSNB]: must be even register aligned. + * + * This restriction is lifted on Ivy Bridge. + * + * This means that we need to split PLN into LINE+MAC on-the-fly. + * Unfortunately, the inputs are laid out for PLN and not LINE+MAC so + * we have to split into SIMD8 pieces. For gfx4 (!has_pln), the + * coordinate registers are laid out differently so we leave it as a + * SIMD16 instruction. 
+ */ + assert(inst->exec_size == 8 || inst->exec_size == 16); + assert(inst->group % 16 == 0); + + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + + /* Thanks to two accumulators, we can emit all the LINEs and then all + * the MACs. This improves parallelism a bit. + */ + for (unsigned g = 0; g < inst->exec_size / 8; g++) { + brw_inst *line = brw_LINE(p, brw_null_reg(), interp, + offset(delta_x, g * 2)); + brw_inst_set_group(devinfo, line, inst->group + g * 8); + + /* LINE writes the accumulator automatically on gfx4-5. On Sandy + * Bridge and later, we have to explicitly enable it. + */ + if (devinfo->ver >= 6) + brw_inst_set_acc_wr_control(p->devinfo, line, true); + + /* brw_set_default_saturate() is called before emitting + * instructions, so the saturate bit is set in each instruction, + * so we need to unset it on the LINE instructions. + */ + brw_inst_set_saturate(p->devinfo, line, false); + } + + for (unsigned g = 0; g < inst->exec_size / 8; g++) { + brw_inst *mac = brw_MAC(p, offset(dst, g), suboffset(interp, 1), + offset(delta_x, g * 2 + 1)); + brw_inst_set_group(devinfo, mac, inst->group + g * 8); + brw_inst_set_cond_modifier(p->devinfo, mac, inst->conditional_mod); + } + + brw_pop_insn_state(p); + + return true; + } else { + brw_PLN(p, dst, interp, delta_x); + + return false; + } + } else { + i[0] = brw_LINE(p, brw_null_reg(), interp, delta_x); + i[1] = brw_MAC(p, dst, suboffset(interp, 1), delta_y); + + brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod); + + /* brw_set_default_saturate() is called before emitting instructions, so + * the saturate bit is set in each instruction, so we need to unset it on + * the first instruction. 
       */
      brw_inst_set_saturate(p->devinfo, i[0], false);

      return true;
   }
}

/* Emit a gfx4-6 sampler message for \p inst: choose the message type and
 * SIMD mode from the opcode, shadow-compare state and execution size, set
 * up the message header (implied g0 move or explicit MRF copy plus texel
 * offset), and emit the final SAMPLE send.
 */
void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst,
                           struct brw_reg surface_index,
                           struct brw_reg sampler_index)
{
   assert(devinfo->ver < 7);
   assert(inst->size_written % REG_SIZE == 0);
   int msg_type = -1;
   uint32_t simd_mode;
   uint32_t return_format;

   /* Sampler EOT message of less than the dispatch width would kill the
    * thread prematurely.
    */
   assert(!inst->eot || inst->exec_size == dispatch_width);

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
    * is set as part of the message descriptor.  On gfx4, the PRM seems to
    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
    * gone from the message descriptor entirely and you just get UINT32 all
    * the time regardless.  Since we can really only do non-UINT32 on gfx4,
    * just stomp it to UINT32 all the time.
    */
   if (inst->opcode == SHADER_OPCODE_TXS)
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (devinfo->ver >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
         } else {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
         } else {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TXD:
         assert(!inst->shadow_compare);
         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_CMS:
         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GFX5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         assert(devinfo->ver == 6);
         assert(!inst->shadow_compare);
         msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
         unreachable("not reached");
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         /* Note that G45 and older determines shadow compare and dispatch width
          * from message length for most messages.
          */
         if (inst->exec_size == 8) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
            if (inst->shadow_compare) {
               assert(inst->mlen == 6);
            } else {
               assert(inst->mlen <= 4);
            }
         } else {
            if (inst->shadow_compare) {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
               assert(inst->mlen == 9);
            } else {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
            }
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
            /* Overrides the exec-size based choice made above. */
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->exec_size == 8);
         assert(inst->mlen == 7 || inst->mlen == 10);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
         break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      case SHADER_OPCODE_TXS:
         assert(inst->mlen == 3);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      default:
         unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      dst = vec16(dst);
   }

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   struct brw_reg src = brw_null_reg();
   if (inst->header_size != 0) {
      if (devinfo->ver < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         const tgl_swsb swsb = brw_get_default_swsb(p);
         assert(inst->base_mrf != -1);
         struct brw_reg header_reg = brw_message_reg(inst->base_mrf);

         brw_push_insn_state(p);
         brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
         brw_set_default_swsb(p, tgl_swsb_regdist(1));

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                    brw_imm_ud(inst->offset));
         }

         brw_pop_insn_state(p);
         brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
      }
   }

   assert(surface_index.file == BRW_IMMEDIATE_VALUE);
   assert(sampler_index.file == BRW_IMMEDIATE_VALUE);

   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              surface_index.ud,
              sampler_index.ud % 16,
              msg_type,
              inst->size_written / REG_SIZE,
              inst->mlen,
              inst->header_size != 0,
              simd_mode,
              return_format);
}


/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 * looking like:
 *
 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 *
 * Ideally, we want to produce:
 *
 *           DDX                    DDY
 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
+ * (ss1.br - ss1.bl) (ss1.tr - ss1.br) + * + * and add another set of two more subspans if in 16-pixel dispatch mode. + * + * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result + * for each pair, and vertstride = 2 jumps us 2 elements after processing a + * pair. But the ideal approximation may impose a huge performance cost on + * sample_d. On at least Haswell, sample_d instruction does some + * optimizations if the same LOD is used for all pixels in the subspan. + * + * For DDY, we need to use ALIGN16 mode since it's capable of doing the + * appropriate swizzling. + */ +void +fs_generator::generate_ddx(const fs_inst *inst, + struct brw_reg dst, struct brw_reg src) +{ + unsigned vstride, width; + + if (devinfo->ver >= 8) { + if (inst->opcode == FS_OPCODE_DDX_FINE) { + /* produce accurate derivatives */ + vstride = BRW_VERTICAL_STRIDE_2; + width = BRW_WIDTH_2; + } else { + /* replicate the derivative at the top-left pixel to other pixels */ + vstride = BRW_VERTICAL_STRIDE_4; + width = BRW_WIDTH_4; + } + + struct brw_reg src0 = byte_offset(src, type_sz(src.type));; + struct brw_reg src1 = src; + + src0.vstride = vstride; + src0.width = width; + src0.hstride = BRW_HORIZONTAL_STRIDE_0; + src1.vstride = vstride; + src1.width = width; + src1.hstride = BRW_HORIZONTAL_STRIDE_0; + + brw_ADD(p, dst, src0, negate(src1)); + } else { + /* On Haswell and earlier, the region used above appears to not work + * correctly for compressed instructions. At least on Haswell and + * Iron Lake, compressed ALIGN16 instructions do work. Since we + * would have to split to SIMD8 no matter which method we choose, we + * may as well use ALIGN16 on all platforms gfx7 and earlier. 
+ */ + struct brw_reg src0 = stride(src, 4, 4, 1); + struct brw_reg src1 = stride(src, 4, 4, 1); + if (inst->opcode == FS_OPCODE_DDX_FINE) { + src0.swizzle = BRW_SWIZZLE_XXZZ; + src1.swizzle = BRW_SWIZZLE_YYWW; + } else { + src0.swizzle = BRW_SWIZZLE_XXXX; + src1.swizzle = BRW_SWIZZLE_YYYY; + } + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_ADD(p, dst, negate(src0), src1); + brw_pop_insn_state(p); + } +} + +/* The negate_value boolean is used to negate the derivative computation for + * FBOs, since they place the origin at the upper left instead of the lower + * left. + */ +void +fs_generator::generate_ddy(const fs_inst *inst, + struct brw_reg dst, struct brw_reg src) +{ + const uint32_t type_size = type_sz(src.type); + + if (inst->opcode == FS_OPCODE_DDY_FINE) { + /* produce accurate derivatives. + * + * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU) + * "Register Region Restrictions", Section "1. Special Restrictions": + * + * "In Align16 mode, the channel selects and channel enables apply to + * a pair of half-floats, because these parameters are defined for + * DWord elements ONLY. This is applicable when both source and + * destination are half-floats." + * + * So for half-float operations we use the Gfx11+ Align1 path. CHV + * inherits its FP16 hardware from SKL, so it is not affected. 
+ */ + if (devinfo->ver >= 11 || + (devinfo->platform == INTEL_PLATFORM_BDW && src.type == BRW_REGISTER_TYPE_HF)) { + src = stride(src, 0, 2, 1); + + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_4); + for (uint32_t g = 0; g < inst->exec_size; g += 4) { + brw_set_default_group(p, inst->group + g); + brw_ADD(p, byte_offset(dst, g * type_size), + negate(byte_offset(src, g * type_size)), + byte_offset(src, (g + 2) * type_size)); + brw_set_default_swsb(p, tgl_swsb_null()); + } + brw_pop_insn_state(p); + } else { + struct brw_reg src0 = stride(src, 4, 4, 1); + struct brw_reg src1 = stride(src, 4, 4, 1); + src0.swizzle = BRW_SWIZZLE_XYXY; + src1.swizzle = BRW_SWIZZLE_ZWZW; + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_ADD(p, dst, negate(src0), src1); + brw_pop_insn_state(p); + } + } else { + /* replicate the derivative at the top-left pixel to other pixels */ + if (devinfo->ver >= 8) { + struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size); + struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size); + + brw_ADD(p, dst, negate(src0), src1); + } else { + /* On Haswell and earlier, the region used above appears to not work + * correctly for compressed instructions. At least on Haswell and + * Iron Lake, compressed ALIGN16 instructions do work. Since we + * would have to split to SIMD8 no matter which method we choose, we + * may as well use ALIGN16 on all platforms gfx7 and earlier. 
+ */ + struct brw_reg src0 = stride(src, 4, 4, 1); + struct brw_reg src1 = stride(src, 4, 4, 1); + src0.swizzle = BRW_SWIZZLE_XXXX; + src1.swizzle = BRW_SWIZZLE_ZZZZ; + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_ADD(p, dst, negate(src0), src1); + brw_pop_insn_state(p); + } + } +} + +void +fs_generator::generate_halt(fs_inst *) +{ + /* This HALT will be patched up at FB write time to point UIP at the end of + * the program, and at brw_uip_jip() JIP will be set to the end of the + * current block (or the program). + */ + this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn)); + brw_HALT(p); +} + +void +fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src) +{ + /* The 32-wide messages only respect the first 16-wide half of the channel + * enable signals which are replicated identically for the second group of + * 16 channels, so we cannot use them unless the write is marked + * force_writemask_all. + */ + const unsigned lower_size = inst->force_writemask_all ? 
inst->exec_size : + MIN2(16, inst->exec_size); + const unsigned block_size = 4 * lower_size / REG_SIZE; + const tgl_swsb swsb = brw_get_default_swsb(p); + assert(inst->mlen != 0); + + brw_push_insn_state(p); + brw_set_default_exec_size(p, cvt(lower_size) - 1); + brw_set_default_compression(p, lower_size > 8); + + for (unsigned i = 0; i < inst->exec_size / lower_size; i++) { + brw_set_default_group(p, inst->group + lower_size * i); + + if (i > 0) { + assert(swsb.mode & TGL_SBID_SET); + brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_SRC, swsb.sbid)); + } else { + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + } + + brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0), + retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD)); + + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), + block_size, + inst->offset + block_size * REG_SIZE * i); + } + + brw_pop_insn_state(p); +} + +void +fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst) +{ + assert(inst->exec_size <= 16 || inst->force_writemask_all); + assert(inst->mlen != 0); + + brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), + inst->exec_size / 8, inst->offset); +} + +void +fs_generator::generate_scratch_read_gfx7(fs_inst *inst, struct brw_reg dst) +{ + assert(inst->exec_size <= 16 || inst->force_writemask_all); + + gfx7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset); +} + +/* The A32 messages take a buffer base address in header.5:[31:0] (See + * MH1_A32_PSM for typed messages or MH_A32_GO for byte/dword scattered + * and OWord block messages in the SKL PRM Vol. 2d for more details.) + * Unfortunately, there are a number of subtle differences: + * + * For the block read/write messages: + * + * - We always stomp header.2 to fill in the actual scratch address (in + * units of OWORDs) so we don't care what's in there. 
 *
 *   - They rely on the per-thread scratch space value in header.3[3:0] to do
 *     bounds checking, so that needs to be valid.  The upper bits of
 *     header.3 are ignored, though, so we can copy all of g0.3.
 *
 *   - They ignore header.5[9:0] and assume the address is 1KB aligned.
 *
 *
 * For the byte/dword scattered read/write messages:
 *
 *   - We want header.2 to be zero because that gets added to the per-channel
 *     offset in the non-header portion of the message.
 *
 *   - Contrary to what the docs claim, they don't do any bounds checking, so
 *     the value of header.3[3:0] doesn't matter.
 *
 *   - They consider all of header.5 for the base address, and header.5[9:0]
 *     are not ignored.  This means that we can't copy g0.5 verbatim because
 *     g0.5[9:0] contains the FFTID on most platforms.  Instead, we have to
 *     use an AND to mask off the bottom 10 bits.
 *
 *
 * For block messages, just copying g0 gives a valid header because all the
 * garbage gets ignored except for header.2, which we stomp as part of message
 * setup.  For byte/dword scattered messages, we can just zero out the header
 * and copy over the bits we need from g0.5.  This opcode, however, tries to
 * satisfy the requirements of both by starting with 0 and filling out the
 * information required by either set of opcodes.
 */
/* Build a message header in \p dst that satisfies both the block and the
 * byte/dword scattered A32 message requirements (see the comment above):
 * start from zero, then fill in the scratch size from g0.3[3:0] and the
 * 1KB-aligned scratch base from g0.5[31:10].
 */
void
fs_generator::generate_scratch_header(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->exec_size == 8 && inst->force_writemask_all);
   assert(dst.file == BRW_GENERAL_REGISTER_FILE);

   dst.type = BRW_REGISTER_TYPE_UD;

   brw_inst *insn = brw_MOV(p, dst, brw_imm_ud(0));
   if (devinfo->ver >= 12)
      brw_set_default_swsb(p, tgl_swsb_null());
   else
      brw_inst_set_no_dd_clear(p->devinfo, insn, true);

   /* Copy the per-thread scratch space size from g0.3[3:0] */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   insn = brw_AND(p, suboffset(dst, 3),
                     retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                     brw_imm_ud(INTEL_MASK(3, 0)));
   if (devinfo->ver < 12) {
      brw_inst_set_no_dd_clear(p->devinfo, insn, true);
      brw_inst_set_no_dd_check(p->devinfo, insn, true);
   }

   /* Copy the scratch base address from g0.5[31:10] */
   insn = brw_AND(p, suboffset(dst, 5),
                     retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
                     brw_imm_ud(INTEL_MASK(31, 10)));
   if (devinfo->ver < 12)
      brw_inst_set_no_dd_check(p->devinfo, insn, true);
}

/* Emit an OWord block read of uniform (push-constant style) data from the
 * surface \p index at immediate \p offset into \p dst.
 */
void
fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index,
                                                  struct brw_reg offset)
{
   assert(type_sz(dst.type) == 4);
   assert(inst->mlen != 0);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   assert(offset.file == BRW_IMMEDIATE_VALUE &&
          offset.type == BRW_REGISTER_TYPE_UD);
   uint32_t read_offset = offset.ud;

   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
                        read_offset, surf_index);
}

/* Emit a pre-gfx7 varying-offset pull-constant load, implemented as a
 * sampler LD message against the constant surface \p index.
 */
void
fs_generator::generate_varying_pull_constant_load_gfx4(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index)
{
   assert(devinfo->ver < 7); /* Should use the gfx7 variant. */
   assert(inst->header_size != 0);
   assert(inst->mlen);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   uint32_t simd_mode, rlen, msg_type;
   if (inst->exec_size == 16) {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      rlen = 8;
   } else {
      assert(inst->exec_size == 8);
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      rlen = 4;
   }

   if (devinfo->ver >= 5)
      msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   else {
      /* We always use the SIMD16 message so that we only have to load U, and
       * not V or R.
       */
      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
      assert(inst->mlen == 3);
      assert(inst->size_written == 8 * REG_SIZE);
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   }

   struct brw_reg header = brw_vec8_grf(0, 0);
   gfx6_resolve_implied_move(p, &header, inst->base_mrf);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_compression(devinfo, send, false);
   brw_inst_set_sfid(devinfo, send, BRW_SFID_SAMPLER);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, send, header);
   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);

   /* Our surface is set up as floats, regardless of what actual data is
    * stored in it.
    */
   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
   brw_set_desc(p, send,
                brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size) |
                brw_sampler_desc(devinfo, surf_index,
                                 0, /* sampler (unused) */
                                 msg_type, simd_mode, return_format));
}

/* Sets vstride=1, width=4, hstride=0 of register src1 during
 * the ADD instruction.
 */
void
fs_generator::generate_set_sample_id(fs_inst *inst,
                                     struct brw_reg dst,
                                     struct brw_reg src0,
                                     struct brw_reg src1)
{
   assert(dst.type == BRW_REGISTER_TYPE_D ||
          dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_D ||
          src0.type == BRW_REGISTER_TYPE_UD);

   /* Replicate each element of src1 across four channels (v1w4h0). */
   const struct brw_reg reg = stride(src1, 1, 4, 0);
   /* Split into at most 16-wide (8-wide before gfx8) instruction groups. */
   const unsigned lower_size = MIN2(inst->exec_size,
                                    devinfo->ver >= 8 ? 16 : 8);

   for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
      /* The src0 offset term below rebuilds the register offset implied by
       * src0's region (vstride/width) for the i'th instruction group.
       */
      brw_inst *insn = brw_ADD(p, offset(dst, i * lower_size / 8),
                               offset(src0, (src0.vstride == 0 ? 0 : (1 << (src0.vstride - 1)) *
                                             (i * lower_size / (1 << src0.width))) *
                                            type_sz(src0.type) / REG_SIZE),
                               suboffset(reg, i * lower_size / 4));
      brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1);
      brw_inst_set_group(devinfo, insn, inst->group + lower_size * i);
      brw_inst_set_compression(devinfo, insn, lower_size > 8);
      brw_set_default_swsb(p, tgl_swsb_null());
   }
}

/* Record the shader name and enable debug output for this generator
 * (debug_flag gates the disassembly annotation in generate_code()).
 */
void
fs_generator::enable_debug(const char *shader_name)
{
   debug_flag = true;
   this->shader_name = shader_name;
}

/* Map a systolic depth of 2/4/8/16 to its DPAS instruction encoding. */
static gfx12_systolic_depth
translate_systolic_depth(unsigned d)
{
   /* Could also return (ffs(d) - 1) & 3. */
   switch (d) {
   case 2:  return BRW_SYSTOLIC_DEPTH_2;
   case 4:  return BRW_SYSTOLIC_DEPTH_4;
   case 8:  return BRW_SYSTOLIC_DEPTH_8;
   case 16: return BRW_SYSTOLIC_DEPTH_16;
   default: unreachable("Invalid systolic depth.");
   }
}

int
fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
                            struct shader_stats shader_stats,
                            const brw::performance &perf,
                            struct brw_compile_stats *stats,
                            unsigned max_polygons)
{
   /* align to 64 byte boundary.
*/ + brw_realign(p, 64); + + this->dispatch_width = dispatch_width; + + int start_offset = p->next_insn_offset; + + int loop_count = 0, send_count = 0, nop_count = 0, sync_nop_count = 0; + bool is_accum_used = false; + + struct disasm_info *disasm_info = disasm_initialize(p->isa, cfg); + + foreach_block_and_inst (block, fs_inst, inst, cfg) { + if (inst->opcode == SHADER_OPCODE_UNDEF) + continue; + + struct brw_reg src[4], dst; + unsigned int last_insn_offset = p->next_insn_offset; + bool multiple_instructions_emitted = false; + tgl_swsb swsb = inst->sched; + + /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the + * "Register Region Restrictions" section: for BDW, SKL: + * + * "A POW/FDIV operation must not be followed by an instruction + * that requires two destination registers." + * + * The documentation is often lacking annotations for Atom parts, + * and empirically this affects CHV as well. + */ + if (devinfo->ver >= 8 && + devinfo->ver <= 9 && + p->nr_insn > 1 && + brw_inst_opcode(p->isa, brw_last_inst) == BRW_OPCODE_MATH && + brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW && + inst->dst.component_size(inst->exec_size) > REG_SIZE) { + brw_NOP(p); + last_insn_offset = p->next_insn_offset; + + /* In order to avoid spurious instruction count differences when the + * instruction schedule changes, keep track of the number of inserted + * NOPs. + */ + nop_count++; + } + + /* Wa_14010017096: + * + * Clear accumulator register before end of thread. 
+ */ + if (inst->eot && is_accum_used && + intel_needs_workaround(devinfo, 14010017096)) { + brw_set_default_exec_size(p, BRW_EXECUTE_16); + brw_set_default_group(p, 0); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + brw_MOV(p, brw_acc_reg(8), brw_imm_f(0.0f)); + last_insn_offset = p->next_insn_offset; + swsb = tgl_swsb_dst_dep(swsb, 1); + } + + if (!is_accum_used && !inst->eot) { + is_accum_used = inst->writes_accumulator_implicitly(devinfo) || + inst->dst.is_accumulator(); + } + + /* Wa_14013672992: + * + * Always use @1 SWSB for EOT. + */ + if (inst->eot && intel_needs_workaround(devinfo, 14013672992)) { + if (tgl_swsb_src_dep(swsb).mode) { + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + brw_SYNC(p, TGL_SYNC_NOP); + last_insn_offset = p->next_insn_offset; + } + + swsb = tgl_swsb_dst_dep(swsb, 1); + } + + if (unlikely(debug_flag)) + disasm_annotate(disasm_info, inst, p->next_insn_offset); + + /* If the instruction writes to more than one register, it needs to be + * explicitly marked as compressed on Gen <= 5. On Gen >= 6 the + * hardware figures out by itself what the right compression mode is, + * but we still need to know whether the instruction is compressed to + * set up the source register regions appropriately. + * + * XXX - This is wrong for instructions that write a single register but + * read more than one which should strictly speaking be treated as + * compressed. For instructions that don't write any registers it + * relies on the destination being a null register of the correct + * type and regioning so the instruction is considered compressed + * or not accordingly. 
+ */ + const bool compressed = + inst->dst.component_size(inst->exec_size) > REG_SIZE; + brw_set_default_compression(p, compressed); + + if ((devinfo->ver >= 20 || devinfo->ver < 7) && inst->group % 8 != 0) { + assert(inst->force_writemask_all); + assert(!inst->predicate && !inst->conditional_mod); + assert(!inst->writes_accumulator_implicitly(devinfo) && + !inst->reads_accumulator_implicitly()); + assert(inst->opcode != SHADER_OPCODE_SEL_EXEC); + brw_set_default_group(p, 0); + } else { + brw_set_default_group(p, inst->group); + } + + for (unsigned int i = 0; i < inst->sources; i++) { + src[i] = brw_reg_from_fs_reg(devinfo, inst, + &inst->src[i], compressed); + /* The accumulator result appears to get used for the + * conditional modifier generation. When negating a UD + * value, there is a 33rd bit generated for the sign in the + * accumulator value, so now you can't check, for example, + * equality with a 32-bit value. See piglit fs-op-neg-uvec4. + */ + assert(!inst->conditional_mod || + inst->src[i].type != BRW_REGISTER_TYPE_UD || + !inst->src[i].negate); + } + dst = brw_reg_from_fs_reg(devinfo, inst, + &inst->dst, compressed); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_predicate_control(p, inst->predicate); + brw_set_default_predicate_inverse(p, inst->predicate_inverse); + /* On gfx7 and above, hardware automatically adds the group onto the + * flag subregister number. On Sandy Bridge and older, we have to do it + * ourselves. + */ + const unsigned flag_subreg = inst->flag_subreg + + (devinfo->ver >= 7 ? 
0 : inst->group / 16); + brw_set_default_flag_reg(p, flag_subreg / 2, flag_subreg % 2); + brw_set_default_saturate(p, inst->saturate); + brw_set_default_mask_control(p, inst->force_writemask_all); + if (devinfo->ver >= 20 && inst->writes_accumulator) { + assert(inst->dst.is_accumulator() || + inst->opcode == BRW_OPCODE_ADDC || + inst->opcode == BRW_OPCODE_MACH || + inst->opcode == BRW_OPCODE_SUBB); + } else { + brw_set_default_acc_write_control(p, inst->writes_accumulator); + } + brw_set_default_swsb(p, swsb); + + unsigned exec_size = inst->exec_size; + if (devinfo->verx10 == 70 && + (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) { + exec_size *= 2; + } + + brw_set_default_exec_size(p, cvt(exec_size) - 1); + + assert(inst->force_writemask_all || inst->exec_size >= 4); + assert(inst->force_writemask_all || inst->group % inst->exec_size == 0); + assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->ver)); + assert(inst->mlen <= BRW_MAX_MSG_LENGTH * reg_unit(devinfo)); + + switch (inst->opcode) { + case BRW_OPCODE_SYNC: + assert(src[0].file == BRW_IMMEDIATE_VALUE); + brw_SYNC(p, tgl_sync_function(src[0].ud)); + + if (tgl_sync_function(src[0].ud) == TGL_SYNC_NOP) + ++sync_nop_count; + + break; + case BRW_OPCODE_MOV: + brw_MOV(p, dst, src[0]); + break; + case BRW_OPCODE_ADD: + brw_ADD(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_MUL: + brw_MUL(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_AVG: + brw_AVG(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_MACH: + brw_MACH(p, dst, src[0], src[1]); + break; + + case BRW_OPCODE_DP4A: + assert(devinfo->ver >= 12); + brw_DP4A(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_LINE: + brw_LINE(p, dst, src[0], src[1]); + break; + + case BRW_OPCODE_DPAS: + assert(devinfo->verx10 >= 125); + brw_DPAS(p, translate_systolic_depth(inst->sdepth), inst->rcount, + dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_MAD: + assert(devinfo->ver >= 6); + if (devinfo->ver < 10) + 
brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_MAD(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_LRP: + assert(devinfo->ver >= 6 && devinfo->ver <= 10); + if (devinfo->ver < 10) + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_LRP(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_ADD3: + assert(devinfo->verx10 >= 125); + brw_ADD3(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_FRC: + brw_FRC(p, dst, src[0]); + break; + case BRW_OPCODE_RNDD: + brw_RNDD(p, dst, src[0]); + break; + case BRW_OPCODE_RNDE: + brw_RNDE(p, dst, src[0]); + break; + case BRW_OPCODE_RNDZ: + brw_RNDZ(p, dst, src[0]); + break; + + case BRW_OPCODE_AND: + brw_AND(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_OR: + brw_OR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_XOR: + brw_XOR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_NOT: + brw_NOT(p, dst, src[0]); + break; + case BRW_OPCODE_ASR: + brw_ASR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_SHR: + brw_SHR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_SHL: + brw_SHL(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_ROL: + assert(devinfo->ver >= 11); + assert(src[0].type == dst.type); + brw_ROL(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_ROR: + assert(devinfo->ver >= 11); + assert(src[0].type == dst.type); + brw_ROR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_F32TO16: + brw_F32TO16(p, dst, src[0]); + break; + case BRW_OPCODE_F16TO32: + brw_F16TO32(p, dst, src[0]); + break; + case BRW_OPCODE_CMP: + if (inst->exec_size >= 16 && devinfo->verx10 == 70 && + dst.file == BRW_ARCHITECTURE_REGISTER_FILE) { + /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround + * implemented in the compiler is not sufficient. Overriding the + * type when the destination is the null register is necessary but + * not sufficient by itself. 
+ */ + dst.type = BRW_REGISTER_TYPE_D; + } + brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); + break; + case BRW_OPCODE_CMPN: + if (inst->exec_size >= 16 && devinfo->verx10 == 70 && + dst.file == BRW_ARCHITECTURE_REGISTER_FILE) { + /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround + * implemented in the compiler is not sufficient. Overriding the + * type when the destination is the null register is necessary but + * not sufficient by itself. + */ + dst.type = BRW_REGISTER_TYPE_D; + } + brw_CMPN(p, dst, inst->conditional_mod, src[0], src[1]); + break; + case BRW_OPCODE_SEL: + brw_SEL(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_CSEL: + assert(devinfo->ver >= 8); + if (devinfo->ver < 10) + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_CSEL(p, dst, src[0], src[1], src[2]); + break; + case BRW_OPCODE_BFREV: + assert(devinfo->ver >= 7); + brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD), + retype(src[0], BRW_REGISTER_TYPE_UD)); + break; + case BRW_OPCODE_FBH: + assert(devinfo->ver >= 7); + brw_FBH(p, retype(dst, src[0].type), src[0]); + break; + case BRW_OPCODE_FBL: + assert(devinfo->ver >= 7); + brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), + retype(src[0], BRW_REGISTER_TYPE_UD)); + break; + case BRW_OPCODE_LZD: + brw_LZD(p, dst, src[0]); + break; + case BRW_OPCODE_CBIT: + assert(devinfo->ver >= 7); + brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), + retype(src[0], BRW_REGISTER_TYPE_UD)); + break; + case BRW_OPCODE_ADDC: + assert(devinfo->ver >= 7); + brw_ADDC(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_SUBB: + assert(devinfo->ver >= 7); + brw_SUBB(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_MAC: + brw_MAC(p, dst, src[0], src[1]); + break; + + case BRW_OPCODE_BFE: + assert(devinfo->ver >= 7); + if (devinfo->ver < 10) + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_BFE(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_BFI1: + assert(devinfo->ver >= 7); + brw_BFI1(p, dst, src[0], src[1]); + break; + 
case BRW_OPCODE_BFI2: + assert(devinfo->ver >= 7); + if (devinfo->ver < 10) + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_BFI2(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_IF: + if (inst->src[0].file != BAD_FILE) { + /* The instruction has an embedded compare (only allowed on gfx6) */ + assert(devinfo->ver == 6); + gfx6_IF(p, inst->conditional_mod, src[0], src[1]); + } else { + brw_IF(p, brw_get_default_exec_size(p)); + } + break; + + case BRW_OPCODE_ELSE: + brw_ELSE(p); + break; + case BRW_OPCODE_ENDIF: + brw_ENDIF(p); + break; + + case BRW_OPCODE_DO: + brw_DO(p, brw_get_default_exec_size(p)); + break; + + case BRW_OPCODE_BREAK: + brw_BREAK(p); + break; + case BRW_OPCODE_CONTINUE: + brw_CONT(p); + break; + + case BRW_OPCODE_WHILE: + brw_WHILE(p); + loop_count++; + break; + + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); + if (devinfo->ver >= 6) { + assert(inst->mlen == 0); + assert(devinfo->ver >= 7 || inst->exec_size == 8); + gfx6_math(p, dst, brw_math_function(inst->opcode), + src[0], brw_null_reg()); + } else { + assert(inst->mlen >= 1); + assert(devinfo->ver == 5 || devinfo->platform == INTEL_PLATFORM_G4X || inst->exec_size == 8); + gfx4_math(p, dst, + brw_math_function(inst->opcode), + inst->base_mrf, src[0], + BRW_MATH_PRECISION_FULL); + send_count++; + } + break; + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + case SHADER_OPCODE_POW: + assert(devinfo->verx10 < 125); + assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); + if (devinfo->ver >= 6) { + assert(inst->mlen == 0); + assert((devinfo->ver >= 7 && inst->opcode == SHADER_OPCODE_POW) || + inst->exec_size == 8); + gfx6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]); + } else { + assert(inst->mlen >= 1); + assert(inst->exec_size == 8); + 
gfx4_math(p, dst, brw_math_function(inst->opcode), + inst->base_mrf, src[0], + BRW_MATH_PRECISION_FULL); + send_count++; + } + break; + case FS_OPCODE_LINTERP: + multiple_instructions_emitted = generate_linterp(inst, dst, src); + break; + case FS_OPCODE_PIXEL_X: + assert(src[0].type == BRW_REGISTER_TYPE_UW); + assert(src[1].type == BRW_REGISTER_TYPE_UW); + src[0].subnr = 0 * type_sz(src[0].type); + if (src[1].file == BRW_IMMEDIATE_VALUE) { + assert(src[1].ud == 0); + brw_MOV(p, dst, stride(src[0], 8, 4, 1)); + } else { + /* Coarse pixel case */ + brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]); + } + break; + case FS_OPCODE_PIXEL_Y: + assert(src[0].type == BRW_REGISTER_TYPE_UW); + assert(src[1].type == BRW_REGISTER_TYPE_UW); + src[0].subnr = 4 * type_sz(src[0].type); + if (src[1].file == BRW_IMMEDIATE_VALUE) { + assert(src[1].ud == 0); + brw_MOV(p, dst, stride(src[0], 8, 4, 1)); + } else { + /* Coarse pixel case */ + brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]); + } + break; + + case SHADER_OPCODE_SEND: + generate_send(inst, dst, src[0], src[1], src[2], + inst->ex_mlen > 0 ? 
src[3] : brw_null_reg()); + send_count++; + break; + + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_SAMPLEINFO: + assert(inst->src[0].file == BAD_FILE); + generate_tex(inst, dst, src[1], src[2]); + send_count++; + break; + + case FS_OPCODE_DDX_COARSE: + case FS_OPCODE_DDX_FINE: + generate_ddx(inst, dst, src[0]); + break; + case FS_OPCODE_DDY_COARSE: + case FS_OPCODE_DDY_FINE: + generate_ddy(inst, dst, src[0]); + break; + + case SHADER_OPCODE_GFX4_SCRATCH_WRITE: + generate_scratch_write(inst, src[0]); + send_count++; + break; + + case SHADER_OPCODE_GFX4_SCRATCH_READ: + generate_scratch_read(inst, dst); + send_count++; + break; + + case SHADER_OPCODE_GFX7_SCRATCH_READ: + generate_scratch_read_gfx7(inst, dst); + send_count++; + break; + + case SHADER_OPCODE_SCRATCH_HEADER: + generate_scratch_header(inst, dst); + break; + + case SHADER_OPCODE_MOV_INDIRECT: + generate_mov_indirect(inst, dst, src[0], src[1]); + break; + + case SHADER_OPCODE_MOV_RELOC_IMM: + assert(src[0].file == BRW_IMMEDIATE_VALUE); + brw_MOV_reloc_imm(p, dst, dst.type, src[0].ud); + break; + + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + assert(inst->force_writemask_all); + generate_uniform_pull_constant_load(inst, dst, + src[PULL_UNIFORM_CONSTANT_SRC_SURFACE], + src[PULL_UNIFORM_CONSTANT_SRC_OFFSET]); + send_count++; + break; + + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: + generate_varying_pull_constant_load_gfx4(inst, dst, src[0]); + send_count++; + break; + + case FS_OPCODE_REP_FB_WRITE: + case FS_OPCODE_FB_WRITE: + generate_fb_write(inst, src[0]); + send_count++; + break; + + case FS_OPCODE_FB_READ: + generate_fb_read(inst, dst, src[0]); + send_count++; + break; + + case BRW_OPCODE_HALT: + generate_halt(inst); + break; + + case SHADER_OPCODE_INTERLOCK: + case SHADER_OPCODE_MEMORY_FENCE: { 
+ assert(src[1].file == BRW_IMMEDIATE_VALUE); + assert(src[2].file == BRW_IMMEDIATE_VALUE); + + const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ? + BRW_OPCODE_SENDC : BRW_OPCODE_SEND; + + brw_memory_fence(p, dst, src[0], send_op, + brw_message_target(inst->sfid), + inst->desc, + /* commit_enable */ src[1].ud, + /* bti */ src[2].ud); + send_count++; + break; + } + + case FS_OPCODE_SCHEDULING_FENCE: + if (inst->sources == 0 && swsb.regdist == 0 && + swsb.mode == TGL_SBID_NULL) { + if (unlikely(debug_flag)) + disasm_info->use_tail = true; + break; + } + + if (devinfo->ver >= 12) { + /* Use the available SWSB information to stall. A single SYNC is + * sufficient since if there were multiple dependencies, the + * scoreboard algorithm already injected other SYNCs before this + * instruction. + */ + brw_SYNC(p, TGL_SYNC_NOP); + } else { + for (unsigned i = 0; i < inst->sources; i++) { + /* Emit a MOV to force a stall until the instruction producing the + * registers finishes. 
+ */ + brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), + retype(src[i], BRW_REGISTER_TYPE_UW)); + } + + if (inst->sources > 1) + multiple_instructions_emitted = true; + } + + break; + + case SHADER_OPCODE_FIND_LIVE_CHANNEL: + brw_find_live_channel(p, dst, false); + break; + case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: + brw_find_live_channel(p, dst, true); + break; + + case FS_OPCODE_LOAD_LIVE_CHANNELS: { + assert(devinfo->ver >= 8); + assert(inst->force_writemask_all && inst->group == 0); + assert(inst->dst.file == BAD_FILE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg), + BRW_REGISTER_TYPE_UD), + retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)); + break; + } + case SHADER_OPCODE_BROADCAST: + assert(inst->force_writemask_all); + brw_broadcast(p, dst, src[0], src[1]); + break; + + case SHADER_OPCODE_SHUFFLE: + generate_shuffle(inst, dst, src[0], src[1]); + break; + + case SHADER_OPCODE_SEL_EXEC: + assert(inst->force_writemask_all); + assert(devinfo->has_64bit_float || type_sz(dst.type) <= 4); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, dst, src[1]); + brw_set_default_mask_control(p, BRW_MASK_ENABLE); + brw_set_default_swsb(p, tgl_swsb_null()); + brw_MOV(p, dst, src[0]); + break; + + case SHADER_OPCODE_QUAD_SWIZZLE: + assert(src[1].file == BRW_IMMEDIATE_VALUE); + assert(src[1].type == BRW_REGISTER_TYPE_UD); + generate_quad_swizzle(inst, dst, src[0], src[1].ud); + break; + + case SHADER_OPCODE_CLUSTER_BROADCAST: { + assert((devinfo->platform != INTEL_PLATFORM_CHV && + !intel_device_info_is_9lp(devinfo) && + devinfo->has_64bit_float) || type_sz(src[0].type) <= 4); + assert(!src[0].negate && !src[0].abs); + assert(src[1].file == BRW_IMMEDIATE_VALUE); + assert(src[1].type == BRW_REGISTER_TYPE_UD); + assert(src[2].file == BRW_IMMEDIATE_VALUE); + assert(src[2].type == BRW_REGISTER_TYPE_UD); + const unsigned component = src[1].ud; + const unsigned cluster_size = src[2].ud; + 
assert(inst->src[0].file != ARF && inst->src[0].file != FIXED_GRF); + const unsigned s = inst->src[0].stride; + unsigned vstride = cluster_size * s; + unsigned width = cluster_size; + + /* The maximum exec_size is 32, but the maximum width is only 16. */ + if (inst->exec_size == width) { + vstride = 0; + width = 1; + } + + struct brw_reg strided = stride(suboffset(src[0], component * s), + vstride, width, 0); + brw_MOV(p, dst, strided); + break; + } + + case FS_OPCODE_SET_SAMPLE_ID: + generate_set_sample_id(inst, dst, src[0], src[1]); + break; + + case SHADER_OPCODE_HALT_TARGET: + /* This is the place where the final HALT needs to be inserted if + * we've emitted any discards. If not, this will emit no code. + */ + if (!patch_halt_jumps()) { + if (unlikely(debug_flag)) { + disasm_info->use_tail = true; + } + } + break; + + case CS_OPCODE_CS_TERMINATE: + generate_cs_terminate(inst, src[0]); + send_count++; + break; + + case SHADER_OPCODE_BARRIER: + generate_barrier(inst, src[0]); + send_count++; + break; + + case BRW_OPCODE_DIM: + assert(devinfo->platform == INTEL_PLATFORM_HSW); + assert(src[0].type == BRW_REGISTER_TYPE_DF); + assert(dst.type == BRW_REGISTER_TYPE_DF); + brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F)); + break; + + case SHADER_OPCODE_RND_MODE: { + assert(src[0].file == BRW_IMMEDIATE_VALUE); + /* + * Changes the floating point rounding mode updating the control + * register field defined at cr0.0[5-6] bits. 
+ */ + enum brw_rnd_mode mode = + (enum brw_rnd_mode) (src[0].d << BRW_CR0_RND_MODE_SHIFT); + brw_float_controls_mode(p, mode, BRW_CR0_RND_MODE_MASK); + } + break; + + case SHADER_OPCODE_FLOAT_CONTROL_MODE: + assert(src[0].file == BRW_IMMEDIATE_VALUE); + assert(src[1].file == BRW_IMMEDIATE_VALUE); + brw_float_controls_mode(p, src[0].d, src[1].d); + break; + + case SHADER_OPCODE_READ_SR_REG: + if (devinfo->ver >= 12) { + /* There is a SWSB restriction that requires that any time sr0 is + * accessed both the instruction doing the access and the next one + * have SWSB set to RegDist(1). + */ + if (brw_get_default_swsb(p).mode != TGL_SBID_NULL) + brw_SYNC(p, TGL_SYNC_NOP); + assert(src[0].file == BRW_IMMEDIATE_VALUE); + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + brw_MOV(p, dst, brw_sr0_reg(src[0].ud)); + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + brw_AND(p, dst, dst, brw_imm_ud(0xffffffff)); + } else { + brw_MOV(p, dst, brw_sr0_reg(src[0].ud)); + } + break; + + default: + unreachable("Unsupported opcode"); + + case SHADER_OPCODE_LOAD_PAYLOAD: + unreachable("Should be lowered by lower_load_payload()"); + } + + if (multiple_instructions_emitted) + continue; + + if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) { + assert(p->next_insn_offset == last_insn_offset + 16 || + !"conditional_mod, no_dd_check, or no_dd_clear set for IR " + "emitting more than 1 instruction"); + + brw_inst *last = &p->store[last_insn_offset / 16]; + + if (inst->conditional_mod) + brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod); + if (devinfo->ver < 12) { + brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear); + brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check); + } + } + + /* When enabled, insert sync NOP after every instruction and make sure + * that current instruction depends on the previous instruction. 
+ */ + if (INTEL_DEBUG(DEBUG_SWSB_STALL) && devinfo->ver >= 12) { + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + brw_SYNC(p, TGL_SYNC_NOP); + } + } + + brw_set_uip_jip(p, start_offset); + + /* end of program sentinel */ + disasm_new_inst_group(disasm_info, p->next_insn_offset); + + /* `send_count` explicitly does not include spills or fills, as we'd + * like to use it as a metric for intentional memory access or other + * shared function use. Otherwise, subtle changes to scheduling or + * register allocation could cause it to fluctuate wildly - and that + * effect is already counted in spill/fill counts. + */ + send_count -= shader_stats.spill_count; + send_count -= shader_stats.fill_count; + +#ifndef NDEBUG + bool validated = +#else + if (unlikely(debug_flag)) +#endif + brw_validate_instructions(&compiler->isa, p->store, + start_offset, + p->next_insn_offset, + disasm_info); + + int before_size = p->next_insn_offset - start_offset; + brw_compact_instructions(p, start_offset, disasm_info); + int after_size = p->next_insn_offset - start_offset; + + bool dump_shader_bin = brw_should_dump_shader_bin(); + unsigned char sha1[21]; + char sha1buf[41]; + + if (unlikely(debug_flag || dump_shader_bin)) { + _mesa_sha1_compute(p->store + start_offset / sizeof(brw_inst), + after_size, sha1); + _mesa_sha1_format(sha1buf, sha1); + } + + if (unlikely(dump_shader_bin)) + brw_dump_shader_bin(p->store, start_offset, p->next_insn_offset, + sha1buf); + + if (unlikely(debug_flag)) { + fprintf(stderr, "Native code for %s (src_hash 0x%08x) (sha1 %s)\n" + "SIMD%d shader: %d instructions. %d loops. %u cycles. " + "%d:%d spills:fills, %u sends, " + "scheduled with mode %s. " + "Promoted %u constants. 
" + "Compacted %d to %d bytes (%.0f%%)\n", + shader_name, params->source_hash, sha1buf, + dispatch_width, before_size / 16, + loop_count, perf.latency, + shader_stats.spill_count, + shader_stats.fill_count, + send_count, + shader_stats.scheduler_mode, + shader_stats.promoted_constants, + before_size, after_size, + 100.0f * (before_size - after_size) / before_size); + + /* overriding the shader makes disasm_info invalid */ + if (!brw_try_override_assembly(p, start_offset, sha1buf)) { + dump_assembly(p->store, start_offset, p->next_insn_offset, + disasm_info, perf.block_latency); + } else { + fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf); + } + } + ralloc_free(disasm_info); +#ifndef NDEBUG + if (!validated && !debug_flag) { + fprintf(stderr, + "Validation failed. Rerun with INTEL_DEBUG=shaders to get more information.\n"); + } +#endif + assert(validated); + + brw_shader_debug_log(compiler, params->log_data, + "%s SIMD%d shader: %d inst, %d loops, %u cycles, " + "%d:%d spills:fills, %u sends, " + "scheduled with mode %s, " + "Promoted %u constants, " + "compacted %d to %d bytes.\n", + _mesa_shader_stage_to_abbrev(stage), + dispatch_width, + before_size / 16 - nop_count - sync_nop_count, + loop_count, perf.latency, + shader_stats.spill_count, + shader_stats.fill_count, + send_count, + shader_stats.scheduler_mode, + shader_stats.promoted_constants, + before_size, after_size); + if (stats) { + stats->dispatch_width = dispatch_width; + stats->max_polygons = max_polygons; + stats->max_dispatch_width = dispatch_width; + stats->instructions = before_size / 16 - nop_count - sync_nop_count; + stats->sends = send_count; + stats->loops = loop_count; + stats->cycles = perf.latency; + stats->spills = shader_stats.spill_count; + stats->fills = shader_stats.fill_count; + stats->max_live_registers = shader_stats.max_register_pressure; + } + + return start_offset; +} + +void +fs_generator::add_const_data(void *data, unsigned size) +{ + 
assert(prog_data->const_data_size == 0); + if (size > 0) { + prog_data->const_data_size = size; + prog_data->const_data_offset = brw_append_data(p, data, size, 32); + } +} + +void +fs_generator::add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt) +{ + assert(brw_shader_stage_is_bindless(stage)); + struct brw_bs_prog_data *bs_prog_data = brw_bs_prog_data(prog_data); + if (num_resume_shaders > 0) { + bs_prog_data->resume_sbt_offset = + brw_append_data(p, sbt, num_resume_shaders * sizeof(uint64_t), 32); + for (unsigned i = 0; i < num_resume_shaders; i++) { + size_t offset = bs_prog_data->resume_sbt_offset + i * sizeof(*sbt); + assert(offset <= UINT32_MAX); + brw_add_reloc(p, BRW_SHADER_RELOC_SHADER_START_OFFSET, + BRW_SHADER_RELOC_TYPE_U32, + (uint32_t)offset, (uint32_t)sbt[i]); + } + } +} + +const unsigned * +fs_generator::get_assembly() +{ + prog_data->relocs = brw_get_shader_relocs(p, &prog_data->num_relocs); + + return brw_get_program(p, &prog_data->program_size); +} diff --git a/src/intel/compiler/elk/brw_fs_live_variables.cpp b/src/intel/compiler/elk/brw_fs_live_variables.cpp new file mode 100644 index 00000000000..c6361d67d95 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_live_variables.cpp @@ -0,0 +1,371 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt + * + */ + +#include "brw_fs.h" +#include "brw_fs_live_variables.h" + +using namespace brw; + +#define MAX_INSTRUCTION (1 << 30) + +/** @file brw_fs_live_variables.cpp + * + * Support for calculating liveness information about virtual GRFs. + * + * This produces a live interval for each whole virtual GRF. We could + * choose to expose per-component live intervals for VGRFs of size > 1, + * but we currently do not. It is easier for the consumers of this + * information to work with whole VGRFs. + * + * However, we internally track use/def information at the per-GRF level for + * greater accuracy. Large VGRFs may be accessed piecemeal over many + * (possibly non-adjacent) instructions. In this case, examining a single + * instruction is insufficient to decide whether a whole VGRF is ultimately + * used or defined. Tracking individual components allows us to easily + * assemble this information. + * + * See Muchnick's Advanced Compiler Design and Implementation, section + * 14.1 (p444). + */ + +void +fs_live_variables::setup_one_read(struct block_data *bd, + int ip, const fs_reg ®) +{ + int var = var_from_reg(reg); + assert(var < num_vars); + + start[var] = MIN2(start[var], ip); + end[var] = MAX2(end[var], ip); + + /* The use[] bitset marks when the block makes use of a variable (VGRF + * channel) without having completely defined that variable within the + * block. 
+ */ + if (!BITSET_TEST(bd->def, var)) + BITSET_SET(bd->use, var); +} + +void +fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst, + int ip, const fs_reg ®) +{ + int var = var_from_reg(reg); + assert(var < num_vars); + + start[var] = MIN2(start[var], ip); + end[var] = MAX2(end[var], ip); + + /* The def[] bitset marks when an initialization in a block completely + * screens off previous updates of that variable (VGRF channel). + */ + if (inst->dst.file == VGRF) { + if (!inst->is_partial_write() && !BITSET_TEST(bd->use, var)) + BITSET_SET(bd->def, var); + + BITSET_SET(bd->defout, var); + } +} + +/** + * Sets up the use[] and def[] bitsets. + * + * The basic-block-level live variable analysis needs to know which + * variables get used before they're completely defined, and which + * variables are completely defined before they're used. + * + * These are tracked at the per-component level, rather than whole VGRFs. + */ +void +fs_live_variables::setup_def_use() +{ + int ip = 0; + + foreach_block (block, cfg) { + assert(ip == block->start_ip); + if (block->num > 0) + assert(cfg->blocks[block->num - 1]->end_ip == ip - 1); + + struct block_data *bd = &block_data[block->num]; + + foreach_inst_in_block(fs_inst, inst, block) { + /* Set use[] for this instruction */ + for (unsigned int i = 0; i < inst->sources; i++) { + fs_reg reg = inst->src[i]; + + if (reg.file != VGRF) + continue; + + for (unsigned j = 0; j < regs_read(inst, i); j++) { + setup_one_read(bd, ip, reg); + reg.offset += REG_SIZE; + } + } + + bd->flag_use[0] |= inst->flags_read(devinfo) & ~bd->flag_def[0]; + + /* Set def[] for this instruction */ + if (inst->dst.file == VGRF) { + fs_reg reg = inst->dst; + for (unsigned j = 0; j < regs_written(inst); j++) { + setup_one_write(bd, inst, ip, reg); + reg.offset += REG_SIZE; + } + } + + if (!inst->predicate && inst->exec_size >= 8) + bd->flag_def[0] |= inst->flags_written(devinfo) & ~bd->flag_use[0]; + + ip++; + } + } +} + +/** + * The algorithm 
incrementally sets bits in liveout and livein, + * propagating it through control flow. It will eventually terminate + * because it only ever adds bits, and stops when no bits are added in + * a pass. + */ +void +fs_live_variables::compute_live_variables() +{ + bool cont = true; + + /* Propagate defin and defout down the CFG to calculate the union of live + * variables potentially defined along any possible control flow path. + */ + do { + cont = false; + + foreach_block (block, cfg) { + const struct block_data *bd = &block_data[block->num]; + + foreach_list_typed(bblock_link, child_link, link, &block->children) { + struct block_data *child_bd = &block_data[child_link->block->num]; + + for (int i = 0; i < bitset_words; i++) { + const BITSET_WORD new_def = bd->defout[i] & ~child_bd->defin[i]; + child_bd->defin[i] |= new_def; + child_bd->defout[i] |= new_def; + cont |= new_def; + } + } + } + } while (cont); + + do { + cont = false; + + foreach_block_reverse (block, cfg) { + struct block_data *bd = &block_data[block->num]; + + /* Update liveout */ + foreach_list_typed(bblock_link, child_link, link, &block->children) { + struct block_data *child_bd = &block_data[child_link->block->num]; + + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_liveout = (child_bd->livein[i] & + ~bd->liveout[i]); + new_liveout &= bd->defout[i]; /* Screen off uses with no reaching def */ + if (new_liveout) + bd->liveout[i] |= new_liveout; + } + BITSET_WORD new_liveout = (child_bd->flag_livein[0] & + ~bd->flag_liveout[0]); + if (new_liveout) + bd->flag_liveout[0] |= new_liveout; + } + + /* Update livein */ + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_livein = (bd->use[i] | + (bd->liveout[i] & + ~bd->def[i])); + new_livein &= bd->defin[i]; /* Screen off uses with no reaching def */ + if (new_livein & ~bd->livein[i]) { + bd->livein[i] |= new_livein; + cont = true; + } + } + BITSET_WORD new_livein = (bd->flag_use[0] | + (bd->flag_liveout[0] & + ~bd->flag_def[0])); + if 
(new_livein & ~bd->flag_livein[0]) { + bd->flag_livein[0] |= new_livein; + cont = true; + } + } + } while (cont); +} + +/** + * Extend the start/end ranges for each variable to account for the + * new information calculated from control flow. + */ +void +fs_live_variables::compute_start_end() +{ + foreach_block (block, cfg) { + struct block_data *bd = &block_data[block->num]; + unsigned i; + + BITSET_FOREACH_SET(i, bd->livein, (unsigned)num_vars) { + start[i] = MIN2(start[i], block->start_ip); + end[i] = MAX2(end[i], block->start_ip); + } + + BITSET_FOREACH_SET(i, bd->liveout, (unsigned)num_vars) { + start[i] = MIN2(start[i], block->end_ip); + end[i] = MAX2(end[i], block->end_ip); + } + } +} + +fs_live_variables::fs_live_variables(const backend_shader *s) + : devinfo(s->devinfo), cfg(s->cfg) +{ + mem_ctx = ralloc_context(NULL); + linear_ctx *lin_ctx = linear_context(mem_ctx); + + num_vgrfs = s->alloc.count; + num_vars = 0; + var_from_vgrf = linear_zalloc_array(lin_ctx, int, num_vgrfs); + for (int i = 0; i < num_vgrfs; i++) { + var_from_vgrf[i] = num_vars; + num_vars += s->alloc.sizes[i]; + } + + vgrf_from_var = linear_zalloc_array(lin_ctx, int, num_vars); + for (int i = 0; i < num_vgrfs; i++) { + for (unsigned j = 0; j < s->alloc.sizes[i]; j++) { + vgrf_from_var[var_from_vgrf[i] + j] = i; + } + } + + start = ralloc_array(mem_ctx, int, num_vars); + end = linear_zalloc_array(lin_ctx, int, num_vars); + for (int i = 0; i < num_vars; i++) { + start[i] = MAX_INSTRUCTION; + end[i] = -1; + } + + vgrf_start = ralloc_array(mem_ctx, int, num_vgrfs); + vgrf_end = ralloc_array(mem_ctx, int, num_vgrfs); + for (int i = 0; i < num_vgrfs; i++) { + vgrf_start[i] = MAX_INSTRUCTION; + vgrf_end[i] = -1; + } + + block_data = linear_zalloc_array(lin_ctx, struct block_data, cfg->num_blocks); + + bitset_words = BITSET_WORDS(num_vars); + for (int i = 0; i < cfg->num_blocks; i++) { + block_data[i].def = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words); + block_data[i].use = 
linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words); + block_data[i].livein = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words); + block_data[i].liveout = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words); + block_data[i].defin = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words); + block_data[i].defout = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words); + + block_data[i].flag_def[0] = 0; + block_data[i].flag_use[0] = 0; + block_data[i].flag_livein[0] = 0; + block_data[i].flag_liveout[0] = 0; + } + + setup_def_use(); + compute_live_variables(); + compute_start_end(); + + /* Merge the per-component live ranges to whole VGRF live ranges. */ + for (int i = 0; i < num_vars; i++) { + const unsigned vgrf = vgrf_from_var[i]; + vgrf_start[vgrf] = MIN2(vgrf_start[vgrf], start[i]); + vgrf_end[vgrf] = MAX2(vgrf_end[vgrf], end[i]); + } +} + +fs_live_variables::~fs_live_variables() +{ + ralloc_free(mem_ctx); +} + +static bool +check_register_live_range(const fs_live_variables *live, int ip, + const fs_reg ®, unsigned n) +{ + const unsigned var = live->var_from_reg(reg); + + if (var + n > unsigned(live->num_vars) || + live->vgrf_start[reg.nr] > ip || live->vgrf_end[reg.nr] < ip) + return false; + + for (unsigned j = 0; j < n; j++) { + if (live->start[var + j] > ip || live->end[var + j] < ip) + return false; + } + + return true; +} + +bool +fs_live_variables::validate(const backend_shader *s) const +{ + int ip = 0; + + foreach_block_and_inst(block, fs_inst, inst, s->cfg) { + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF && + !check_register_live_range(this, ip, + inst->src[i], regs_read(inst, i))) + return false; + } + + if (inst->dst.file == VGRF && + !check_register_live_range(this, ip, inst->dst, regs_written(inst))) + return false; + + ip++; + } + + return true; +} + +bool +fs_live_variables::vars_interfere(int a, int b) const +{ + return !(end[b] <= start[a] || + end[a] <= start[b]); +} + +bool 
+fs_live_variables::vgrfs_interfere(int a, int b) const +{ + return !(vgrf_end[a] <= vgrf_start[b] || + vgrf_end[b] <= vgrf_start[a]); +} diff --git a/src/intel/compiler/elk/brw_fs_live_variables.h b/src/intel/compiler/elk/brw_fs_live_variables.h new file mode 100644 index 00000000000..1c77efa0c19 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_live_variables.h @@ -0,0 +1,148 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt + * + */ + +#ifndef BRW_FS_LIVE_VARIABLES_H +#define BRW_FS_LIVE_VARIABLES_H + +#include "brw_ir_analysis.h" +#include "brw_ir_fs.h" +#include "util/bitset.h" + +struct cfg_t; +struct backend_shader; + +namespace brw { + +class fs_live_variables { +public: + struct block_data { + /** + * Which variables are defined before being used in the block. 
+ * + * Note that for our purposes, "defined" means unconditionally, completely + * defined. + */ + BITSET_WORD *def; + + /** + * Which variables are used before being defined in the block. + */ + BITSET_WORD *use; + + /** Which defs reach the entry point of the block. */ + BITSET_WORD *livein; + + /** Which defs reach the exit point of the block. */ + BITSET_WORD *liveout; + + /** + * Variables such that the entry point of the block may be reached from any + * of their definitions. + */ + BITSET_WORD *defin; + + /** + * Variables such that the exit point of the block may be reached from any + * of their definitions. + */ + BITSET_WORD *defout; + + BITSET_WORD flag_def[1]; + BITSET_WORD flag_use[1]; + BITSET_WORD flag_livein[1]; + BITSET_WORD flag_liveout[1]; + }; + + fs_live_variables(const backend_shader *s); + ~fs_live_variables(); + + bool validate(const backend_shader *s) const; + + analysis_dependency_class + dependency_class() const + { + return (DEPENDENCY_INSTRUCTION_IDENTITY | + DEPENDENCY_INSTRUCTION_DATA_FLOW | + DEPENDENCY_VARIABLES); + } + + bool vars_interfere(int a, int b) const; + bool vgrfs_interfere(int a, int b) const; + int var_from_reg(const fs_reg ®) const + { + return var_from_vgrf[reg.nr] + reg.offset / REG_SIZE; + } + + /** Map from virtual GRF number to index in block_data arrays. */ + int *var_from_vgrf; + + /** + * Map from any index in block_data to the virtual GRF containing it. + * + * For alloc.sizes of [1, 2, 3], vgrf_from_var would contain + * [0, 1, 1, 2, 2, 2]. + */ + int *vgrf_from_var; + + int num_vars; + int num_vgrfs; + int bitset_words; + + /** @{ + * Final computed live ranges for each var (each component of each virtual + * GRF). + */ + int *start; + int *end; + /** @} */ + + /** @{ + * Final computed live ranges for each VGRF. 
+ */ + int *vgrf_start; + int *vgrf_end; + /** @} */ + + /** Per-basic-block information on live variables */ + struct block_data *block_data; + +protected: + void setup_def_use(); + void setup_one_read(struct block_data *bd, int ip, const fs_reg ®); + void setup_one_write(struct block_data *bd, fs_inst *inst, int ip, + const fs_reg ®); + void compute_live_variables(); + void compute_start_end(); + + const struct intel_device_info *devinfo; + const cfg_t *cfg; + void *mem_ctx; +}; + +} /* namespace brw */ + +#endif /* BRW_FS_LIVE_VARIABLES_H */ diff --git a/src/intel/compiler/elk/brw_fs_lower_dpas.cpp b/src/intel/compiler/elk/brw_fs_lower_dpas.cpp new file mode 100644 index 00000000000..306731722af --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_lower_dpas.cpp @@ -0,0 +1,306 @@ +/* + * Copyright 2023 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "brw_fs.h" +#include "brw_fs_builder.h" + +using namespace brw; + +static void +f16_using_mac(const fs_builder &bld, fs_inst *inst) +{ + /* We only intend to support configurations where the destination and + * accumulator have the same type. + */ + if (!inst->src[0].is_null()) + assert(inst->dst.type == inst->src[0].type); + + assert(inst->src[1].type == BRW_REGISTER_TYPE_HF); + assert(inst->src[2].type == BRW_REGISTER_TYPE_HF); + + const brw_reg_type src0_type = inst->dst.type; + const brw_reg_type src1_type = BRW_REGISTER_TYPE_HF; + const brw_reg_type src2_type = BRW_REGISTER_TYPE_HF; + + const fs_reg dest = inst->dst; + fs_reg src0 = inst->src[0]; + const fs_reg src1 = retype(inst->src[1], src1_type); + const fs_reg src2 = retype(inst->src[2], src2_type); + + const unsigned dest_stride = + dest.type == BRW_REGISTER_TYPE_HF ? 
REG_SIZE / 2 : REG_SIZE; + + for (unsigned r = 0; r < inst->rcount; r++) { + fs_reg temp = bld.vgrf(BRW_REGISTER_TYPE_HF, 1); + + for (unsigned subword = 0; subword < 2; subword++) { + for (unsigned s = 0; s < inst->sdepth; s++) { + /* The first multiply of the dot-product operation has to + * explicitly write the accumulator register. The successive MAC + * instructions will implicitly read *and* write the + * accumulator. Those MAC instructions can also optionally + * explicitly write some other register. + * + * FINISHME: The accumulator can actually hold 16 HF values. On + * Gfx12 there are two accumulators. It should be possible to do + * this in SIMD16 or even SIMD32. I was unable to get this to work + * properly. + */ + if (s == 0 && subword == 0) { + const unsigned acc_width = 8; + fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD), + inst->group % acc_width); + + if (bld.shader->devinfo->verx10 >= 125) { + acc = subscript(acc, BRW_REGISTER_TYPE_HF, subword); + } else { + acc = retype(acc, BRW_REGISTER_TYPE_HF); + } + + bld.MUL(acc, + subscript(retype(byte_offset(src1, s * REG_SIZE), + BRW_REGISTER_TYPE_UD), + BRW_REGISTER_TYPE_HF, subword), + component(retype(byte_offset(src2, r * REG_SIZE), + BRW_REGISTER_TYPE_HF), + s * 2 + subword)) + ->writes_accumulator = true; + + } else { + fs_reg result; + + /* As mentioned above, the MAC had an optional, explicit + * destination register. Various optimization passes are not + * clever enough to understand the intricacies of this + * instruction, so only write the result register on the final + * MAC in the sequence. 
+ */ + if ((s + 1) == inst->sdepth && subword == 1) + result = temp; + else + result = retype(bld.null_reg_ud(), BRW_REGISTER_TYPE_HF); + + bld.MAC(result, + subscript(retype(byte_offset(src1, s * REG_SIZE), + BRW_REGISTER_TYPE_UD), + BRW_REGISTER_TYPE_HF, subword), + component(retype(byte_offset(src2, r * REG_SIZE), + BRW_REGISTER_TYPE_HF), + s * 2 + subword)) + ->writes_accumulator = true; + } + } + } + + if (!src0.is_null()) { + if (src0_type != BRW_REGISTER_TYPE_HF) { + fs_reg temp2 = bld.vgrf(src0_type, 1); + + bld.MOV(temp2, temp); + + bld.ADD(byte_offset(dest, r * dest_stride), + temp2, + byte_offset(src0, r * dest_stride)); + } else { + bld.ADD(byte_offset(dest, r * dest_stride), + temp, + byte_offset(src0, r * dest_stride)); + } + } else { + bld.MOV(byte_offset(dest, r * dest_stride), temp); + } + } +} + +static void +int8_using_dp4a(const fs_builder &bld, fs_inst *inst) +{ + /* We only intend to support configurations where the destination and + * accumulator have the same type. + */ + if (!inst->src[0].is_null()) + assert(inst->dst.type == inst->src[0].type); + + assert(inst->src[1].type == BRW_REGISTER_TYPE_B || + inst->src[1].type == BRW_REGISTER_TYPE_UB); + assert(inst->src[2].type == BRW_REGISTER_TYPE_B || + inst->src[2].type == BRW_REGISTER_TYPE_UB); + + const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB + ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; + + const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB + ? 
BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; + + fs_reg dest = inst->dst; + fs_reg src0 = inst->src[0]; + const fs_reg src1 = retype(inst->src[1], src1_type); + const fs_reg src2 = retype(inst->src[2], src2_type); + + const unsigned dest_stride = REG_SIZE; + + for (unsigned r = 0; r < inst->rcount; r++) { + if (!src0.is_null()) { + bld.MOV(dest, src0); + src0 = byte_offset(src0, dest_stride); + } else { + bld.MOV(dest, retype(brw_imm_d(0), dest.type)); + } + + for (unsigned s = 0; s < inst->sdepth; s++) { + bld.DP4A(dest, + dest, + byte_offset(src1, s * REG_SIZE), + component(byte_offset(src2, r * REG_SIZE), s)) + ->saturate = inst->saturate; + } + + dest = byte_offset(dest, dest_stride); + } +} + +static void +int8_using_mul_add(const fs_builder &bld, fs_inst *inst) +{ + /* We only intend to support configurations where the destination and + * accumulator have the same type. + */ + if (!inst->src[0].is_null()) + assert(inst->dst.type == inst->src[0].type); + + assert(inst->src[1].type == BRW_REGISTER_TYPE_B || + inst->src[1].type == BRW_REGISTER_TYPE_UB); + assert(inst->src[2].type == BRW_REGISTER_TYPE_B || + inst->src[2].type == BRW_REGISTER_TYPE_UB); + + const brw_reg_type src0_type = inst->dst.type; + + const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB + ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; + + const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB + ? 
BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; + + fs_reg dest = inst->dst; + fs_reg src0 = inst->src[0]; + const fs_reg src1 = retype(inst->src[1], src1_type); + const fs_reg src2 = retype(inst->src[2], src2_type); + + const unsigned dest_stride = REG_SIZE; + + for (unsigned r = 0; r < inst->rcount; r++) { + if (!src0.is_null()) { + bld.MOV(dest, src0); + src0 = byte_offset(src0, dest_stride); + } else { + bld.MOV(dest, retype(brw_imm_d(0), dest.type)); + } + + for (unsigned s = 0; s < inst->sdepth; s++) { + fs_reg temp1 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg temp2 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg temp3 = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + const brw_reg_type temp_type = + (inst->src[1].type == BRW_REGISTER_TYPE_B || + inst->src[2].type == BRW_REGISTER_TYPE_B) + ? BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW; + + /* Expand 8 dwords of packed bytes into 16 dwords of packed + * words. + * + * FINISHME: Gfx9 should not need this work around. Gfx11 + * may be able to use integer MAD. Both platforms may be + * able to use MAC. 
+ */ + bld.group(32, 0).MOV(retype(temp3, temp_type), + retype(byte_offset(src2, r * REG_SIZE), + inst->src[2].type)); + + bld.MUL(subscript(temp1, temp_type, 0), + subscript(retype(byte_offset(src1, s * REG_SIZE), + BRW_REGISTER_TYPE_UD), + inst->src[1].type, 0), + subscript(component(retype(temp3, + BRW_REGISTER_TYPE_UD), + s * 2), + temp_type, 0)); + + bld.MUL(subscript(temp1, temp_type, 1), + subscript(retype(byte_offset(src1, s * REG_SIZE), + BRW_REGISTER_TYPE_UD), + inst->src[1].type, 1), + subscript(component(retype(temp3, + BRW_REGISTER_TYPE_UD), + s * 2), + temp_type, 1)); + + bld.MUL(subscript(temp2, temp_type, 0), + subscript(retype(byte_offset(src1, s * REG_SIZE), + BRW_REGISTER_TYPE_UD), + inst->src[1].type, 2), + subscript(component(retype(temp3, + BRW_REGISTER_TYPE_UD), + s * 2 + 1), + temp_type, 0)); + + bld.MUL(subscript(temp2, temp_type, 1), + subscript(retype(byte_offset(src1, s * REG_SIZE), + BRW_REGISTER_TYPE_UD), + inst->src[1].type, 3), + subscript(component(retype(temp3, + BRW_REGISTER_TYPE_UD), + s * 2 + 1), + temp_type, 1)); + + bld.ADD(subscript(temp1, src0_type, 0), + subscript(temp1, temp_type, 0), + subscript(temp1, temp_type, 1)); + + bld.ADD(subscript(temp2, src0_type, 0), + subscript(temp2, temp_type, 0), + subscript(temp2, temp_type, 1)); + + bld.ADD(retype(temp1, src0_type), + retype(temp1, src0_type), + retype(temp2, src0_type)); + + bld.ADD(dest, dest, retype(temp1, src0_type)) + ->saturate = inst->saturate; + } + + dest = byte_offset(dest, dest_stride); + } +} + +bool +brw_lower_dpas(fs_visitor &v) +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, v.cfg) { + if (inst->opcode != BRW_OPCODE_DPAS) + continue; + + const fs_builder bld = fs_builder(&v, block, inst).group(8, 0).exec_all(); + + if (brw_reg_type_is_floating_point(inst->dst.type)) { + f16_using_mac(bld, inst); + } else { + if (v.devinfo->ver >= 12) { + int8_using_dp4a(bld, inst); + } else { + int8_using_mul_add(bld, inst); + } + } + + 
inst->remove(block); + progress = true; + } + + if (progress) + v.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} diff --git a/src/intel/compiler/elk/brw_fs_lower_pack.cpp b/src/intel/compiler/elk/brw_fs_lower_pack.cpp new file mode 100644 index 00000000000..3a60989ecda --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_lower_pack.cpp @@ -0,0 +1,92 @@ +/* + * Copyright © 2015 Connor Abbott + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#include "util/half_float.h"
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_fs_builder.h"
+
+using namespace brw;
+
+bool
+fs_visitor::lower_pack() /* Lower FS_OPCODE_PACK / FS_OPCODE_PACK_HALF_2x16_SPLIT into per-word MOV / F32TO16 sequences. */
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      if (inst->opcode != FS_OPCODE_PACK &&
+          inst->opcode != FS_OPCODE_PACK_HALF_2x16_SPLIT)
+         continue;
+
+      assert(inst->dst.file == VGRF);
+      assert(inst->saturate == false); /* saturate semantics would not survive the split into per-word MOVs */
+      fs_reg dst = inst->dst;
+
+      const fs_builder ibld(this, block, inst);
+      /* The lowering generates 2 instructions for what was previously 1. This
+       * can trick the IR to believe we're doing partial writes, but the
+       * register is actually fully written. Mark it as undef to help the IR
+       * reduce the liveness of the register.
+       */
+      if (!inst->is_partial_write())
+         ibld.emit_undef_for_dst(inst);
+
+      switch (inst->opcode) {
+      case FS_OPCODE_PACK:
+         for (unsigned i = 0; i < inst->sources; i++)
+            ibld.MOV(subscript(dst, inst->src[i].type, i), inst->src[i]);
+         break;
+      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+         assert(dst.type == BRW_REGISTER_TYPE_UD);
+
+         for (unsigned i = 0; i < inst->sources; i++) {
+            if (inst->src[i].file == IMM) {
+               const uint32_t half = _mesa_float_to_half(inst->src[i].f); /* constant-fold the f32->f16 conversion */
+               ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, i),
+                        brw_imm_uw(half));
+            } else if (i == 1 && devinfo->ver < 9) {
+               /* Pre-Skylake requires DWord aligned destinations */
+               fs_reg tmp = ibld.vgrf(BRW_REGISTER_TYPE_UD);
+               ibld.F32TO16(subscript(tmp, BRW_REGISTER_TYPE_HF, 0),
+                            inst->src[i]);
+               ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, 1),
+                        subscript(tmp, BRW_REGISTER_TYPE_UW, 0));
+            } else {
+               ibld.F32TO16(subscript(dst, BRW_REGISTER_TYPE_HF, i),
+                            inst->src[i]);
+            }
+         }
+         break;
+      default:
+         unreachable("skipped above");
+      }
+
+      inst->remove(block);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+
+   return progress;
+}
diff --git a/src/intel/compiler/elk/brw_fs_lower_regioning.cpp
b/src/intel/compiler/elk/brw_fs_lower_regioning.cpp new file mode 100644 index 00000000000..3bff7770cd0 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_lower_regioning.cpp @@ -0,0 +1,677 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_fs.h" +#include "brw_cfg.h" +#include "brw_fs_builder.h" + +using namespace brw; + +namespace { + /* From the SKL PRM Vol 2a, "Move": + * + * "A mov with the same source and destination type, no source modifier, + * and no saturation is a raw move. A packed byte destination region (B + * or UB type with HorzStride == 1 and ExecSize > 1) can only be written + * using raw move." 
+    */
+   bool
+   is_byte_raw_mov(const fs_inst *inst)
+   {
+      return type_sz(inst->dst.type) == 1 &&
+             inst->opcode == BRW_OPCODE_MOV &&
+             inst->src[0].type == inst->dst.type &&
+             !inst->saturate &&
+             !inst->src[0].negate &&
+             !inst->src[0].abs;
+   }
+
+   /*
+    * Return an acceptable byte stride for the destination of an instruction
+    * that requires it to have some particular alignment.
+    */
+   unsigned
+   required_dst_byte_stride(const fs_inst *inst)
+   {
+      if (inst->dst.is_accumulator()) {
+         /* If the destination is an accumulator, insist that we leave the
+          * stride alone.  We cannot "fix" accumulator destinations by writing
+          * to a temporary and emitting a MOV into the original destination.
+          * For multiply instructions (our one use of the accumulator), the
+          * MUL writes the full 66 bits of the accumulator whereas the MOV we
+          * would emit only writes 33 bits and leaves the top 33 bits
+          * undefined.
+          *
+          * It's safe to just require the original stride here because the
+          * lowering pass will detect the mismatch in has_invalid_src_region
+          * and fix the sources of the multiply instead of the destination.
+          */
+         return inst->dst.stride * type_sz(inst->dst.type);
+      } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
+                 !is_byte_raw_mov(inst)) {
+         return get_exec_type_size(inst);
+      } else {
+         /* Calculate the maximum byte stride and the minimum/maximum type
+          * size across all source and destination operands we are required to
+          * lower.
+          */
+         unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
+         unsigned min_size = type_sz(inst->dst.type);
+         unsigned max_size = type_sz(inst->dst.type);
+
+         for (unsigned i = 0; i < inst->sources; i++) {
+            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
+               const unsigned size = type_sz(inst->src[i].type);
+               max_stride = MAX2(max_stride, inst->src[i].stride * size);
+               min_size = MIN2(min_size, size);
+               max_size = MAX2(max_size, size);
+            }
+         }
+
+         /* All operands involved in lowering need to fit in the calculated
+          * stride.
+          */
+         assert(max_size <= 4 * min_size);
+
+         /* Attempt to use the largest byte stride among all present operands,
+          * but never exceed a stride of 4 since that would lead to illegal
+          * destination regions during lowering.
+          */
+         return MIN2(max_stride, 4 * min_size);
+      }
+   }
+
+   /*
+    * Return an acceptable byte sub-register offset for the destination of an
+    * instruction that requires it to be aligned to the sub-register offset of
+    * the sources.
+    */
+   unsigned
+   required_dst_byte_offset(const intel_device_info *devinfo, const fs_inst *inst)
+   {
+      for (unsigned i = 0; i < inst->sources; i++) {
+         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
+            if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
+                reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
+               return 0; /* sources disagree on sub-register offset: require offset 0 */
+      }
+
+      return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
+   }
+
+   /*
+    * Return the closest legal execution type for an instruction on
+    * the specified platform.
+    */
+   brw_reg_type
+   required_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
+   {
+      const brw_reg_type t = get_exec_type(inst);
+      const bool has_64bit = brw_reg_type_is_floating_point(t) ?
+         devinfo->has_64bit_float : devinfo->has_64bit_int;
+
+      switch (inst->opcode) {
+      case SHADER_OPCODE_SHUFFLE:
+         /* IVB has an issue (which we found empirically) where it reads
+          * two address register components per channel for indirectly
+          * addressed 64-bit sources.
+          *
+          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
+          *
+          *    "When source or destination datatype is 64b or operation is
+          *     integer DWord multiply, indirect addressing must not be
+          *     used."
+          *
+          * Work around both of the above and handle platforms that
+          * don't support 64-bit types at all.
+          */
+         if ((!devinfo->has_64bit_int ||
+              devinfo->platform == INTEL_PLATFORM_CHV ||
+              intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
+            return BRW_REGISTER_TYPE_UD;
+         else if (has_dst_aligned_region_restriction(devinfo, inst))
+            return brw_int_type(type_sz(t), false);
+         else
+            return t;
+
+      case SHADER_OPCODE_SEL_EXEC:
+         if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
+             type_sz(t) > 4)
+            return BRW_REGISTER_TYPE_UD;
+         else
+            return t;
+
+      case SHADER_OPCODE_QUAD_SWIZZLE:
+         if (has_dst_aligned_region_restriction(devinfo, inst))
+            return brw_int_type(type_sz(t), false);
+         else
+            return t;
+
+      case SHADER_OPCODE_CLUSTER_BROADCAST:
+         /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
+          *
+          *    "When source or destination datatype is 64b or operation is
+          *     integer DWord multiply, indirect addressing must not be
+          *     used."
+          *
+          * For MTL (verx10 == 125), float64 is supported, but int64 is not.
+          * Therefore we need to lower cluster broadcast using 32-bit int ops.
+          *
+          * For gfx12.5+ platforms that support int64, the register regions
+          * used by cluster broadcast aren't supported by the 64-bit pipeline.
+          *
+          * Work around the above and handle platforms that don't
+          * support 64-bit types at all.
+          */
+         if ((!has_64bit || devinfo->verx10 >= 125 ||
+              devinfo->platform == INTEL_PLATFORM_CHV ||
+              intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
+            return BRW_REGISTER_TYPE_UD;
+         else
+            return brw_int_type(type_sz(t), false);
+
+      case SHADER_OPCODE_BROADCAST:
+      case SHADER_OPCODE_MOV_INDIRECT:
+         if (((devinfo->verx10 == 70 ||
+               devinfo->platform == INTEL_PLATFORM_CHV ||
+               intel_device_info_is_9lp(devinfo) ||
+               devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
+             (devinfo->verx10 >= 125 &&
+              brw_reg_type_is_floating_point(inst->src[0].type)))
+            return brw_int_type(type_sz(t), false);
+         else
+            return t;
+
+      default:
+         return t;
+      }
+   }
+
+   /*
+    * Return the stride between channels of the specified register in
+    * byte units, or ~0u if the region cannot be represented with a
+    * single one-dimensional stride.
+    */
+   unsigned
+   byte_stride(const fs_reg &reg)
+   {
+      switch (reg.file) {
+      case BAD_FILE:
+      case UNIFORM:
+      case IMM:
+      case VGRF:
+      case MRF:
+      case ATTR:
+         return reg.stride * type_sz(reg.type);
+      case ARF:
+      case FIXED_GRF:
+         if (reg.is_null()) {
+            return 0; /* null register: no channel stride */
+         } else {
+            const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0; /* hstride/vstride are log2+1 encoded, 0 = stride of 0 */
+            const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
+            const unsigned width = 1 << reg.width;
+
+            if (width == 1) {
+               return vstride * type_sz(reg.type);
+            } else if (hstride * width == vstride) {
+               return hstride * type_sz(reg.type);
+            } else {
+               return ~0u;
+            }
+         }
+      default:
+         unreachable("Invalid register file");
+      }
+   }
+
+   /*
+    * Return whether the instruction has an unsupported channel bit layout
+    * specified for the i-th source region.
+    */
+   bool
+   has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
+                          unsigned i)
+   {
+      if (is_send(inst) || inst->is_math() || inst->is_control_source(i) ||
+          inst->opcode == BRW_OPCODE_DPAS) {
+         return false;
+      }
+
+      /* Empirical testing shows that Broadwell has a bug affecting half-float
+       * MAD instructions when any of its sources has a non-zero offset, such
+       * as:
+       *
+       * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
+       *
+       * We used to generate code like this for SIMD8 executions where we
+       * used to pack components Y and W of a vector at offset 16B of a SIMD
+       * register. The problem doesn't occur if the stride of the source is 0.
+       */
+      if (devinfo->ver == 8 &&
+          inst->opcode == BRW_OPCODE_MAD &&
+          inst->src[i].type == BRW_REGISTER_TYPE_HF &&
+          reg_offset(inst->src[i]) % REG_SIZE > 0 &&
+          inst->src[i].stride != 0) {
+         return true;
+      }
+
+      const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
+      const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
+
+      return has_dst_aligned_region_restriction(devinfo, inst) &&
+             !is_uniform(inst->src[i]) &&
+             (byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
+              src_byte_offset != dst_byte_offset);
+   }
+
+   /*
+    * Return whether the instruction has an unsupported channel bit layout
+    * specified for the destination region.
+    */
+   bool
+   has_invalid_dst_region(const intel_device_info *devinfo,
+                          const fs_inst *inst)
+   {
+      if (is_send(inst) || inst->is_math()) {
+         return false;
+      } else {
+         const brw_reg_type exec_type = get_exec_type(inst);
+         const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
+         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
+            type_sz(inst->dst.type) < type_sz(exec_type);
+
+         return (has_dst_aligned_region_restriction(devinfo, inst) &&
+                 (required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
+                  required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
+                (is_narrowing_conversion &&
+                 required_dst_byte_stride(inst) != byte_stride(inst->dst));
+      }
+   }
+
+   /**
+    * Return a non-zero value if the execution type of the instruction is
+    * unsupported.  The destination and sources matching the returned mask
+    * will be bit-cast to an integer type of appropriate size, lowering any
+    * source or destination modifiers into separate MOV instructions.
+    */
+   unsigned
+   has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
+   {
+      if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
+         switch (inst->opcode) {
+         case SHADER_OPCODE_SHUFFLE:
+         case SHADER_OPCODE_QUAD_SWIZZLE:
+         case SHADER_OPCODE_CLUSTER_BROADCAST:
+         case SHADER_OPCODE_BROADCAST:
+         case SHADER_OPCODE_MOV_INDIRECT:
+            return 0x1; /* bit-cast src0 (and the destination) only */
+
+         case SHADER_OPCODE_SEL_EXEC:
+            return 0x3; /* SEL_EXEC reads both sources */
+
+         default:
+            unreachable("Unknown invalid execution type source mask.");
+         }
+      } else {
+         return 0;
+      }
+   }
+
+   /*
+    * Return whether the instruction has unsupported source modifiers
+    * specified for the i-th source region.
+    */
+   bool
+   has_invalid_src_modifiers(const intel_device_info *devinfo,
+                             const fs_inst *inst, unsigned i)
+   {
+      return (!inst->can_do_source_mods(devinfo) &&
+              (inst->src[i].negate || inst->src[i].abs)) ||
+             ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
+              (inst->src[i].negate || inst->src[i].abs ||
+               inst->src[i].type != get_exec_type(inst)));
+   }
+
+   /*
+    * Return whether the instruction has an unsupported type conversion
+    * specified for the destination.
+    */
+   bool
+   has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
+   {
+      switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+         return false;
+      case BRW_OPCODE_SEL:
+         return inst->dst.type != get_exec_type(inst);
+      default:
+         /* FIXME: We assume the opcodes not explicitly mentioned before just
+          * work fine with arbitrary conversions, unless they need to be
+          * bit-cast.
+          */
+         return has_invalid_exec_type(devinfo, inst) &&
+                inst->dst.type != get_exec_type(inst);
+      }
+   }
+
+   /**
+    * Return whether the instruction has unsupported destination modifiers.
+    */
+   bool
+   has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
+   {
+      return (has_invalid_exec_type(devinfo, inst) &&
+              (inst->saturate || inst->conditional_mod)) ||
+             has_invalid_conversion(devinfo, inst);
+   }
+
+   /**
+    * Return whether the instruction has non-standard semantics for the
+    * conditional mod which don't cause the flag register to be updated with
+    * the comparison result.
+    */
+   bool
+   has_inconsistent_cmod(const fs_inst *inst)
+   {
+      return inst->opcode == BRW_OPCODE_SEL ||
+             inst->opcode == BRW_OPCODE_CSEL ||
+             inst->opcode == BRW_OPCODE_IF ||
+             inst->opcode == BRW_OPCODE_WHILE;
+   }
+
+   bool
+   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
+}
+
+namespace brw {
+   /**
+    * Remove any modifiers from the \p i-th source region of the instruction,
+    * including negate, abs and any implicit type conversion to the execution
+    * type.  Instead any source modifiers will be implemented as a separate
+    * MOV instruction prior to the original instruction.
+    */
+   bool
+   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
+   {
+      assert(inst->components_read(i) == 1);
+      assert(v->devinfo->has_integer_dword_mul ||
+             inst->opcode != BRW_OPCODE_MUL ||
+             brw_reg_type_is_floating_point(get_exec_type(inst)) ||
+             MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
+             type_sz(inst->src[i].type) == get_exec_type_size(inst));
+
+      const fs_builder ibld(v, block, inst);
+      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
+
+      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i])); /* the new MOV may itself need further lowering */
+      inst->src[i] = tmp;
+
+      return true;
+   }
+}
+
+namespace {
+   /**
+    * Remove any modifiers from the destination region of the instruction,
+    * including saturate, conditional mod and any implicit type conversion
+    * from the execution type.  Instead any destination modifiers will be
+    * implemented as a separate MOV instruction after the original
+    * instruction.
+    */
+   bool
+   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
+   {
+      const fs_builder ibld(v, block, inst);
+      const brw_reg_type type = get_exec_type(inst);
+      /* Not strictly necessary, but if possible use a temporary with the same
+       * channel alignment as the current destination in order to avoid
+       * violating the restrictions enforced later on by lower_src_region()
+       * and lower_dst_region(), which would introduce additional copy
+       * instructions into the program unnecessarily.
+       */
+      const unsigned stride =
+         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
+         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
+      fs_reg tmp = ibld.vgrf(type, stride);
+      ibld.UNDEF(tmp);
+      tmp = horiz_stride(tmp, stride);
+
+      /* Emit a MOV taking care of all the destination modifiers.
+       */
+      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
+      mov->saturate = inst->saturate;
+      if (!has_inconsistent_cmod(inst))
+         mov->conditional_mod = inst->conditional_mod;
+      if (inst->opcode != BRW_OPCODE_SEL) {
+         mov->predicate = inst->predicate;
+         mov->predicate_inverse = inst->predicate_inverse;
+      }
+      mov->flag_subreg = inst->flag_subreg;
+      lower_instruction(v, block, mov); /* recursively legalize the fix-up MOV */
+
+      /* Point the original instruction at the temporary, and clean up any
+       * destination modifiers.
+       */
+      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
+      inst->dst = tmp;
+      inst->size_written = inst->dst.component_size(inst->exec_size);
+      inst->saturate = false;
+      if (!has_inconsistent_cmod(inst))
+         inst->conditional_mod = BRW_CONDITIONAL_NONE;
+
+      assert(!inst->flags_written(v->devinfo) || !mov->predicate);
+      return true;
+   }
+
+   /**
+    * Remove any non-trivial shuffling of data from the \p i-th source region
+    * of the instruction.  Instead implement the region as a series of integer
+    * copies into a temporary with the same channel layout as the destination.
+    */
+   bool
+   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
+   {
+      assert(inst->components_read(i) == 1);
+      const fs_builder ibld(v, block, inst);
+      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
+                              type_sz(inst->src[i].type);
+      assert(stride > 0);
+      fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
+      ibld.UNDEF(tmp);
+      tmp = horiz_stride(tmp, stride);
+
+      /* Emit a series of 32-bit integer copies with any source modifiers
+       * cleaned up (because their semantics are dependent on the type).
+       */
+      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
+                                                 false);
+      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
+      fs_reg raw_src = inst->src[i];
+      raw_src.negate = false;
+      raw_src.abs = false;
+
+      for (unsigned j = 0; j < n; j++)
+         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));
+
+      /* Point the original instruction at the temporary, making sure to keep
+       * any source modifiers in the instruction.
+       */
+      fs_reg lower_src = tmp;
+      lower_src.negate = inst->src[i].negate;
+      lower_src.abs = inst->src[i].abs;
+      inst->src[i] = lower_src;
+
+      return true;
+   }
+
+   /**
+    * Remove any non-trivial shuffling of data from the destination region of
+    * the instruction.  Instead implement the region as a series of integer
+    * copies from a temporary with a channel layout compatible with the
+    * sources.
+    */
+   bool
+   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
+   {
+      /* We cannot replace the result of an integer multiply which writes the
+       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
+       * value whereas the MOV will act on only 32 or 33 bits of the
+       * accumulator.
+       */
+      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
+             brw_reg_type_is_floating_point(inst->dst.type));
+
+      const fs_builder ibld(v, block, inst);
+      const unsigned stride = required_dst_byte_stride(inst) /
+                              type_sz(inst->dst.type);
+      assert(stride > 0);
+      fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
+      ibld.UNDEF(tmp);
+      tmp = horiz_stride(tmp, stride);
+
+      /* Emit a series of 32-bit integer copies from the temporary into the
+       * original destination.
+       */
+      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
+                                                 false);
+      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
+
+      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
+         /* Note that in general we cannot simply predicate the copies on the
+          * same flag register as the original instruction, since it may have
+          * been overwritten by the instruction itself.  Instead initialize
+          * the temporary with the previous contents of the destination
+          * register.
+          */
+         for (unsigned j = 0; j < n; j++)
+            ibld.MOV(subscript(tmp, raw_type, j),
+                     subscript(inst->dst, raw_type, j));
+      }
+
+      for (unsigned j = 0; j < n; j++)
+         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
+                                        subscript(tmp, raw_type, j));
+
+      /* Point the original instruction at the temporary, making sure to keep
+       * any destination modifiers in the instruction.
+       */
+      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
+      inst->dst = tmp;
+      inst->size_written = inst->dst.component_size(inst->exec_size);
+
+      return true;
+   }
+
+   /**
+    * Change sources and destination of the instruction to an
+    * appropriate legal type, splitting the instruction into multiple
+    * ones of smaller execution type if necessary, to be used in cases
+    * where the execution type of an instruction is unsupported.
+    */
+   bool
+   lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
+   {
+      assert(inst->dst.type == get_exec_type(inst));
+      const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
+      const brw_reg_type raw_type = required_exec_type(v->devinfo, inst);
+      const unsigned n = get_exec_type_size(inst) / type_sz(raw_type);
+      const fs_builder ibld(v, block, inst);
+
+      fs_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
+      ibld.UNDEF(tmp);
+      tmp = horiz_stride(tmp, inst->dst.stride);
+
+      for (unsigned j = 0; j < n; j++) {
+         fs_inst sub_inst = *inst; /* shallow copy, then narrow dst/srcs to sub-word j */
+
+         for (unsigned i = 0; i < inst->sources; i++) {
+            if (mask & (1u << i)) {
+               assert(inst->src[i].type == inst->dst.type);
+               sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
+            }
+         }
+
+         sub_inst.dst = subscript(tmp, raw_type, j);
+
+         assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
+         assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
+         ibld.emit(sub_inst);
+
+         fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
+                                 subscript(tmp, raw_type, j));
+         if (inst->opcode != BRW_OPCODE_SEL) {
+            mov->predicate = inst->predicate;
+            mov->predicate_inverse = inst->predicate_inverse;
+         }
+         lower_instruction(v, block, mov); /* the copy-back MOV may itself need lowering */
+      }
+
+      inst->remove(block);
+
+      return true;
+   }
+
+   /**
+    * Legalize the source and destination regioning controls of the specified
+    * instruction.
+    */
+   bool
+   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
+   {
+      const intel_device_info *devinfo = v->devinfo;
+      bool progress = false;
+
+      if (has_invalid_dst_modifiers(devinfo, inst))
+         progress |= lower_dst_modifiers(v, block, inst);
+
+      if (has_invalid_dst_region(devinfo, inst))
+         progress |= lower_dst_region(v, block, inst);
+
+      for (unsigned i = 0; i < inst->sources; i++) {
+         if (has_invalid_src_modifiers(devinfo, inst, i))
+            progress |= lower_src_modifiers(v, block, inst, i);
+
+         if (has_invalid_src_region(devinfo, inst, i))
+            progress |= lower_src_region(v, block, inst, i);
+      }
+
+      if (has_invalid_exec_type(devinfo, inst))
+         progress |= lower_exec_type(v, block, inst);
+
+      return progress;
+   }
+}
+
+bool
+fs_visitor::lower_regioning() /* top-level entry point of the regioning legalization pass */
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
+      progress |= lower_instruction(this, block, inst);
+
+   if (progress)
+      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
+
+   return progress;
+}
diff --git a/src/intel/compiler/elk/brw_fs_nir.cpp b/src/intel/compiler/elk/brw_fs_nir.cpp
new file mode 100644
index 00000000000..d16ca1a5ae8
--- /dev/null
+++ b/src/intel/compiler/elk/brw_fs_nir.cpp
@@ -0,0 +1,8804 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_fs_builder.h"
+#include "brw_nir.h"
+#include "brw_eu.h"
+#include "nir.h"
+#include "nir_intrinsics.h"
+#include "nir_search_helpers.h"
+#include "util/u_math.h"
+#include "util/bitscan.h"
+
+/* NOTE(review): the target of this include was lost during extraction
+ * (angle-bracket header name stripped); presumably a C++ standard header
+ * — confirm against the upstream file.
+ */
+#include 
+
+using namespace brw;
+
+/* Description of a resource binding attached to an SSA def (see
+ * nir_to_brw_state::ssa_bind_infos below).
+ */
+struct brw_fs_bind_info {
+   bool valid;
+   bool bindless;
+   unsigned block;
+   unsigned set;
+   unsigned binding;
+};
+
+/* Transient state of the NIR -> BRW IR translation.  One instance is
+ * threaded through all of the fs_nir_emit_*() helpers for the duration
+ * of the conversion.
+ */
+struct nir_to_brw_state {
+   fs_visitor &s;
+   const nir_shader *nir;
+   const intel_device_info *devinfo;
+   void *mem_ctx;
+
+   /* Points to the end of the program.  Annotated with the current NIR
+    * instruction when applicable.
+    */
+   fs_builder bld;
+
+   /* Per-SSA-def lookup tables, all sized by impl->ssa_alloc and
+    * allocated in fs_nir_emit_impl().
+    */
+   fs_reg *ssa_values;
+   fs_inst **resource_insts;
+   struct brw_fs_bind_info *ssa_bind_infos;
+   fs_reg *resource_values;
+   /* Indexed by SYSTEM_VALUE_*; filled in fs_nir_emit_system_values(). */
+   fs_reg *system_values;
+};
+
+static fs_reg get_nir_src(nir_to_brw_state &ntb, const nir_src &src);
+static fs_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def);
+static nir_component_mask_t get_nir_write_mask(const nir_def &def);
+
+static void fs_nir_emit_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld, nir_intrinsic_instr *instr);
+static fs_reg emit_samplepos_setup(nir_to_brw_state &ntb);
+static fs_reg emit_sampleid_setup(nir_to_brw_state &ntb);
+static fs_reg emit_samplemaskin_setup(nir_to_brw_state &ntb);
+static fs_reg emit_shading_rate_setup(nir_to_brw_state &ntb);
+
+static void fs_nir_emit_impl(nir_to_brw_state &ntb, nir_function_impl *impl);
+static void fs_nir_emit_cf_list(nir_to_brw_state &ntb, exec_list *list);
+static void fs_nir_emit_if(nir_to_brw_state &ntb, nir_if *if_stmt);
+static void fs_nir_emit_loop(nir_to_brw_state &ntb, nir_loop *loop);
+static void fs_nir_emit_block(nir_to_brw_state &ntb, nir_block *block);
+static void fs_nir_emit_instr(nir_to_brw_state &ntb, nir_instr *instr);
+
+static void fs_nir_emit_surface_atomic(nir_to_brw_state &ntb,
+                                       const fs_builder &bld,
+                                       nir_intrinsic_instr *instr,
+                                       fs_reg surface,
+                                       bool bindless);
+static void fs_nir_emit_global_atomic(nir_to_brw_state &ntb,
+                                      const fs_builder &bld,
+                                      nir_intrinsic_instr *instr);
+
+/* Allocate the outputs[] registers for the current stage.  Stages whose
+ * outputs are written through messages instead of the outputs[] array
+ * (TCS, task, mesh, fragment) are skipped.
+ */
+static void
+fs_nir_setup_outputs(nir_to_brw_state &ntb)
+{
+   fs_visitor &s = ntb.s;
+
+   if (s.stage == MESA_SHADER_TESS_CTRL ||
+       s.stage == MESA_SHADER_TASK ||
+       s.stage == MESA_SHADER_MESH ||
+       s.stage == MESA_SHADER_FRAGMENT)
+      return;
+
+   unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
+
+   /* Calculate the size of output registers in a separate pass, before
+    * allocating them.  With ARB_enhanced_layouts, multiple output variables
+    * may occupy the same slot, but have different type sizes.
+    */
+   nir_foreach_shader_out_variable(var, s.nir) {
+      const int loc = var->data.driver_location;
+      const unsigned var_vec4s = nir_variable_count_slots(var, var->type);
+      vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
+   }
+
+   for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
+      if (vec4s[loc] == 0) {
+         loc++;
+         continue;
+      }
+
+      unsigned reg_size = vec4s[loc];
+
+      /* Check if there are any ranges that start within this range and extend
+       * past it. If so, include them in this allocation.
+       */
+      for (unsigned i = 1; i < reg_size; i++) {
+         assert(i + loc < ARRAY_SIZE(vec4s));
+         reg_size = MAX2(vec4s[i + loc] + i, reg_size);
+      }
+
+      /* One vec4 == four float components per slot. */
+      fs_reg reg = ntb.bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
+      for (unsigned i = 0; i < reg_size; i++) {
+         assert(loc + i < ARRAY_SIZE(s.outputs));
+         s.outputs[loc + i] = offset(reg, ntb.bld, 4 * i);
+      }
+
+      loc += reg_size;
+   }
+}
+
+/* Record the number of user uniforms and, for pre-Gfx12.5 compute,
+ * append the subgroup-ID builtin param.  A no-op on recompiles.
+ */
+static void
+fs_nir_setup_uniforms(fs_visitor &s)
+{
+   const intel_device_info *devinfo = s.devinfo;
+
+   /* Only the first compile gets to set up uniforms. */
+   if (s.push_constant_loc)
+      return;
+
+   s.uniforms = s.nir->num_uniforms / 4;
+
+   if (gl_shader_stage_is_compute(s.stage) && devinfo->verx10 < 125) {
+      /* Add uniforms for builtins after regular NIR uniforms. */
+      assert(s.uniforms == s.prog_data->nr_params);
+
+      /* Subgroup ID must be the last uniform on the list.  This will make
+       * easier later to split between cross thread and per thread
+       * uniforms.
+ */ + uint32_t *param = brw_stage_prog_data_add_params(s.prog_data, 1); + *param = BRW_PARAM_BUILTIN_SUBGROUP_ID; + s.uniforms++; + } +} + +static fs_reg +emit_work_group_id_setup(nir_to_brw_state &ntb) +{ + fs_visitor &s = ntb.s; + const fs_builder &bld = ntb.bld; + + assert(gl_shader_stage_is_compute(s.stage)); + + fs_reg id = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); + + struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD)); + bld.MOV(id, r0_1); + + struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD)); + struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD)); + bld.MOV(offset(id, bld, 1), r0_6); + bld.MOV(offset(id, bld, 2), r0_7); + + return id; +} + +static bool +emit_system_values_block(nir_to_brw_state &ntb, nir_block *block) +{ + fs_visitor &s = ntb.s; + fs_reg *reg; + + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_vertex_id: + case nir_intrinsic_load_base_vertex: + unreachable("should be lowered by nir_lower_system_values()."); + + case nir_intrinsic_load_vertex_id_zero_base: + case nir_intrinsic_load_is_indexed_draw: + case nir_intrinsic_load_first_vertex: + case nir_intrinsic_load_instance_id: + case nir_intrinsic_load_base_instance: + unreachable("should be lowered by brw_nir_lower_vs_inputs()."); + break; + + case nir_intrinsic_load_draw_id: + /* For Task/Mesh, draw_id will be handled later in + * nir_emit_mesh_task_intrinsic(). 
+ */ + if (!gl_shader_stage_is_mesh(s.stage)) + unreachable("should be lowered by brw_nir_lower_vs_inputs()."); + break; + + case nir_intrinsic_load_invocation_id: + if (s.stage == MESA_SHADER_TESS_CTRL) + break; + assert(s.stage == MESA_SHADER_GEOMETRY); + reg = &ntb.system_values[SYSTEM_VALUE_INVOCATION_ID]; + if (reg->file == BAD_FILE) { + *reg = s.gs_payload().instance_id; + } + break; + + case nir_intrinsic_load_sample_pos: + case nir_intrinsic_load_sample_pos_or_center: + assert(s.stage == MESA_SHADER_FRAGMENT); + reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_POS]; + if (reg->file == BAD_FILE) + *reg = emit_samplepos_setup(ntb); + break; + + case nir_intrinsic_load_sample_id: + assert(s.stage == MESA_SHADER_FRAGMENT); + reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_ID]; + if (reg->file == BAD_FILE) + *reg = emit_sampleid_setup(ntb); + break; + + case nir_intrinsic_load_sample_mask_in: + assert(s.stage == MESA_SHADER_FRAGMENT); + assert(s.devinfo->ver >= 7); + reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_MASK_IN]; + if (reg->file == BAD_FILE) + *reg = emit_samplemaskin_setup(ntb); + break; + + case nir_intrinsic_load_workgroup_id: + case nir_intrinsic_load_workgroup_id_zero_base: + if (gl_shader_stage_is_mesh(s.stage)) + unreachable("should be lowered by nir_lower_compute_system_values()."); + assert(gl_shader_stage_is_compute(s.stage)); + reg = &ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID]; + if (reg->file == BAD_FILE) + *reg = emit_work_group_id_setup(ntb); + break; + + case nir_intrinsic_load_helper_invocation: + assert(s.stage == MESA_SHADER_FRAGMENT); + reg = &ntb.system_values[SYSTEM_VALUE_HELPER_INVOCATION]; + if (reg->file == BAD_FILE) { + const fs_builder abld = + ntb.bld.annotate("gl_HelperInvocation", NULL); + + /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the + * pixel mask is in g1.7 of the thread payload. 
+ * + * We move the per-channel pixel enable bit to the low bit of each + * channel by shifting the byte containing the pixel mask by the + * vector immediate 0x76543210UV. + * + * The region of <1,8,0> reads only 1 byte (the pixel masks for + * subspans 0 and 1) in SIMD8 and an additional byte (the pixel + * masks for 2 and 3) in SIMD16. + */ + fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1); + + for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) { + const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i); + /* According to the "PS Thread Payload for Normal + * Dispatch" pages on the BSpec, the dispatch mask is + * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on + * gfx6+. + */ + const struct brw_reg reg = s.devinfo->ver >= 20 ? + xe2_vec1_grf(i, 15) : brw_vec1_grf(i + 1, 7); + hbld.SHR(offset(shifted, hbld, i), + stride(retype(reg, BRW_REGISTER_TYPE_UB), 1, 8, 0), + brw_imm_v(0x76543210)); + } + + /* A set bit in the pixel mask means the channel is enabled, but + * that is the opposite of gl_HelperInvocation so we need to invert + * the mask. + * + * The negate source-modifier bit of logical instructions on Gfx8+ + * performs 1's complement negation, so we can use that instead of + * a NOT instruction. + */ + fs_reg inverted = negate(shifted); + if (s.devinfo->ver < 8) { + inverted = abld.vgrf(BRW_REGISTER_TYPE_UW); + abld.NOT(inverted, shifted); + } + + /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing + * with 1 and negating. 
+ */ + fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.AND(anded, inverted, brw_imm_uw(1)); + + fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1); + abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D))); + *reg = dst; + } + break; + + case nir_intrinsic_load_frag_shading_rate: + reg = &ntb.system_values[SYSTEM_VALUE_FRAG_SHADING_RATE]; + if (reg->file == BAD_FILE) + *reg = emit_shading_rate_setup(ntb); + break; + + default: + break; + } + } + + return true; +} + +static void +fs_nir_emit_system_values(nir_to_brw_state &ntb) +{ + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + ntb.system_values = ralloc_array(ntb.mem_ctx, fs_reg, SYSTEM_VALUE_MAX); + for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) { + ntb.system_values[i] = fs_reg(); + } + + /* Always emit SUBGROUP_INVOCATION. Dead code will clean it up if we + * never end up using it. + */ + { + const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL); + fs_reg ® = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; + reg = abld.vgrf(BRW_REGISTER_TYPE_UW); + abld.UNDEF(reg); + + const fs_builder allbld8 = abld.group(8, 0).exec_all(); + allbld8.MOV(reg, brw_imm_v(0x76543210)); + if (s.dispatch_width > 8) + allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u)); + if (s.dispatch_width > 16) { + const fs_builder allbld16 = abld.group(16, 0).exec_all(); + allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u)); + } + } + + nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir); + nir_foreach_block(block, impl) + emit_system_values_block(ntb, block); +} + +static void +fs_nir_emit_impl(nir_to_brw_state &ntb, nir_function_impl *impl) +{ + ntb.ssa_values = rzalloc_array(ntb.mem_ctx, fs_reg, impl->ssa_alloc); + ntb.resource_insts = rzalloc_array(ntb.mem_ctx, fs_inst *, impl->ssa_alloc); + ntb.ssa_bind_infos = rzalloc_array(ntb.mem_ctx, struct brw_fs_bind_info, impl->ssa_alloc); + ntb.resource_values = rzalloc_array(ntb.mem_ctx, fs_reg, impl->ssa_alloc); + + 
+   fs_nir_emit_cf_list(ntb, &impl->body);
+}
+
+/* Dispatch each control-flow node of a NIR cf_list to the matching
+ * emit helper.
+ */
+static void
+fs_nir_emit_cf_list(nir_to_brw_state &ntb, exec_list *list)
+{
+   exec_list_validate(list);
+   foreach_list_typed(nir_cf_node, node, node, list) {
+      switch (node->type) {
+      case nir_cf_node_if:
+         fs_nir_emit_if(ntb, nir_cf_node_as_if(node));
+         break;
+
+      case nir_cf_node_loop:
+         fs_nir_emit_loop(ntb, nir_cf_node_as_loop(node));
+         break;
+
+      case nir_cf_node_block:
+         fs_nir_emit_block(ntb, nir_cf_node_as_block(node));
+         break;
+
+      default:
+         unreachable("Invalid CFG node block");
+      }
+   }
+}
+
+/* Translate a NIR if-statement into IF/ELSE/ENDIF, folding a leading
+ * inot into the predicate-inverse bit where possible.
+ */
+static void
+fs_nir_emit_if(nir_to_brw_state &ntb, nir_if *if_stmt)
+{
+   const intel_device_info *devinfo = ntb.devinfo;
+   const fs_builder &bld = ntb.bld;
+
+   bool invert;
+   fs_reg cond_reg;
+
+   /* If the condition has the form !other_condition, use other_condition as
+    * the source, but invert the predicate on the if instruction.
+    */
+   nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
+   if (cond != NULL && cond->op == nir_op_inot) {
+      invert = true;
+      cond_reg = get_nir_src(ntb, cond->src[0].src);
+      cond_reg = offset(cond_reg, bld, cond->src[0].swizzle[0]);
+
+      if (devinfo->ver <= 5 &&
+          (cond->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
+         /* redo boolean resolve on gen5 */
+         fs_reg masked = ntb.s.vgrf(glsl_int_type());
+         bld.AND(masked, cond_reg, brw_imm_d(1));
+         masked.negate = true;
+         fs_reg tmp = bld.vgrf(cond_reg.type);
+         bld.MOV(retype(tmp, BRW_REGISTER_TYPE_D), masked);
+         cond_reg = tmp;
+      }
+   } else {
+      invert = false;
+      cond_reg = get_nir_src(ntb, if_stmt->condition);
+   }
+
+   /* first, put the condition into f0 */
+   fs_inst *inst = bld.MOV(bld.null_reg_d(),
+                           retype(cond_reg, BRW_REGISTER_TYPE_D));
+   inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+   bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;
+
+   fs_nir_emit_cf_list(ntb, &if_stmt->then_list);
+
+   /* ELSE is only needed when the else-list has real content. */
+   if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
+      bld.emit(BRW_OPCODE_ELSE);
+
+      fs_nir_emit_cf_list(ntb, &if_stmt->else_list);
+   }
+
+   bld.emit(BRW_OPCODE_ENDIF);
+
+   if (devinfo->ver < 7)
+      ntb.s.limit_dispatch_width(16, "Non-uniform control flow unsupported "
+                                 "in SIMD32 mode.");
+}
+
+/* Translate a NIR loop into a DO ... WHILE block. */
+static void
+fs_nir_emit_loop(nir_to_brw_state &ntb, nir_loop *loop)
+{
+   const intel_device_info *devinfo = ntb.devinfo;
+   const fs_builder &bld = ntb.bld;
+
+   assert(!nir_loop_has_continue_construct(loop));
+   bld.emit(BRW_OPCODE_DO);
+
+   fs_nir_emit_cf_list(ntb, &loop->body);
+
+   bld.emit(BRW_OPCODE_WHILE);
+
+   if (devinfo->ver < 7)
+      ntb.s.limit_dispatch_width(16, "Non-uniform control flow unsupported "
+                                 "in SIMD32 mode.");
+}
+
+/* Emit every instruction of a NIR block, restoring the builder cursor
+ * afterwards (fs_nir_emit_instr may re-annotate it).
+ */
+static void
+fs_nir_emit_block(nir_to_brw_state &ntb, nir_block *block)
+{
+   fs_builder bld = ntb.bld;
+
+   nir_foreach_instr(instr, block) {
+      fs_nir_emit_instr(ntb, instr);
+   }
+
+   ntb.bld = bld;
+}
+
+/**
+ * Recognizes a parent instruction of nir_op_extract_* and changes the type to
+ * match instr.
+ */
+static bool
+optimize_extract_to_float(nir_to_brw_state &ntb, nir_alu_instr *instr,
+                          const fs_reg &result)
+{
+   const intel_device_info *devinfo = ntb.devinfo;
+   const fs_builder &bld = ntb.bld;
+
+   if (!instr->src[0].src.ssa->parent_instr)
+      return false;
+
+   if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
+      return false;
+
+   nir_alu_instr *src0 =
+      nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+
+   if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
+       src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
+      return false;
+
+   unsigned element = nir_src_as_uint(src0->src[1].src);
+
+   /* Element type to extract.*/
+   const brw_reg_type type = brw_int_type(
+      src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ?
+      2 : 1,
+      src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
+
+   fs_reg op0 = get_nir_src(ntb, src0->src[0].src);
+   op0.type = brw_type_for_nir_type(devinfo,
+      (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
+                     nir_src_bit_size(src0->src[0].src)));
+   op0 = offset(op0, bld, src0->src[0].swizzle[0]);
+
+   /* A single MOV with a subscripted source does extract + convert. */
+   bld.MOV(result, subscript(op0, type, element));
+   return true;
+}
+
+/* Recognize bcsel(gl_FrontFacing, ±1.0, ∓1.0) and emit it directly from
+ * the front/back-facing payload bit instead of real control flow.
+ * Returns false when the pattern does not match.
+ */
+static bool
+optimize_frontfacing_ternary(nir_to_brw_state &ntb,
+                             nir_alu_instr *instr,
+                             const fs_reg &result)
+{
+   const intel_device_info *devinfo = ntb.devinfo;
+   fs_visitor &s = ntb.s;
+
+   nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
+   if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
+      return false;
+
+   if (!nir_src_is_const(instr->src[1].src) ||
+       !nir_src_is_const(instr->src[2].src))
+      return false;
+
+   const float value1 = nir_src_as_float(instr->src[1].src);
+   const float value2 = nir_src_as_float(instr->src[2].src);
+   if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
+      return false;
+
+   /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
+   assert(value1 == -value2);
+
+   fs_reg tmp = s.vgrf(glsl_int_type());
+
+   if (devinfo->ver >= 20) {
+      /* Gfx20+ has separate back-facing bits for each pair of
+       * subspans in order to support multiple polygons, so we need to
+       * use a <1;8,0> region in order to select the correct word for
+       * each channel.  Unfortunately they're no longer aligned to the
+       * sign bit of a 16-bit word, so a left shift is necessary.
+       */
+      fs_reg ff = ntb.bld.vgrf(BRW_REGISTER_TYPE_UW);
+
+      for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
+         const fs_builder hbld = ntb.bld.group(16, i);
+         const struct brw_reg gi_uw = retype(xe2_vec1_grf(i, 9),
+                                             BRW_REGISTER_TYPE_UW);
+         hbld.SHL(offset(ff, hbld, i), stride(gi_uw, 1, 8, 0), brw_imm_ud(4));
+      }
+
+      if (value1 == -1.0f)
+         ff.negate = true;
+
+      ntb.bld.OR(subscript(tmp, BRW_REGISTER_TYPE_UW, 1), ff,
+                 brw_imm_uw(0x3f80));
+
+   } else if (devinfo->ver >= 12 && s.max_polygons == 2) {
+      /* According to the BSpec "PS Thread Payload for Normal
+       * Dispatch", the front/back facing interpolation bit is stored
+       * as bit 15 of either the R1.1 or R1.6 poly info field, for the
+       * first and second polygons respectively in multipolygon PS
+       * dispatch mode.
+       */
+      assert(s.dispatch_width == 16);
+
+      for (unsigned i = 0; i < s.max_polygons; i++) {
+         const fs_builder hbld = ntb.bld.group(8, i);
+         struct brw_reg g1 = retype(brw_vec1_grf(1, 1 + 5 * i),
+                                    BRW_REGISTER_TYPE_UW);
+
+         if (value1 == -1.0f)
+            g1.negate = true;
+
+         hbld.OR(subscript(offset(tmp, hbld, i), BRW_REGISTER_TYPE_UW, 1),
+                 g1, brw_imm_uw(0x3f80));
+      }
+
+   } else if (devinfo->ver >= 12) {
+      /* Bit 15 of g1.1 is 0 if the polygon is front facing. */
+      fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));
+
+      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
+       *
+       *    or(8)  tmp.1<2>W  g1.1<0,1,0>W  0x00003f80W
+       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
+       *
+       * and negate g1.1<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
+       */
+      if (value1 == -1.0f)
+         g1.negate = true;
+
+      ntb.bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
+                 g1, brw_imm_uw(0x3f80));
+   } else if (devinfo->ver >= 6) {
+      /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
+      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
+
+      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
+       *
+       *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
+       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
+       *
+       * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
+       *
+       * This negation looks like it's safe in practice, because bits 0:4 will
+       * surely be TRIANGLES
+       */
+
+      if (value1 == -1.0f) {
+         g0.negate = true;
+      }
+
+      ntb.bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
+                 g0, brw_imm_uw(0x3f80));
+   } else {
+      /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
+      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
+
+      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
+       *
+       *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
+       *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
+       *
+       * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
+       *
+       * This negation looks like it's safe in practice, because bits 0:4 will
+       * surely be TRIANGLES
+       */
+
+      if (value1 == -1.0f) {
+         g1_6.negate = true;
+      }
+
+      ntb.bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
+   }
+   ntb.bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
+
+   return true;
+}
+
+/* Map an explicit-rounding f2f16 NIR opcode to the BRW rounding mode. */
+static brw_rnd_mode
+brw_rnd_mode_from_nir_op (const nir_op op) {
+   switch (op) {
+   case nir_op_f2f16_rtz:
+      return BRW_RND_MODE_RTZ;
+   case nir_op_f2f16_rtne:
+      return BRW_RND_MODE_RTNE;
+   default:
+      unreachable("Operation doesn't support rounding mode");
+   }
+}
+
+/* Pick the rounding mode requested by the shader's float-controls
+ * execution mode, or UNSPECIFIED when none is requested.
+ */
+static brw_rnd_mode
+brw_rnd_mode_from_execution_mode(unsigned execution_mode)
+{
+   if (nir_has_any_rounding_mode_rtne(execution_mode))
+      return BRW_RND_MODE_RTNE;
+   if (nir_has_any_rounding_mode_rtz(execution_mode))
+      return BRW_RND_MODE_RTZ;
+   return BRW_RND_MODE_UNSPECIFIED;
+}
+
+/* Common setup for ALU translation: retype the destination to the NIR
+ * output type and fetch/retype every source into op[].  Returns the
+ * (possibly channel-offset) destination register.
+ */
+static fs_reg
+prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
+                                    const fs_builder &bld,
+                                    nir_alu_instr *instr,
+                                    fs_reg *op,
+                                    bool need_dest)
+{
+   const intel_device_info *devinfo = ntb.devinfo;
+
+   fs_reg result =
+      need_dest ?
+      get_nir_def(ntb, instr->def) : bld.null_reg_ud();
+
+   result.type = brw_type_for_nir_type(devinfo,
+      (nir_alu_type)(nir_op_infos[instr->op].output_type |
+                     instr->def.bit_size));
+
+   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+      op[i] = get_nir_src(ntb, instr->src[i].src);
+      op[i].type = brw_type_for_nir_type(devinfo,
+         (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
+                        nir_src_bit_size(instr->src[i].src)));
+   }
+
+   /* Move and vecN instructions may still be vectored.  Return the raw,
+    * vectored source and destination so that fs_visitor::nir_emit_alu can
+    * handle it.  Other callers should not have to handle these kinds of
+    * instructions.
+    */
+   switch (instr->op) {
+   case nir_op_mov:
+   case nir_op_vec2:
+   case nir_op_vec3:
+   case nir_op_vec4:
+   case nir_op_vec8:
+   case nir_op_vec16:
+      return result;
+   default:
+      break;
+   }
+
+   /* At this point, we have dealt with any instruction that operates on
+    * more than a single channel.  Therefore, we can just adjust the source
+    * and destination registers for that channel and emit the instruction.
+    */
+   unsigned channel = 0;
+   if (nir_op_infos[instr->op].output_size == 0) {
+      /* Since NIR is doing the scalarizing for us, we should only ever see
+       * vectorized operations with a single channel.
+       */
+      nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
+      assert(util_bitcount(write_mask) == 1);
+      channel = ffs(write_mask) - 1;
+
+      result = offset(result, bld, channel);
+   }
+
+   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+      assert(nir_op_infos[instr->op].input_sizes[i] < 2);
+      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
+   }
+
+   return result;
+}
+
+/* Flush abs/negate source modifiers into a fresh VGRF via a MOV so a
+ * later consumer sees an unmodified source.  Returns src untouched when
+ * it carries no modifiers.
+ */
+static fs_reg
+resolve_source_modifiers(const fs_builder &bld, const fs_reg &src)
+{
+   if (!src.abs && !src.negate)
+      return src;
+
+   fs_reg temp = bld.vgrf(src.type);
+   bld.MOV(temp, src);
+
+   return temp;
+}
+
+/* For a two-source logical op, fold any inot source into the negate
+ * (1's complement) source modifier; otherwise resolve existing
+ * modifiers so the logical op sees plain sources.
+ */
+static void
+resolve_inot_sources(nir_to_brw_state &ntb, const fs_builder &bld, nir_alu_instr *instr,
+                     fs_reg *op)
+{
+   for (unsigned i = 0; i < 2; i++) {
+      nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
+
+      if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
+         /* The source of the inot is now the source of instr. */
+         prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op[i], false);
+
+         assert(!op[i].negate);
+         op[i].negate = true;
+      } else {
+         op[i] = resolve_source_modifiers(bld, op[i]);
+      }
+   }
+}
+
+/* Emit b2f/b2i(inot(a)) as a single ADD (float(1 + a), valid since a is
+ * 0 or -1).  Returns false when the pattern or device doesn't apply.
+ */
+static bool
+try_emit_b2fi_of_inot(nir_to_brw_state &ntb, const fs_builder &bld,
+                      fs_reg result,
+                      nir_alu_instr *instr)
+{
+   const intel_device_info *devinfo = bld.shader->devinfo;
+
+   if (devinfo->ver < 6 || devinfo->verx10 >= 125)
+      return false;
+
+   nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
+
+   if (inot_instr == NULL || inot_instr->op != nir_op_inot)
+      return false;
+
+   /* HF is also possible as a destination on BDW+.  For nir_op_b2i, the set
+    * of valid size-changing combinations is a bit more complex.
+    *
+    * The source restriction is just because I was lazy about generating the
+    * constant below.
+    */
+   if (instr->def.bit_size != 32 ||
+       nir_src_bit_size(inot_instr->src[0].src) != 32)
+      return false;
+
+   /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0.  Since a can only be 0 or -1,
+    * this is float(1 + a).
+    */
+   fs_reg op;
+
+   prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op, false);
+
+   /* Ignore the saturate modifier, if there is one.  The result of the
+    * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
+    */
+   bld.ADD(result, op, brw_imm_d(1));
+
+   return true;
+}
+
+/**
+ * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
+ *
+ * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
+ * the source of \c instr that is a \c nir_op_fsign.
+ */
+static void
+emit_fsign(nir_to_brw_state &ntb, const fs_builder &bld, const nir_alu_instr *instr,
+           fs_reg result, fs_reg *op, unsigned fsign_src)
+{
+   fs_visitor &s = ntb.s;
+   const intel_device_info *devinfo = ntb.devinfo;
+
+   fs_inst *inst;
+
+   assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
+   assert(fsign_src < nir_op_infos[instr->op].num_inputs);
+
+   if (instr->op != nir_op_fsign) {
+      const nir_alu_instr *const fsign_instr =
+         nir_src_as_alu_instr(instr->src[fsign_src].src);
+
+      /* op[fsign_src] has the nominal result of the fsign, and op[1 -
+       * fsign_src] has the other multiply source.  This must be rearranged so
+       * that op[0] is the source of the fsign op[1] is the other multiply
+       * source.
+       */
+      if (fsign_src != 0)
+         op[1] = op[0];
+
+      op[0] = get_nir_src(ntb, fsign_instr->src[0].src);
+
+      const nir_alu_type t =
+         (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
+                        nir_src_bit_size(fsign_instr->src[0].src));
+
+      op[0].type = brw_type_for_nir_type(devinfo, t);
+
+      unsigned channel = 0;
+      if (nir_op_infos[instr->op].output_size == 0) {
+         /* Since NIR is doing the scalarizing for us, we should only ever see
+          * vectorized operations with a single channel.
+          */
+         nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
+         assert(util_bitcount(write_mask) == 1);
+         channel = ffs(write_mask) - 1;
+      }
+
+      op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
+   }
+
+   if (type_sz(op[0].type) == 2) {
+      /* AND(val, 0x8000) gives the sign bit.
+       *
+       * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
+       */
+      fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
+      bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
+
+      op[0].type = BRW_REGISTER_TYPE_UW;
+      result.type = BRW_REGISTER_TYPE_UW;
+      bld.AND(result, op[0], brw_imm_uw(0x8000u));
+
+      if (instr->op == nir_op_fsign)
+         inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
+      else {
+         /* Use XOR here to get the result sign correct. */
+         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW));
+      }
+
+      inst->predicate = BRW_PREDICATE_NORMAL;
+   } else if (type_sz(op[0].type) == 4) {
+      /* AND(val, 0x80000000) gives the sign bit.
+       *
+       * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
+       * zero.
+       */
+      bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
+
+      op[0].type = BRW_REGISTER_TYPE_UD;
+      result.type = BRW_REGISTER_TYPE_UD;
+      bld.AND(result, op[0], brw_imm_ud(0x80000000u));
+
+      if (instr->op == nir_op_fsign)
+         inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));
+      else {
+         /* Use XOR here to get the result sign correct. */
+         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD));
+      }
+
+      inst->predicate = BRW_PREDICATE_NORMAL;
+   } else {
+      /* For doubles we do the same but we need to consider:
+       *
+       * - 2-src instructions can't operate with 64-bit immediates
+       * - The sign is encoded in the high 32-bit of each DF
+       * - We need to produce a DF result.
+       */
+
+      fs_reg zero = s.vgrf(glsl_double_type());
+      bld.MOV(zero, setup_imm_df(bld, 0.0));
+      bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
+
+      bld.MOV(result, zero);
+
+      fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
+      bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
+              brw_imm_ud(0x80000000u));
+
+      if (instr->op == nir_op_fsign) {
+         set_predicate(BRW_PREDICATE_NORMAL,
+                       bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
+      } else {
+         if (devinfo->has_64bit_int) {
+            /* This could be done better in some cases.  If the scale is an
+             * immediate with the low 32-bits all 0, emitting a separate XOR and
+             * OR would allow an algebraic optimization to remove the OR.  There
+             * are currently zero instances of fsign(double(x))*IMM in shader-db
+             * or any test suite, so it is hard to care at this time.
+             */
+            fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
+            inst = bld.XOR(result_int64, result_int64,
+                           retype(op[1], BRW_REGISTER_TYPE_UQ));
+         } else {
+            /* No 64-bit integer support: XOR only the high dword, copy
+             * the low dword through.
+             */
+            fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
+            bld.MOV(subscript(result_int64, BRW_REGISTER_TYPE_UD, 0),
+                    subscript(op[1], BRW_REGISTER_TYPE_UD, 0));
+            bld.XOR(subscript(result_int64, BRW_REGISTER_TYPE_UD, 1),
+                    subscript(result_int64, BRW_REGISTER_TYPE_UD, 1),
+                    subscript(op[1], BRW_REGISTER_TYPE_UD, 1));
+         }
+      }
+   }
+}
+
+/**
+ * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
+ *
+ * Checks the operands of a \c nir_op_fmul to determine whether or not
+ * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
+ *
+ * \param instr  The multiplication instruction
+ *
+ * \param fsign_src  The source of \c instr that may or may not be a
+ *                   \c nir_op_fsign
+ */
+static bool
+can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
+{
+   assert(instr->op == nir_op_fmul);
+
+   nir_alu_instr *const fsign_instr =
+      nir_src_as_alu_instr(instr->src[fsign_src].src);
+
+   /* Rules:
+    *
+    * 1. instr->src[fsign_src] must be a nir_op_fsign.
+    * 2.
The nir_op_fsign can only be used by this multiplication. + * 3. The source that is the nir_op_fsign does not have source modifiers. + * \c emit_fsign only examines the source modifiers of the source of the + * \c nir_op_fsign. + * + * The nir_op_fsign must also not have the saturate modifier, but steps + * have already been taken (in nir_opt_algebraic) to ensure that. + */ + return fsign_instr != NULL && fsign_instr->op == nir_op_fsign && + is_used_once(fsign_instr); +} + +static bool +is_const_zero(const nir_src &src) +{ + return nir_src_is_const(src) && nir_src_as_int(src) == 0; +} + +static void +fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr, + bool need_dest) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + fs_inst *inst; + unsigned execution_mode = + bld.shader->nir->info.float_controls_execution_mode; + + fs_reg op[NIR_MAX_VEC_COMPONENTS]; + fs_reg result = prepare_alu_destination_and_sources(ntb, bld, instr, op, need_dest); + +#ifndef NDEBUG + /* Everything except raw moves, some type conversions, iabs, and ineg + * should have 8-bit sources lowered by nir_lower_bit_size in + * brw_preprocess_nir or by brw_nir_lower_conversions in + * brw_postprocess_nir. 
+ */ + switch (instr->op) { + case nir_op_mov: + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + case nir_op_vec8: + case nir_op_vec16: + case nir_op_i2f16: + case nir_op_i2f32: + case nir_op_i2i16: + case nir_op_i2i32: + case nir_op_u2f16: + case nir_op_u2f32: + case nir_op_u2u16: + case nir_op_u2u32: + case nir_op_iabs: + case nir_op_ineg: + case nir_op_pack_32_4x8_split: + break; + + default: + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + assert(type_sz(op[i].type) > 1); + } + } +#endif + + switch (instr->op) { + case nir_op_mov: + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + case nir_op_vec8: + case nir_op_vec16: { + fs_reg temp = result; + bool need_extra_copy = false; + + nir_intrinsic_instr *store_reg = + nir_store_reg_for_def(&instr->def); + if (store_reg != NULL) { + nir_def *dest_reg = store_reg->src[1].ssa; + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + nir_intrinsic_instr *load_reg = + nir_load_reg_for_def(instr->src[i].src.ssa); + if (load_reg == NULL) + continue; + + if (load_reg->src[0].ssa == dest_reg) { + need_extra_copy = true; + temp = bld.vgrf(result.type, 4); + break; + } + } + } + + nir_component_mask_t write_mask = get_nir_write_mask(instr->def); + unsigned last_bit = util_last_bit(write_mask); + + for (unsigned i = 0; i < last_bit; i++) { + if (!(write_mask & (1 << i))) + continue; + + if (instr->op == nir_op_mov) { + bld.MOV(offset(temp, bld, i), + offset(op[0], bld, instr->src[0].swizzle[i])); + } else { + bld.MOV(offset(temp, bld, i), + offset(op[i], bld, instr->src[i].swizzle[0])); + } + } + + /* In this case the source and destination registers were the same, + * so we need to insert an extra set of moves in order to deal with + * any swizzling. 
+ */ + if (need_extra_copy) { + for (unsigned i = 0; i < last_bit; i++) { + if (!(write_mask & (1 << i))) + continue; + + bld.MOV(offset(result, bld, i), offset(temp, bld, i)); + } + } + return; + } + + case nir_op_i2f32: + case nir_op_u2f32: + if (optimize_extract_to_float(ntb, instr, result)) + return; + inst = bld.MOV(result, op[0]); + break; + + case nir_op_f2f16_rtne: + case nir_op_f2f16_rtz: + case nir_op_f2f16: { + brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED; + + if (nir_op_f2f16 == instr->op) + rnd = brw_rnd_mode_from_execution_mode(execution_mode); + else + rnd = brw_rnd_mode_from_nir_op(instr->op); + + if (BRW_RND_MODE_UNSPECIFIED != rnd) + bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd)); + + assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ + inst = bld.F32TO16(result, op[0]); + break; + } + + case nir_op_b2i8: + case nir_op_b2i16: + case nir_op_b2i32: + case nir_op_b2i64: + case nir_op_b2f16: + case nir_op_b2f32: + case nir_op_b2f64: + if (try_emit_b2fi_of_inot(ntb, bld, result, instr)) + break; + op[0].type = BRW_REGISTER_TYPE_D; + op[0].negate = !op[0].negate; + FALLTHROUGH; + case nir_op_i2f64: + case nir_op_i2i64: + case nir_op_u2f64: + case nir_op_u2u64: + case nir_op_f2f64: + case nir_op_f2i64: + case nir_op_f2u64: + case nir_op_i2i32: + case nir_op_u2u32: + case nir_op_f2i32: + case nir_op_f2u32: + case nir_op_i2f16: + case nir_op_u2f16: + case nir_op_f2i16: + case nir_op_f2u16: + case nir_op_f2i8: + case nir_op_f2u8: + if (result.type == BRW_REGISTER_TYPE_B || + result.type == BRW_REGISTER_TYPE_UB || + result.type == BRW_REGISTER_TYPE_HF) + assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ + + if (op[0].type == BRW_REGISTER_TYPE_B || + op[0].type == BRW_REGISTER_TYPE_UB || + op[0].type == BRW_REGISTER_TYPE_HF) + assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */ + + inst = bld.MOV(result, op[0]); + break; + + case nir_op_i2i8: + case nir_op_u2u8: + 
assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ + FALLTHROUGH; + case nir_op_i2i16: + case nir_op_u2u16: { + /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns. + * Emitting the instructions one by one results in two MOV instructions + * that won't be propagated. By handling both instructions here, a + * single MOV is emitted. + */ + nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src); + if (extract_instr != NULL) { + if (extract_instr->op == nir_op_extract_u8 || + extract_instr->op == nir_op_extract_i8) { + prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false); + + const unsigned byte = nir_src_as_uint(extract_instr->src[1].src); + const brw_reg_type type = + brw_int_type(1, extract_instr->op == nir_op_extract_i8); + + op[0] = subscript(op[0], type, byte); + } else if (extract_instr->op == nir_op_extract_u16 || + extract_instr->op == nir_op_extract_i16) { + prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false); + + const unsigned word = nir_src_as_uint(extract_instr->src[1].src); + const brw_reg_type type = + brw_int_type(2, extract_instr->op == nir_op_extract_i16); + + op[0] = subscript(op[0], type, word); + } + } + + inst = bld.MOV(result, op[0]); + break; + } + + case nir_op_fsat: + inst = bld.MOV(result, op[0]); + inst->saturate = true; + break; + + case nir_op_fneg: + case nir_op_ineg: + op[0].negate = true; + inst = bld.MOV(result, op[0]); + break; + + case nir_op_fabs: + case nir_op_iabs: + op[0].negate = false; + op[0].abs = true; + inst = bld.MOV(result, op[0]); + break; + + case nir_op_f2f32: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + + if (op[0].type == BRW_REGISTER_TYPE_HF) + assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */ + + inst = bld.MOV(result, op[0]); + break; + + 
case nir_op_fsign: + emit_fsign(ntb, bld, instr, result, op, 0); + break; + + case nir_op_frcp: + inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]); + break; + + case nir_op_fexp2: + inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]); + break; + + case nir_op_flog2: + inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]); + break; + + case nir_op_fsin: + inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]); + break; + + case nir_op_fcos: + inst = bld.emit(SHADER_OPCODE_COS, result, op[0]); + break; + + case nir_op_fddx_fine: + inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]); + break; + case nir_op_fddx: + case nir_op_fddx_coarse: + inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]); + break; + case nir_op_fddy_fine: + inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]); + break; + case nir_op_fddy: + case nir_op_fddy_coarse: + inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]); + break; + + case nir_op_fadd: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + FALLTHROUGH; + case nir_op_iadd: + inst = bld.ADD(result, op[0], op[1]); + break; + + case nir_op_iadd3: + inst = bld.ADD3(result, op[0], op[1], op[2]); + break; + + case nir_op_iadd_sat: + case nir_op_uadd_sat: + inst = bld.ADD(result, op[0], op[1]); + inst->saturate = true; + break; + + case nir_op_isub_sat: + bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]); + break; + + case nir_op_usub_sat: + bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]); + break; + + case nir_op_irhadd: + case nir_op_urhadd: + assert(instr->def.bit_size < 64); + inst = bld.AVG(result, op[0], op[1]); + break; + + case nir_op_ihadd: + case nir_op_uhadd: { + assert(instr->def.bit_size < 64); + fs_reg tmp = bld.vgrf(result.type); + + if (devinfo->ver >= 8) { + op[0] = resolve_source_modifiers(bld, op[0]); + op[1] = resolve_source_modifiers(bld, op[1]); + } + + 
/* AVG(x, y) - ((x ^ y) & 1) */ + bld.XOR(tmp, op[0], op[1]); + bld.AND(tmp, tmp, retype(brw_imm_ud(1), result.type)); + bld.AVG(result, op[0], op[1]); + inst = bld.ADD(result, result, tmp); + inst->src[1].negate = true; + break; + } + + case nir_op_fmul: + for (unsigned i = 0; i < 2; i++) { + if (can_fuse_fmul_fsign(instr, i)) { + emit_fsign(ntb, bld, instr, result, op, i); + return; + } + } + + /* We emit the rounding mode after the previous fsign optimization since + * it won't result in a MUL, but will try to negate the value by other + * means. + */ + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + + inst = bld.MUL(result, op[0], op[1]); + break; + + case nir_op_imul_2x32_64: + case nir_op_umul_2x32_64: + bld.MUL(result, op[0], op[1]); + break; + + case nir_op_imul_32x16: + case nir_op_umul_32x16: { + const bool ud = instr->op == nir_op_umul_32x16; + const enum brw_reg_type word_type = + ud ? BRW_REGISTER_TYPE_UW : BRW_REGISTER_TYPE_W; + const enum brw_reg_type dword_type = + ud ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; + + assert(instr->def.bit_size == 32); + + /* Before copy propagation there are no immediate values. 
*/ + assert(op[0].file != IMM && op[1].file != IMM); + + op[1] = subscript(op[1], word_type, 0); + + if (devinfo->ver >= 7) + bld.MUL(result, retype(op[0], dword_type), op[1]); + else + bld.MUL(result, op[1], retype(op[0], dword_type)); + + break; + } + + case nir_op_imul: + assert(instr->def.bit_size < 64); + bld.MUL(result, op[0], op[1]); + break; + + case nir_op_imul_high: + case nir_op_umul_high: + assert(instr->def.bit_size < 64); + if (instr->def.bit_size == 32) { + bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]); + } else { + fs_reg tmp = bld.vgrf(brw_reg_type_from_bit_size(32, op[0].type)); + bld.MUL(tmp, op[0], op[1]); + bld.MOV(result, subscript(tmp, result.type, 1)); + } + break; + + case nir_op_idiv: + case nir_op_udiv: + assert(instr->def.bit_size < 64); + bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]); + break; + + case nir_op_uadd_carry: + unreachable("Should have been lowered by carry_to_arith()."); + + case nir_op_usub_borrow: + unreachable("Should have been lowered by borrow_to_arith()."); + + case nir_op_umod: + case nir_op_irem: + /* According to the sign table for INT DIV in the Ivy Bridge PRM, it + * appears that our hardware just does the right thing for signed + * remainder. + */ + assert(instr->def.bit_size < 64); + bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); + break; + + case nir_op_imod: { + /* Get a regular C-style remainder. If a % b == 0, set the predicate. */ + bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); + + /* Math instructions don't support conditional mod */ + inst = bld.MOV(bld.null_reg_d(), result); + inst->conditional_mod = BRW_CONDITIONAL_NZ; + + /* Now, we need to determine if signs of the sources are different. + * When we XOR the sources, the top bit is 0 if they are the same and 1 + * if they are different. We can then use a conditional modifier to + * turn that into a predicate. This leads us to an XOR.l instruction. 
+ * + * Technically, according to the PRM, you're not allowed to use .l on a + * XOR instruction. However, empirical experiments and Curro's reading + * of the simulator source both indicate that it's safe. + */ + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D); + inst = bld.XOR(tmp, op[0], op[1]); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->conditional_mod = BRW_CONDITIONAL_L; + + /* If the result of the initial remainder operation is non-zero and the + * two sources have different signs, add in a copy of op[1] to get the + * final integer modulus value. + */ + inst = bld.ADD(result, result, op[1]); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + } + + case nir_op_flt32: + case nir_op_fge32: + case nir_op_feq32: + case nir_op_fneu32: { + fs_reg dest = result; + + const uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + if (bit_size != 32) { + dest = bld.vgrf(op[0].type, 1); + bld.UNDEF(dest); + } + + bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op)); + + if (bit_size > 32) { + bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0)); + } else if(bit_size < 32) { + /* When we convert the result to 32-bit we need to be careful and do + * it as a signed conversion to get sign extension (for 32-bit true) + */ + const brw_reg_type src_type = + brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D); + + bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type)); + } + break; + } + + case nir_op_ilt32: + case nir_op_ult32: + case nir_op_ige32: + case nir_op_uge32: + case nir_op_ieq32: + case nir_op_ine32: { + fs_reg dest = result; + + const uint32_t bit_size = type_sz(op[0].type) * 8; + if (bit_size != 32) { + dest = bld.vgrf(op[0].type, 1); + bld.UNDEF(dest); + } + + bld.CMP(dest, op[0], op[1], + brw_cmod_for_nir_comparison(instr->op)); + + if (bit_size > 32) { + bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0)); + } else if (bit_size < 32) { + /* When we convert the result to 32-bit we need to be careful and do + * 
it as a signed conversion to get sign extension (for 32-bit true) + */ + const brw_reg_type src_type = + brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D); + + bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type)); + } + break; + } + + case nir_op_inot: + if (devinfo->ver >= 8) { + nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src); + + if (inot_src_instr != NULL && + (inot_src_instr->op == nir_op_ior || + inot_src_instr->op == nir_op_ixor || + inot_src_instr->op == nir_op_iand)) { + /* The sources of the source logical instruction are now the + * sources of the instruction that will be generated. + */ + prepare_alu_destination_and_sources(ntb, bld, inot_src_instr, op, false); + resolve_inot_sources(ntb, bld, inot_src_instr, op); + + /* Smash all of the sources and destination to be signed. This + * doesn't matter for the operation of the instruction, but cmod + * propagation fails on unsigned sources with negation (due to + * fs_inst::can_do_cmod returning false). + */ + result.type = + brw_type_for_nir_type(devinfo, + (nir_alu_type)(nir_type_int | + instr->def.bit_size)); + op[0].type = + brw_type_for_nir_type(devinfo, + (nir_alu_type)(nir_type_int | + nir_src_bit_size(inot_src_instr->src[0].src))); + op[1].type = + brw_type_for_nir_type(devinfo, + (nir_alu_type)(nir_type_int | + nir_src_bit_size(inot_src_instr->src[1].src))); + + /* For XOR, only invert one of the sources. Arbitrarily choose + * the first source. 
+ */ + op[0].negate = !op[0].negate; + if (inot_src_instr->op != nir_op_ixor) + op[1].negate = !op[1].negate; + + switch (inot_src_instr->op) { + case nir_op_ior: + bld.AND(result, op[0], op[1]); + return; + + case nir_op_iand: + bld.OR(result, op[0], op[1]); + return; + + case nir_op_ixor: + bld.XOR(result, op[0], op[1]); + return; + + default: + unreachable("impossible opcode"); + } + } + op[0] = resolve_source_modifiers(bld, op[0]); + } + bld.NOT(result, op[0]); + break; + case nir_op_ixor: + if (devinfo->ver >= 8) { + resolve_inot_sources(ntb, bld, instr, op); + } + bld.XOR(result, op[0], op[1]); + break; + case nir_op_ior: + if (devinfo->ver >= 8) { + resolve_inot_sources(ntb, bld, instr, op); + } + bld.OR(result, op[0], op[1]); + break; + case nir_op_iand: + if (devinfo->ver >= 8) { + resolve_inot_sources(ntb, bld, instr, op); + } + bld.AND(result, op[0], op[1]); + break; + + case nir_op_fdot2: + case nir_op_fdot3: + case nir_op_fdot4: + case nir_op_b32all_fequal2: + case nir_op_b32all_iequal2: + case nir_op_b32all_fequal3: + case nir_op_b32all_iequal3: + case nir_op_b32all_fequal4: + case nir_op_b32all_iequal4: + case nir_op_b32any_fnequal2: + case nir_op_b32any_inequal2: + case nir_op_b32any_fnequal3: + case nir_op_b32any_inequal3: + case nir_op_b32any_fnequal4: + case nir_op_b32any_inequal4: + unreachable("Lowered by nir_lower_alu_reductions"); + + case nir_op_ldexp: + unreachable("not reached: should be handled by ldexp_to_arith()"); + + case nir_op_fsqrt: + inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]); + break; + + case nir_op_frsq: + inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]); + break; + + case nir_op_ftrunc: + inst = bld.RNDZ(result, op[0]); + if (devinfo->ver < 6) { + set_condmod(BRW_CONDITIONAL_R, inst); + set_predicate(BRW_PREDICATE_NORMAL, + bld.ADD(result, result, brw_imm_f(1.0f))); + inst = bld.MOV(result, result); /* for potential saturation */ + } + break; + + case nir_op_fceil: { + op[0].negate = !op[0].negate; + fs_reg temp = 
s.vgrf(glsl_float_type()); + bld.RNDD(temp, op[0]); + temp.negate = true; + inst = bld.MOV(result, temp); + break; + } + case nir_op_ffloor: + inst = bld.RNDD(result, op[0]); + break; + case nir_op_ffract: + inst = bld.FRC(result, op[0]); + break; + case nir_op_fround_even: + inst = bld.RNDE(result, op[0]); + if (devinfo->ver < 6) { + set_condmod(BRW_CONDITIONAL_R, inst); + set_predicate(BRW_PREDICATE_NORMAL, + bld.ADD(result, result, brw_imm_f(1.0f))); + inst = bld.MOV(result, result); /* for potential saturation */ + } + break; + + case nir_op_fquantize2f16: { + fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D); + fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F); + fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F); + + /* The destination stride must be at least as big as the source stride. */ + tmp16 = subscript(tmp16, BRW_REGISTER_TYPE_HF, 0); + + /* Check for denormal */ + fs_reg abs_src0 = op[0]; + abs_src0.abs = true; + bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)), + BRW_CONDITIONAL_L); + /* Get the appropriately signed zero */ + bld.AND(retype(zero, BRW_REGISTER_TYPE_UD), + retype(op[0], BRW_REGISTER_TYPE_UD), + brw_imm_ud(0x80000000)); + /* Do the actual F32 -> F16 -> F32 conversion */ + bld.F32TO16(tmp16, op[0]); + bld.F16TO32(tmp32, tmp16); + /* Select that or zero based on normal status */ + inst = bld.SEL(result, zero, tmp32); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + } + + case nir_op_imin: + case nir_op_umin: + case nir_op_fmin: + inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L); + break; + + case nir_op_imax: + case nir_op_umax: + case nir_op_fmax: + inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE); + break; + + case nir_op_pack_snorm_2x16: + case nir_op_pack_snorm_4x8: + case nir_op_pack_unorm_2x16: + case nir_op_pack_unorm_4x8: + case nir_op_unpack_snorm_2x16: + case nir_op_unpack_snorm_4x8: + case nir_op_unpack_unorm_2x16: + case nir_op_unpack_unorm_4x8: + case nir_op_unpack_half_2x16: + case 
nir_op_pack_half_2x16: + unreachable("not reached: should be handled by lower_packing_builtins"); + + case nir_op_unpack_half_2x16_split_x_flush_to_zero: + assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode); + FALLTHROUGH; + case nir_op_unpack_half_2x16_split_x: + inst = bld.F16TO32(result, subscript(op[0], BRW_REGISTER_TYPE_HF, 0)); + break; + + case nir_op_unpack_half_2x16_split_y_flush_to_zero: + assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode); + FALLTHROUGH; + case nir_op_unpack_half_2x16_split_y: + inst = bld.F16TO32(result, subscript(op[0], BRW_REGISTER_TYPE_HF, 1)); + break; + + case nir_op_pack_64_2x32_split: + case nir_op_pack_32_2x16_split: + bld.emit(FS_OPCODE_PACK, result, op[0], op[1]); + break; + + case nir_op_pack_32_4x8_split: + bld.emit(FS_OPCODE_PACK, result, op, 4); + break; + + case nir_op_unpack_64_2x32_split_x: + case nir_op_unpack_64_2x32_split_y: { + if (instr->op == nir_op_unpack_64_2x32_split_x) + bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0)); + else + bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1)); + break; + } + + case nir_op_unpack_32_2x16_split_x: + case nir_op_unpack_32_2x16_split_y: { + if (instr->op == nir_op_unpack_32_2x16_split_x) + bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0)); + else + bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1)); + break; + } + + case nir_op_fpow: + inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]); + break; + + case nir_op_bitfield_reverse: + assert(instr->def.bit_size == 32); + assert(nir_src_bit_size(instr->src[0].src) == 32); + bld.BFREV(result, op[0]); + break; + + case nir_op_bit_count: + assert(instr->def.bit_size == 32); + assert(nir_src_bit_size(instr->src[0].src) < 64); + bld.CBIT(result, op[0]); + break; + + case nir_op_uclz: + assert(instr->def.bit_size == 32); + assert(nir_src_bit_size(instr->src[0].src) == 32); + bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), op[0]); + break; + + case nir_op_ifind_msb: { + 
assert(instr->def.bit_size == 32); + assert(nir_src_bit_size(instr->src[0].src) == 32); + assert(devinfo->ver >= 7); + + bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]); + + /* FBH counts from the MSB side, while GLSL's findMSB() wants the count + * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then + * subtract the result from 31 to convert the MSB count into an LSB + * count. + */ + bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ); + + inst = bld.ADD(result, result, brw_imm_d(31)); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->src[0].negate = true; + break; + } + + case nir_op_find_lsb: + assert(instr->def.bit_size == 32); + assert(nir_src_bit_size(instr->src[0].src) == 32); + assert(devinfo->ver >= 7); + bld.FBL(result, op[0]); + break; + + case nir_op_ubitfield_extract: + case nir_op_ibitfield_extract: + unreachable("should have been lowered"); + case nir_op_ubfe: + case nir_op_ibfe: + assert(instr->def.bit_size < 64); + bld.BFE(result, op[2], op[1], op[0]); + break; + case nir_op_bfm: + assert(instr->def.bit_size < 64); + bld.BFI1(result, op[0], op[1]); + break; + case nir_op_bfi: + assert(instr->def.bit_size < 64); + + /* bfi is ((...) | (~src0 & src2)). The second part is zero when src2 is + * either 0 or src0. Replacing the 0 with another value can eliminate a + * temporary register. + */ + if (is_const_zero(instr->src[2].src)) + bld.BFI2(result, op[0], op[1], op[0]); + else + bld.BFI2(result, op[0], op[1], op[2]); + + break; + + case nir_op_bitfield_insert: + unreachable("not reached: should have been lowered"); + + /* With regards to implicit masking of the shift counts for 8- and 16-bit + * types, the PRMs are **incorrect**. They falsely state that on Gen9+ only + * the low bits of src1 matching the size of src0 (e.g., 4-bits for W or UW + * src0) are used. The Bspec (backed by data from experimentation) state + * that 0x3f is used for Q and UQ types, and 0x1f is used for **all** other + * types. 
+ * + * The match the behavior expected for the NIR opcodes, explicit masks for + * 8- and 16-bit types must be added. + */ + case nir_op_ishl: + if (instr->def.bit_size < 32) { + bld.AND(result, op[1], brw_imm_ud(instr->def.bit_size - 1)); + bld.SHL(result, op[0], result); + } else { + bld.SHL(result, op[0], op[1]); + } + + break; + case nir_op_ishr: + if (instr->def.bit_size < 32) { + bld.AND(result, op[1], brw_imm_ud(instr->def.bit_size - 1)); + bld.ASR(result, op[0], result); + } else { + bld.ASR(result, op[0], op[1]); + } + + break; + case nir_op_ushr: + if (instr->def.bit_size < 32) { + bld.AND(result, op[1], brw_imm_ud(instr->def.bit_size - 1)); + bld.SHR(result, op[0], result); + } else { + bld.SHR(result, op[0], op[1]); + } + + break; + + case nir_op_urol: + bld.ROL(result, op[0], op[1]); + break; + case nir_op_uror: + bld.ROR(result, op[0], op[1]); + break; + + case nir_op_pack_half_2x16_split: + bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]); + break; + + case nir_op_sdot_4x8_iadd: + case nir_op_sdot_4x8_iadd_sat: + inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_D), + retype(op[2], BRW_REGISTER_TYPE_D), + retype(op[0], BRW_REGISTER_TYPE_D), + retype(op[1], BRW_REGISTER_TYPE_D)); + + if (instr->op == nir_op_sdot_4x8_iadd_sat) + inst->saturate = true; + break; + + case nir_op_udot_4x8_uadd: + case nir_op_udot_4x8_uadd_sat: + inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_UD), + retype(op[2], BRW_REGISTER_TYPE_UD), + retype(op[0], BRW_REGISTER_TYPE_UD), + retype(op[1], BRW_REGISTER_TYPE_UD)); + + if (instr->op == nir_op_udot_4x8_uadd_sat) + inst->saturate = true; + break; + + case nir_op_sudot_4x8_iadd: + case nir_op_sudot_4x8_iadd_sat: + inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_D), + retype(op[2], BRW_REGISTER_TYPE_D), + retype(op[0], BRW_REGISTER_TYPE_D), + retype(op[1], BRW_REGISTER_TYPE_UD)); + + if (instr->op == nir_op_sudot_4x8_iadd_sat) + inst->saturate = true; + break; + + case nir_op_ffma: + if 
(nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + + inst = bld.MAD(result, op[2], op[1], op[0]); + break; + + case nir_op_flrp: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + + inst = bld.LRP(result, op[0], op[1], op[2]); + break; + + case nir_op_b32csel: + if (optimize_frontfacing_ternary(ntb, instr, result)) + return; + + bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ); + inst = bld.SEL(result, op[1], op[2]); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + + case nir_op_extract_u8: + case nir_op_extract_i8: { + unsigned byte = nir_src_as_uint(instr->src[1].src); + + /* The PRMs say: + * + * BDW+ + * There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB. + * Use two instructions and a word or DWord intermediate integer type. + */ + if (instr->def.bit_size == 64) { + const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); + + if (instr->op == nir_op_extract_i8) { + /* If we need to sign extend, extract to a word first */ + fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W); + bld.MOV(w_temp, subscript(op[0], type, byte)); + bld.MOV(result, w_temp); + } else if (byte & 1) { + /* Extract the high byte from the word containing the desired byte + * offset. 
+ */ + bld.SHR(result, + subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), + brw_imm_uw(8)); + } else { + /* Otherwise use an AND with 0xff and a word type */ + bld.AND(result, + subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), + brw_imm_uw(0xff)); + } + } else { + const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); + bld.MOV(result, subscript(op[0], type, byte)); + } + break; + } + + case nir_op_extract_u16: + case nir_op_extract_i16: { + const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16); + unsigned word = nir_src_as_uint(instr->src[1].src); + bld.MOV(result, subscript(op[0], type, word)); + break; + } + + default: + unreachable("unhandled instruction"); + } + + /* If we need to do a boolean resolve, replace the result with -(x & 1) + * to sign extend the low bit to 0/~0 + */ + if (devinfo->ver <= 5 && + !result.is_null() && + (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) { + fs_reg masked = s.vgrf(glsl_int_type()); + bld.AND(masked, result, brw_imm_d(1)); + masked.negate = true; + bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked); + } +} + +static void +fs_nir_emit_load_const(nir_to_brw_state &ntb, + nir_load_const_instr *instr) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + + const brw_reg_type reg_type = + brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D); + fs_reg reg = bld.vgrf(reg_type, instr->def.num_components); + + switch (instr->def.bit_size) { + case 8: + for (unsigned i = 0; i < instr->def.num_components; i++) + bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8)); + break; + + case 16: + for (unsigned i = 0; i < instr->def.num_components; i++) + bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16)); + break; + + case 32: + for (unsigned i = 0; i < instr->def.num_components; i++) + bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32)); + break; + + case 64: + 
assert(devinfo->ver >= 7); + if (!devinfo->has_64bit_int) { + for (unsigned i = 0; i < instr->def.num_components; i++) { + bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF), + setup_imm_df(bld, instr->value[i].f64)); + } + } else { + for (unsigned i = 0; i < instr->def.num_components; i++) + bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64)); + } + break; + + default: + unreachable("Invalid bit size"); + } + + ntb.ssa_values[instr->def.index] = reg; +} + +static bool +get_nir_src_bindless(nir_to_brw_state &ntb, const nir_src &src) +{ + return ntb.ssa_bind_infos[src.ssa->index].bindless; +} + +static bool +is_resource_src(nir_src src) +{ + return src.ssa->parent_instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(src.ssa->parent_instr)->intrinsic == nir_intrinsic_resource_intel; +} + +static fs_reg +get_resource_nir_src(nir_to_brw_state &ntb, const nir_src &src) +{ + if (!is_resource_src(src)) + return fs_reg(); + return ntb.resource_values[src.ssa->index]; +} + +static fs_reg +get_nir_src(nir_to_brw_state &ntb, const nir_src &src) +{ + const intel_device_info *devinfo = ntb.devinfo; + + nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa); + + fs_reg reg; + if (!load_reg) { + if (nir_src_is_undef(src)) { + const brw_reg_type reg_type = + brw_reg_type_from_bit_size(src.ssa->bit_size, + BRW_REGISTER_TYPE_D); + reg = ntb.bld.vgrf(reg_type, src.ssa->num_components); + } else { + reg = ntb.ssa_values[src.ssa->index]; + } + } else { + nir_intrinsic_instr *decl_reg = nir_reg_get_decl(load_reg->src[0].ssa); + /* We don't handle indirects on locals */ + assert(nir_intrinsic_base(load_reg) == 0); + assert(load_reg->intrinsic != nir_intrinsic_load_reg_indirect); + reg = ntb.ssa_values[decl_reg->def.index]; + } + + if (nir_src_bit_size(src) == 64 && devinfo->ver == 7) { + /* The only 64-bit type available on gfx7 is DF, so use that. 
*/ + reg.type = BRW_REGISTER_TYPE_DF; + } else { + /* To avoid floating-point denorm flushing problems, set the type by + * default to an integer type - instructions that need floating point + * semantics will set this to F if they need to + */ + reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src), + BRW_REGISTER_TYPE_D); + } + + return reg; +} + +/** + * Return an IMM for constants; otherwise call get_nir_src() as normal. + * + * This function should not be called on any value which may be 64 bits. + * We could theoretically support 64-bit on gfx8+ but we choose not to + * because it wouldn't work in general (no gfx7 support) and there are + * enough restrictions in 64-bit immediates that you can't take the return + * value and treat it the same as the result of get_nir_src(). + */ +static fs_reg +get_nir_src_imm(nir_to_brw_state &ntb, const nir_src &src) +{ + assert(nir_src_bit_size(src) == 32); + return nir_src_is_const(src) ? + fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(ntb, src); +} + +static fs_reg +get_nir_def(nir_to_brw_state &ntb, const nir_def &def) +{ + const fs_builder &bld = ntb.bld; + + nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def); + if (!store_reg) { + const brw_reg_type reg_type = + brw_reg_type_from_bit_size(def.bit_size, + def.bit_size == 8 ? 
+ BRW_REGISTER_TYPE_D : + BRW_REGISTER_TYPE_F); + ntb.ssa_values[def.index] = + bld.vgrf(reg_type, def.num_components); + bld.UNDEF(ntb.ssa_values[def.index]); + return ntb.ssa_values[def.index]; + } else { + nir_intrinsic_instr *decl_reg = + nir_reg_get_decl(store_reg->src[1].ssa); + /* We don't handle indirects on locals */ + assert(nir_intrinsic_base(store_reg) == 0); + assert(store_reg->intrinsic != nir_intrinsic_store_reg_indirect); + return ntb.ssa_values[decl_reg->def.index]; + } +} + +static nir_component_mask_t +get_nir_write_mask(const nir_def &def) +{ + nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def); + if (!store_reg) { + return nir_component_mask(def.num_components); + } else { + return nir_intrinsic_write_mask(store_reg); + } +} + +static fs_inst * +emit_pixel_interpolater_send(const fs_builder &bld, + enum opcode opcode, + const fs_reg &dst, + const fs_reg &src, + const fs_reg &desc, + const fs_reg &flag_reg, + glsl_interp_mode interpolation) +{ + struct brw_wm_prog_data *wm_prog_data = + brw_wm_prog_data(bld.shader->stage_prog_data); + + fs_reg srcs[INTERP_NUM_SRCS]; + srcs[INTERP_SRC_OFFSET] = src; + srcs[INTERP_SRC_MSG_DESC] = desc; + srcs[INTERP_SRC_DYNAMIC_MODE] = flag_reg; + + fs_inst *inst = bld.emit(opcode, dst, srcs, INTERP_NUM_SRCS); + /* 2 floats per slot returned */ + inst->size_written = 2 * dst.component_size(inst->exec_size); + if (interpolation == INTERP_MODE_NOPERSPECTIVE) { + inst->pi_noperspective = true; + /* TGL BSpec says: + * This field cannot be set to "Linear Interpolation" + * unless Non-Perspective Barycentric Enable in 3DSTATE_CLIP is enabled" + */ + wm_prog_data->uses_nonperspective_interp_modes = true; + } + + wm_prog_data->pulls_bary = true; + + return inst; +} + +/** + * Computes 1 << x, given a D/UD register containing some value x. 
+ */ +static fs_reg +intexp2(const fs_builder &bld, const fs_reg &x) +{ + assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D); + + fs_reg result = bld.vgrf(x.type, 1); + fs_reg one = bld.vgrf(x.type, 1); + + bld.MOV(one, retype(brw_imm_d(1), one.type)); + bld.SHL(result, one, x); + return result; +} + +static void +emit_gs_end_primitive(nir_to_brw_state &ntb, const nir_src &vertex_count_nir_src) +{ + fs_visitor &s = ntb.s; + assert(s.stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data); + + if (s.gs_compile->control_data_header_size_bits == 0) + return; + + /* We can only do EndPrimitive() functionality when the control data + * consists of cut bits. Fortunately, the only time it isn't is when the + * output type is points, in which case EndPrimitive() is a no-op. + */ + if (gs_prog_data->control_data_format != + GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { + return; + } + + /* Cut bits use one bit per vertex. */ + assert(s.gs_compile->control_data_bits_per_vertex == 1); + + fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src); + vertex_count.type = BRW_REGISTER_TYPE_UD; + + /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting + * vertex n, 0 otherwise. So all we need to do here is mark bit + * (vertex_count - 1) % 32 in the cut_bits register to indicate that + * EndPrimitive() was called after emitting vertex (vertex_count - 1); + * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. + * + * Note that if EndPrimitive() is called before emitting any vertices, this + * will cause us to set bit 31 of the control_data_bits register to 1. + * That's fine because: + * + * - If max_vertices < 32, then vertex number 31 (zero-based) will never be + * output, so the hardware will ignore cut bit 31. 
+ * + * - If max_vertices == 32, then vertex number 31 is guaranteed to be the + * last vertex, so setting cut bit 31 has no effect (since the primitive + * is automatically ended when the GS terminates). + * + * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the + * control_data_bits register to 0 when the first vertex is emitted. + */ + + const fs_builder abld = ntb.bld.annotate("end primitive"); + + /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ + fs_reg prev_count = ntb.bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); + fs_reg mask = intexp2(abld, prev_count); + /* Note: we're relying on the fact that the GEN SHL instruction only pays + * attention to the lower 5 bits of its second source argument, so on this + * architecture, 1 << (vertex_count - 1) is equivalent to 1 << + * ((vertex_count - 1) % 32). + */ + abld.OR(s.control_data_bits, s.control_data_bits, mask); +} + +void +fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count) +{ + assert(stage == MESA_SHADER_GEOMETRY); + assert(gs_compile->control_data_bits_per_vertex != 0); + + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); + + const fs_builder bld = fs_builder(this).at_end(); + const fs_builder abld = bld.annotate("emit control data bits"); + const fs_builder fwa_bld = bld.exec_all(); + + /* We use a single UD register to accumulate control data bits (32 bits + * for each of the SIMD8 channels). So we need to write a DWord (32 bits) + * at a time. + * + * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets. + * We have select a 128-bit group via the Global and Per-Slot Offsets, then + * use the Channel Mask phase to enable/disable which DWord within that + * group to write. (Remember, different SIMD8 channels may have emitted + * different numbers of vertices, so we may need per-slot offsets.) 
 + * + * Channel masking presents an annoying problem: we may have to replicate + * the data up to 4 times: + * + * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data. + * + * To avoid penalizing shaders that emit a small number of vertices, we + * can avoid these sometimes: if the size of the control data header is + * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will + * land in the same 128-bit group, so we can skip per-slot offsets. + * + * Similarly, if the control data header is <= 32 bits, there is only one + * DWord, so we can skip channel masks. + */ + fs_reg channel_mask, per_slot_offset; + + if (gs_compile->control_data_header_size_bits > 32) + channel_mask = vgrf(glsl_uint_type()); + + if (gs_compile->control_data_header_size_bits > 128) + per_slot_offset = vgrf(glsl_uint_type()); + + /* Figure out which DWord we're trying to write to using the formula: + * + * dword_index = (vertex_count - 1) * bits_per_vertex / 32 + * + * Since bits_per_vertex is a power of two, and is known at compile + * time, this can be optimized to: + * + * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) + */ + if (channel_mask.file != BAD_FILE || per_slot_offset.file != BAD_FILE) { + fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); + unsigned log2_bits_per_vertex = + util_last_bit(gs_compile->control_data_bits_per_vertex); + abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex)); + + if (per_slot_offset.file != BAD_FILE) { + /* Set the per-slot offset to dword_index / 4, so that we'll write to + * the appropriate OWord within the control data header. + */ + abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u)); + } + + /* Set the channel masks to 1 << (dword_index % 4), so that we'll + * write to the appropriate DWORD within the OWORD. 
+ */ + fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fwa_bld.AND(channel, dword_index, brw_imm_ud(3u)); + channel_mask = intexp2(fwa_bld, channel); + /* Then the channel masks need to be in bits 23:16. */ + fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u)); + } + + /* If there are channel masks, add 3 extra copies of the data. */ + const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE); + fs_reg sources[4]; + + for (unsigned i = 0; i < ARRAY_SIZE(sources); i++) + sources[i] = this->control_data_bits; + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles; + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask; + srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_F, length); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length); + abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0); + + fs_inst *inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, + srcs, ARRAY_SIZE(srcs)); + + /* We need to increment Global Offset by 256-bits to make room for + * Broadwell's extra "Vertex Count" payload at the beginning of the + * URB entry. Since this is an OWord message, Global Offset is counted + * in 128-bit units, so we must set it to 2. + */ + if (gs_prog_data->static_vertex_count == -1) + inst->offset = 2; +} + +static void +set_gs_stream_control_data_bits(nir_to_brw_state &ntb, const fs_reg &vertex_count, + unsigned stream_id) +{ + fs_visitor &s = ntb.s; + + /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */ + + /* Note: we are calling this *before* increasing vertex_count, so + * this->vertex_count == vertex_count - 1 in the formula above. 
+ */ + + /* Stream mode uses 2 bits per vertex */ + assert(s.gs_compile->control_data_bits_per_vertex == 2); + + /* Must be a valid stream */ + assert(stream_id < 4); /* MAX_VERTEX_STREAMS */ + + /* Control data bits are initialized to 0 so we don't have to set any + * bits when sending vertices to stream 0. + */ + if (stream_id == 0) + return; + + const fs_builder abld = ntb.bld.annotate("set stream control data bits", NULL); + + /* reg::sid = stream_id */ + fs_reg sid = ntb.bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.MOV(sid, brw_imm_ud(stream_id)); + + /* reg:shift_count = 2 * (vertex_count - 1) */ + fs_reg shift_count = ntb.bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.SHL(shift_count, vertex_count, brw_imm_ud(1u)); + + /* Note: we're relying on the fact that the GEN SHL instruction only pays + * attention to the lower 5 bits of its second source argument, so on this + * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to + * stream_id << ((2 * (vertex_count - 1)) % 32). + */ + fs_reg mask = ntb.bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.SHL(mask, sid, shift_count); + abld.OR(s.control_data_bits, s.control_data_bits, mask); +} + +static void +emit_gs_vertex(nir_to_brw_state &ntb, const nir_src &vertex_count_nir_src, + unsigned stream_id) +{ + fs_visitor &s = ntb.s; + + assert(s.stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data); + + fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src); + vertex_count.type = BRW_REGISTER_TYPE_UD; + + /* Haswell and later hardware ignores the "Render Stream Select" bits + * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, + * and instead sends all primitives down the pipeline for rasterization. + * If the SOL stage is enabled, "Render Stream Select" is honored and + * primitives bound to non-zero streams are discarded after stream output. 
 + * + * Since the only purpose of primitives sent to non-zero streams is to + * be recorded by transform feedback, we can simply discard all geometry + * bound to these streams when transform feedback is disabled. + */ + if (stream_id > 0 && !s.nir->info.has_transform_feedback_varyings) + return; + + /* If we're outputting 32 control data bits or less, then we can wait + * until the shader is over to output them all. Otherwise we need to + * output them as we go. Now is the time to do it, since we're about to + * output the vertex_count'th vertex, so it's guaranteed that the + * control data bits associated with the (vertex_count - 1)th vertex are + * correct. + */ + if (s.gs_compile->control_data_header_size_bits > 32) { + const fs_builder abld = + ntb.bld.annotate("emit vertex: emit control data bits"); + + /* Only emit control data bits if we've finished accumulating a batch + * of 32 bits. This is the case when: + * + * (vertex_count * bits_per_vertex) % 32 == 0 + * + * (in other words, when the last 5 bits of vertex_count * + * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some + * integer n (which is always the case, since bits_per_vertex is + * always 1 or 2), this is equivalent to requiring that the last 5-n + * bits of vertex_count are 0: + * + * vertex_count & (2^(5-n) - 1) == 0 + * + * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is + * equivalent to: + * + * vertex_count & (32 / bits_per_vertex - 1) == 0 + * + * TODO: If vertex_count is an immediate, we could do some of this math + * at compile time... + */ + fs_inst *inst = + abld.AND(ntb.bld.null_reg_d(), vertex_count, + brw_imm_ud(32u / s.gs_compile->control_data_bits_per_vertex - 1u)); + inst->conditional_mod = BRW_CONDITIONAL_Z; + + abld.IF(BRW_PREDICATE_NORMAL); + /* If vertex_count is 0, then no control data bits have been + * accumulated yet, so we can skip emitting them. 
+ */ + abld.CMP(ntb.bld.null_reg_d(), vertex_count, brw_imm_ud(0u), + BRW_CONDITIONAL_NEQ); + abld.IF(BRW_PREDICATE_NORMAL); + s.emit_gs_control_data_bits(vertex_count); + abld.emit(BRW_OPCODE_ENDIF); + + /* Reset control_data_bits to 0 so we can start accumulating a new + * batch. + * + * Note: in the case where vertex_count == 0, this neutralizes the + * effect of any call to EndPrimitive() that the shader may have + * made before outputting its first vertex. + */ + inst = abld.MOV(s.control_data_bits, brw_imm_ud(0u)); + inst->force_writemask_all = true; + abld.emit(BRW_OPCODE_ENDIF); + } + + s.emit_urb_writes(vertex_count); + + /* In stream mode we have to set control data bits for all vertices + * unless we have disabled control data bits completely (which we do + * do for MESA_PRIM_POINTS outputs that don't use streams). + */ + if (s.gs_compile->control_data_header_size_bits > 0 && + gs_prog_data->control_data_format == + GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { + set_gs_stream_control_data_bits(ntb, vertex_count, stream_id); + } +} + +static void +emit_gs_input_load(nir_to_brw_state &ntb, const fs_reg &dst, + const nir_src &vertex_src, + unsigned base_offset, + const nir_src &offset_src, + unsigned num_components, + unsigned first_component) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + assert(type_sz(dst.type) == 4); + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data); + const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8; + + /* TODO: figure out push input layout for invocations == 1 */ + if (gs_prog_data->invocations == 1 && + nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) && + 4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) { + int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 + + nir_src_as_uint(vertex_src) * push_reg_count; + const fs_reg attr = fs_reg(ATTR, 0, dst.type); + for (unsigned i = 0; i < 
 num_components; i++) { + ntb.bld.MOV(offset(dst, bld, i), + offset(attr, bld, imm_offset + i + first_component)); + } + return; + } + + /* Resort to the pull model. Ensure the VUE handles are provided. */ + assert(gs_prog_data->base.include_vue_handles); + + fs_reg start = s.gs_payload().icp_handle_start; + fs_reg icp_handle = ntb.bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + if (gs_prog_data->invocations == 1) { + if (nir_src_is_const(vertex_src)) { + /* The vertex index is constant; just select the proper URB handle. */ + icp_handle = offset(start, ntb.bld, nir_src_as_uint(vertex_src)); + } else { + /* The vertex index is non-constant. We need to use indirect + * addressing to fetch the proper URB handle. + * + * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0> + * indicating that channel <n> should read the handle from + * DWord <n>. We convert that to bytes by multiplying by 4. + * + * Next, we convert the vertex index to bytes by multiplying + * by 32 (shifting by 5), and add the two together. This is + * the final indirect byte offset. + */ + fs_reg sequence = + ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; + fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */ + bld.SHL(channel_offsets, sequence, brw_imm_ud(2u)); + /* Convert vertex_index to bytes (multiply by 32) */ + bld.SHL(vertex_offset_bytes, + retype(get_nir_src(ntb, vertex_src), BRW_REGISTER_TYPE_UD), + brw_imm_ud(5u)); + bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets); + + /* Use first_icp_handle as the base offset. There is one register + * of URB handles per vertex, so inform the register allocator that + * we might read up to nir->info.gs.vertices_in registers. 
+ */ + bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start, + fs_reg(icp_offset_bytes), + brw_imm_ud(s.nir->info.gs.vertices_in * REG_SIZE)); + } + } else { + assert(gs_prog_data->invocations > 1); + + if (nir_src_is_const(vertex_src)) { + unsigned vertex = nir_src_as_uint(vertex_src); + assert(devinfo->ver >= 9 || vertex <= 5); + bld.MOV(icp_handle, component(start, vertex)); + } else { + /* The vertex index is non-constant. We need to use indirect + * addressing to fetch the proper URB handle. + * + */ + fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + /* Convert vertex_index to bytes (multiply by 4) */ + bld.SHL(icp_offset_bytes, + retype(get_nir_src(ntb, vertex_src), BRW_REGISTER_TYPE_UD), + brw_imm_ud(2u)); + + /* Use first_icp_handle as the base offset. There is one DWord + * of URB handles per vertex, so inform the register allocator that + * we might read up to ceil(nir->info.gs.vertices_in / 8) registers. + */ + bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start, + fs_reg(icp_offset_bytes), + brw_imm_ud(DIV_ROUND_UP(s.nir->info.gs.vertices_in, 8) * + REG_SIZE)); + } + } + + fs_inst *inst; + fs_reg indirect_offset = get_nir_src(ntb, offset_src); + + if (nir_src_is_const(offset_src)) { + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle; + + /* Constant indexing - use global offset. 
*/ + if (first_component != 0) { + unsigned read_components = num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs, + ARRAY_SIZE(srcs)); + inst->size_written = read_components * + tmp.component_size(inst->exec_size); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs, + ARRAY_SIZE(srcs)); + inst->size_written = num_components * + dst.component_size(inst->exec_size); + } + inst->offset = base_offset + nir_src_as_uint(offset_src); + } else { + /* Indirect indexing - use per-slot offsets as well. */ + unsigned read_components = num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle; + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; + + if (first_component != 0) { + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, + srcs, ARRAY_SIZE(srcs)); + inst->size_written = read_components * + tmp.component_size(inst->exec_size); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, + srcs, ARRAY_SIZE(srcs)); + inst->size_written = num_components * + dst.component_size(inst->exec_size); + } + inst->offset = base_offset; + } +} + +static fs_reg +get_indirect_offset(nir_to_brw_state &ntb, nir_intrinsic_instr *instr) +{ + nir_src *offset_src = nir_get_io_offset_src(instr); + + if (nir_src_is_const(*offset_src)) { + /* The only constant offset we should find is 0. brw_nir.c's + * add_const_offset_to_base() will fold other constant offsets + * into the "base" index. 
+ */ + assert(nir_src_as_uint(*offset_src) == 0); + return fs_reg(); + } + + return get_nir_src(ntb, *offset_src); +} + +static void +fs_nir_emit_vs_intrinsic(nir_to_brw_state &ntb, + nir_intrinsic_instr *instr) +{ + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + assert(s.stage == MESA_SHADER_VERTEX); + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_def(ntb, instr->def); + + switch (instr->intrinsic) { + case nir_intrinsic_load_vertex_id: + case nir_intrinsic_load_base_vertex: + unreachable("should be lowered by nir_lower_system_values()"); + + case nir_intrinsic_load_input: { + assert(instr->def.bit_size == 32); + const fs_reg src = offset(fs_reg(ATTR, 0, dest.type), bld, + nir_intrinsic_base(instr) * 4 + + nir_intrinsic_component(instr) + + nir_src_as_uint(instr->src[0])); + + for (unsigned i = 0; i < instr->num_components; i++) + bld.MOV(offset(dest, bld, i), offset(src, bld, i)); + break; + } + + case nir_intrinsic_load_vertex_id_zero_base: + case nir_intrinsic_load_instance_id: + case nir_intrinsic_load_base_instance: + case nir_intrinsic_load_draw_id: + case nir_intrinsic_load_first_vertex: + case nir_intrinsic_load_is_indexed_draw: + unreachable("lowered by brw_nir_lower_vs_inputs"); + + default: + fs_nir_emit_intrinsic(ntb, bld, instr); + break; + } +} + +static fs_reg +get_tcs_single_patch_icp_handle(nir_to_brw_state &ntb, const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + fs_visitor &s = ntb.s; + + struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data); + const nir_src &vertex_src = instr->src[0]; + nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src); + + const fs_reg start = s.tcs_payload().icp_handle_start; + + fs_reg icp_handle; + + if (nir_src_is_const(vertex_src)) { + /* Emit a MOV to resolve <0,1,0> regioning. 
 */ + icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + unsigned vertex = nir_src_as_uint(vertex_src); + bld.MOV(icp_handle, component(start, vertex)); + } else if (tcs_prog_data->instances == 1 && vertex_intrin && + vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) { + /* For the common case of only 1 instance, an array index of + * gl_InvocationID means reading the handles from the start. Skip all + * the indirect work. + */ + icp_handle = start; + } else { + /* The vertex index is non-constant. We need to use indirect + * addressing to fetch the proper URB handle. + */ + icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + /* Each ICP handle is a single DWord (4 bytes) */ + fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.SHL(vertex_offset_bytes, + retype(get_nir_src(ntb, vertex_src), BRW_REGISTER_TYPE_UD), + brw_imm_ud(2u)); + + /* We might read up to 4 registers. */ + bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, + start, vertex_offset_bytes, + brw_imm_ud(4 * REG_SIZE)); + } + + return icp_handle; +} + +static fs_reg +get_tcs_multi_patch_icp_handle(nir_to_brw_state &ntb, const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + fs_visitor &s = ntb.s; + const intel_device_info *devinfo = s.devinfo; + + struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) s.key; + const nir_src &vertex_src = instr->src[0]; + const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo); + + const fs_reg start = s.tcs_payload().icp_handle_start; + + if (nir_src_is_const(vertex_src)) + return byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes); + + /* The vertex index is non-constant. We need to use indirect + * addressing to fetch the proper URB handle. + * + * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0> + * indicating that channel <n> should read the handle from DWord <n>. + * We convert that to bytes by multiplying by 4. 
+ * + * Next, we convert the vertex index to bytes by multiplying + * by the GRF size (by shifting), and add the two together. This is + * the final indirect byte offset. + */ + fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg sequence = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; + fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + /* Offsets will be 0, 4, 8, ... */ + bld.SHL(channel_offsets, sequence, brw_imm_ud(2u)); + /* Convert vertex_index to bytes (multiply by 32) */ + assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */ + bld.SHL(vertex_offset_bytes, + retype(get_nir_src(ntb, vertex_src), BRW_REGISTER_TYPE_UD), + brw_imm_ud(ffs(grf_size_bytes) - 1)); + bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets); + + /* Use start of ICP handles as the base offset. There is one register + * of URB handles per vertex, so inform the register allocator that + * we might read up to nir->info.gs.vertices_in registers. 
+ */ + bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start, + icp_offset_bytes, + brw_imm_ud(brw_tcs_prog_key_input_vertices(tcs_key) * + grf_size_bytes)); + + return icp_handle; +} + +static void +setup_barrier_message_payload_gfx125(const fs_builder &bld, + const fs_reg &msg_payload) +{ + assert(bld.shader->devinfo->verx10 >= 125); + + /* From BSpec: 54006, mov r0.2[31:24] into m0.2[31:24] and m0.2[23:16] */ + fs_reg m0_10ub = component(retype(msg_payload, BRW_REGISTER_TYPE_UB), 10); + fs_reg r0_11ub = + stride(suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UB), 11), + 0, 1, 0); + bld.exec_all().group(2, 0).MOV(m0_10ub, r0_11ub); +} + +static void +emit_barrier(nir_to_brw_state &ntb) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + /* We are getting the barrier ID from the compute shader header */ + assert(gl_shader_stage_uses_workgroup(s.stage)); + + fs_reg payload = fs_reg(VGRF, s.alloc.allocate(1), BRW_REGISTER_TYPE_UD); + + /* Clear the message payload */ + bld.exec_all().group(8, 0).MOV(payload, brw_imm_ud(0u)); + + if (devinfo->verx10 >= 125) { + setup_barrier_message_payload_gfx125(bld, payload); + } else { + assert(gl_shader_stage_is_compute(s.stage)); + + uint32_t barrier_id_mask; + switch (devinfo->ver) { + case 7: + case 8: + barrier_id_mask = 0x0f000000u; break; + case 9: + barrier_id_mask = 0x8f000000u; break; + case 11: + case 12: + barrier_id_mask = 0x7f000000u; break; + default: + unreachable("barrier is only available on gen >= 7"); + } + + /* Copy the barrier id from r0.2 to the message payload reg.2 */ + fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)); + bld.exec_all().group(1, 0).AND(component(payload, 2), r0_2, + brw_imm_ud(barrier_id_mask)); + } + + /* Emit a gateway "barrier" message using the payload we set up, followed + * by a wait instruction. 
+ */ + bld.exec_all().emit(SHADER_OPCODE_BARRIER, reg_undef, payload); +} + +static void +emit_tcs_barrier(nir_to_brw_state &ntb) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + assert(s.stage == MESA_SHADER_TESS_CTRL); + struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data); + + fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg m0_2 = component(m0, 2); + + const fs_builder chanbld = bld.exec_all().group(1, 0); + + /* Zero the message header */ + bld.exec_all().MOV(m0, brw_imm_ud(0u)); + + if (devinfo->verx10 >= 125) { + setup_barrier_message_payload_gfx125(bld, m0); + } else if (devinfo->ver >= 11) { + chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(30, 24))); + + /* Set the Barrier Count and the enable bit */ + chanbld.OR(m0_2, m0_2, + brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15))); + } else { + /* Copy "Barrier ID" from r0.2, bits 16:13 */ + chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(16, 13))); + + /* Shift it up to bits 27:24. 
*/ + chanbld.SHL(m0_2, m0_2, brw_imm_ud(11)); + + /* Set the Barrier Count and the enable bit */ + chanbld.OR(m0_2, m0_2, + brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15))); + } + + bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0); +} + +static void +fs_nir_emit_tcs_intrinsic(nir_to_brw_state &ntb, + nir_intrinsic_instr *instr) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + assert(s.stage == MESA_SHADER_TESS_CTRL); + struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data); + struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base; + + fs_reg dst; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dst = get_nir_def(ntb, instr->def); + + switch (instr->intrinsic) { + case nir_intrinsic_load_primitive_id: + bld.MOV(dst, s.tcs_payload().primitive_id); + break; + case nir_intrinsic_load_invocation_id: + bld.MOV(retype(dst, s.invocation_id.type), s.invocation_id); + break; + + case nir_intrinsic_barrier: + if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE) + fs_nir_emit_intrinsic(ntb, bld, instr); + if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) { + if (tcs_prog_data->instances != 1) + emit_tcs_barrier(ntb); + } + break; + + case nir_intrinsic_load_input: + unreachable("nir_lower_io should never give us these."); + break; + + case nir_intrinsic_load_per_vertex_input: { + assert(instr->def.bit_size == 32); + fs_reg indirect_offset = get_indirect_offset(ntb, instr); + unsigned imm_offset = nir_intrinsic_base(instr); + fs_inst *inst; + + const bool multi_patch = + vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH; + + fs_reg icp_handle = multi_patch ? + get_tcs_multi_patch_icp_handle(ntb, bld, instr) : + get_tcs_single_patch_icp_handle(ntb, bld, instr); + + /* We can only read two double components with each URB read, so + * we send two read messages in that case, each one loading up to + * two double components. 
+ */ + unsigned num_components = instr->num_components; + unsigned first_component = nir_intrinsic_component(instr); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle; + + if (indirect_offset.file == BAD_FILE) { + /* Constant indexing - use global offset. */ + if (first_component != 0) { + unsigned read_components = num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs, + ARRAY_SIZE(srcs)); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs, + ARRAY_SIZE(srcs)); + } + inst->offset = imm_offset; + } else { + /* Indirect indexing - use per-slot offsets as well. */ + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; + + if (first_component != 0) { + unsigned read_components = num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, + srcs, ARRAY_SIZE(srcs)); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, + srcs, ARRAY_SIZE(srcs)); + } + inst->offset = imm_offset; + } + inst->size_written = (num_components + first_component) * + inst->dst.component_size(inst->exec_size); + + /* Copy the temporary to the destination to deal with writemasking. + * + * Also attempt to deal with gl_PointSize being in the .w component. 
 + */ + if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { + assert(type_sz(dst.type) == 4); + inst->dst = bld.vgrf(dst.type, 4); + inst->size_written = 4 * REG_SIZE * reg_unit(devinfo); + bld.MOV(dst, offset(inst->dst, bld, 3)); + } + break; + } + + case nir_intrinsic_load_output: + case nir_intrinsic_load_per_vertex_output: { + assert(instr->def.bit_size == 32); + fs_reg indirect_offset = get_indirect_offset(ntb, instr); + unsigned imm_offset = nir_intrinsic_base(instr); + unsigned first_component = nir_intrinsic_component(instr); + + fs_inst *inst; + if (indirect_offset.file == BAD_FILE) { + /* This MOV replicates the output handle to all enabled channels + * in SINGLE_PATCH mode. + */ + fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.MOV(patch_handle, s.tcs_payload().patch_urb_output); + + { + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle; + + if (first_component != 0) { + unsigned read_components = + instr->num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, + srcs, ARRAY_SIZE(srcs)); + inst->size_written = read_components * REG_SIZE * reg_unit(devinfo); + for (unsigned i = 0; i < instr->num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, + srcs, ARRAY_SIZE(srcs)); + inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo); + } + inst->offset = imm_offset; + } + } else { + /* Indirect indexing - use per-slot offsets as well. 
*/ + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output; + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; + + if (first_component != 0) { + unsigned read_components = + instr->num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, + srcs, ARRAY_SIZE(srcs)); + inst->size_written = read_components * REG_SIZE * reg_unit(devinfo); + for (unsigned i = 0; i < instr->num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, + srcs, ARRAY_SIZE(srcs)); + inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo); + } + inst->offset = imm_offset; + } + break; + } + + case nir_intrinsic_store_output: + case nir_intrinsic_store_per_vertex_output: { + assert(nir_src_bit_size(instr->src[0]) == 32); + fs_reg value = get_nir_src(ntb, instr->src[0]); + fs_reg indirect_offset = get_indirect_offset(ntb, instr); + unsigned imm_offset = nir_intrinsic_base(instr); + unsigned mask = nir_intrinsic_write_mask(instr); + + if (mask == 0) + break; + + unsigned num_components = util_last_bit(mask); + unsigned first_component = nir_intrinsic_component(instr); + assert((first_component + num_components) <= 4); + + mask = mask << first_component; + + const bool has_urb_lsc = devinfo->ver >= 20; + + fs_reg mask_reg; + if (mask != WRITEMASK_XYZW) + mask_reg = brw_imm_ud(mask << 16); + + fs_reg sources[4]; + + unsigned m = has_urb_lsc ? 
                   0 : first_component;
   /* Pack only the enabled components; pre-Gfx20 the URB write still
    * consumes a slot for disabled channels, so advance m anyway there.
    */
   for (unsigned i = 0; i < num_components; i++) {
      int c = i + first_component;
      if (mask & (1 << c)) {
         sources[m++] = offset(value, bld, i);
      } else if (devinfo->ver < 20) {
         m++;
      }
   }

   assert(has_urb_lsc || m == (first_component + num_components));

   fs_reg srcs[URB_LOGICAL_NUM_SRCS];
   srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
   srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
   srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
   srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_F, m);
   srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(m);
   bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, m, 0);

   fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
                            srcs, ARRAY_SIZE(srcs));
   inst->offset = imm_offset;
   break;
   }

   default:
      fs_nir_emit_intrinsic(ntb, bld, instr);
      break;
   }
}

/**
 * Emit fs IR for a tessellation-evaluation-stage NIR intrinsic.
 *
 * Handles TES-specific intrinsics (primitive id, tess coords, and
 * patch/per-vertex input loads from the URB); everything else is
 * forwarded to the generic fs_nir_emit_intrinsic().
 */
static void
fs_nir_emit_tes_intrinsic(nir_to_brw_state &ntb,
                          nir_intrinsic_instr *instr)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   assert(s.stage == MESA_SHADER_TESS_EVAL);
   struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(s.prog_data);

   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_def(ntb, instr->def);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_primitive_id:
      bld.MOV(dest, s.tes_payload().primitive_id);
      break;

   case nir_intrinsic_load_tess_coord:
      /* Tess coords arrive as three separate payload registers (u, v, w). */
      for (unsigned i = 0; i < 3; i++)
         bld.MOV(offset(dest, bld, i), s.tes_payload().coords[i]);
      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input: {
      assert(instr->def.bit_size == 32);
      fs_reg indirect_offset = get_indirect_offset(ntb, instr);
      unsigned imm_offset = nir_intrinsic_base(instr);
      unsigned first_component = nir_intrinsic_component(instr);

      fs_inst *inst;
      if (indirect_offset.file == BAD_FILE) {
         /* Arbitrarily only push up to 32 vec4 slots worth of data,
          * which is 16 registers (since each holds 2 vec4 slots).
          */
         const unsigned max_push_slots = 32;
         if (imm_offset < max_push_slots) {
            /* Pushed input: read straight from the ATTR file. */
            const fs_reg src = horiz_offset(fs_reg(ATTR, 0, dest.type),
                                            4 * imm_offset + first_component);
            for (int i = 0; i < instr->num_components; i++)
               bld.MOV(offset(dest, bld, i), component(src, i));

            /* Each pushed register holds two vec4 slots, hence the /2. */
            tes_prog_data->base.urb_read_length =
               MAX2(tes_prog_data->base.urb_read_length,
                    (imm_offset / 2) + 1);
         } else {
            /* Replicate the patch handle to all enabled channels */
            fs_reg srcs[URB_LOGICAL_NUM_SRCS];
            srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;

            if (first_component != 0) {
               /* Read extra leading components and shift them away after. */
               unsigned read_components =
                  instr->num_components + first_component;
               fs_reg tmp = bld.vgrf(dest.type, read_components);
               inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
                               srcs, ARRAY_SIZE(srcs));
               inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
               for (unsigned i = 0; i < instr->num_components; i++) {
                  bld.MOV(offset(dest, bld, i),
                          offset(tmp, bld, i + first_component));
               }
            } else {
               inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest,
                               srcs, ARRAY_SIZE(srcs));
               inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
            }
            inst->offset = imm_offset;
         }
      } else {
         /* Indirect indexing - use per-slot offsets as well. */

         /* We can only read two double components with each URB read, so
          * we send two read messages in that case, each one loading up to
          * two double components.
          */
         unsigned num_components = instr->num_components;

         fs_reg srcs[URB_LOGICAL_NUM_SRCS];
         srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;

         if (first_component != 0) {
            unsigned read_components =
               num_components + first_component;
            fs_reg tmp = bld.vgrf(dest.type, read_components);
            inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
                            srcs, ARRAY_SIZE(srcs));
            for (unsigned i = 0; i < num_components; i++) {
               bld.MOV(offset(dest, bld, i),
                       offset(tmp, bld, i + first_component));
            }
         } else {
            inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest,
                            srcs, ARRAY_SIZE(srcs));
         }
         inst->offset = imm_offset;
         inst->size_written = (num_components + first_component) *
                              inst->dst.component_size(inst->exec_size);
      }
      break;
   }
   default:
      fs_nir_emit_intrinsic(ntb, bld, instr);
      break;
   }
}

/**
 * Emit fs IR for a geometry-stage NIR intrinsic.
 *
 * Handles GS-specific intrinsics (primitive id, per-vertex input loads,
 * vertex/primitive emission and counters); everything else falls through
 * to the generic fs_nir_emit_intrinsic().
 */
static void
fs_nir_emit_gs_intrinsic(nir_to_brw_state &ntb,
                         nir_intrinsic_instr *instr)
{
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   assert(s.stage == MESA_SHADER_GEOMETRY);
   fs_reg indirect_offset;

   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_def(ntb, instr->def);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_primitive_id:
      assert(s.stage == MESA_SHADER_GEOMETRY);
      assert(brw_gs_prog_data(s.prog_data)->include_primitive_id);
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), s.gs_payload().primitive_id);
      break;

   case nir_intrinsic_load_input:
      unreachable("load_input intrinsics are invalid for the GS stage");

   case nir_intrinsic_load_per_vertex_input:
      emit_gs_input_load(ntb, dest, instr->src[0], nir_intrinsic_base(instr),
                         instr->src[1], instr->num_components,
                         nir_intrinsic_component(instr));
      break;

   case nir_intrinsic_emit_vertex_with_counter:
      emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));
      break;

   case nir_intrinsic_end_primitive_with_counter:
emit_gs_end_primitive(ntb, instr->src[0]); + break; + + case nir_intrinsic_set_vertex_and_primitive_count: + bld.MOV(s.final_gs_vertex_count, get_nir_src(ntb, instr->src[0])); + break; + + case nir_intrinsic_load_invocation_id: { + fs_reg val = ntb.system_values[SYSTEM_VALUE_INVOCATION_ID]; + assert(val.file != BAD_FILE); + dest.type = val.type; + bld.MOV(dest, val); + break; + } + + default: + fs_nir_emit_intrinsic(ntb, bld, instr); + break; + } +} + +/** + * Fetch the current render target layer index. + */ +static fs_reg +fetch_render_target_array_index(const fs_builder &bld) +{ + const fs_visitor *v = static_cast(bld.shader); + + if (bld.shader->devinfo->ver >= 20) { + /* Gfx20+ has separate Render Target Array indices for each pair + * of subspans in order to support multiple polygons, so we need + * to use a <1;8,0> region in order to select the correct word + * for each channel. + */ + const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); + + for (unsigned i = 0; i < DIV_ROUND_UP(bld.dispatch_width(), 16); i++) { + const fs_builder hbld = bld.group(16, i); + const struct brw_reg reg = retype(brw_vec1_grf(2 * i + 1, 1), + BRW_REGISTER_TYPE_UW); + hbld.AND(offset(idx, hbld, i), stride(reg, 1, 8, 0), + brw_imm_uw(0x7ff)); + } + + return idx; + } else if (bld.shader->devinfo->ver >= 12 && v->max_polygons == 2) { + /* According to the BSpec "PS Thread Payload for Normal + * Dispatch", the render target array index is stored as bits + * 26:16 of either the R1.1 or R1.6 poly info dwords, for the + * first and second polygons respectively in multipolygon PS + * dispatch mode. 
+ */ + assert(bld.dispatch_width() == 16); + const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); + + for (unsigned i = 0; i < v->max_polygons; i++) { + const fs_builder hbld = bld.group(8, i); + const struct brw_reg g1 = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3 + 10 * i); + hbld.AND(offset(idx, hbld, i), g1, brw_imm_uw(0x7ff)); + } + + return idx; + } else if (bld.shader->devinfo->ver >= 12) { + /* The render target array index is provided in the thread payload as + * bits 26:16 of r1.1. + */ + const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3), + brw_imm_uw(0x7ff)); + return idx; + } else if (bld.shader->devinfo->ver >= 6) { + /* The render target array index is provided in the thread payload as + * bits 26:16 of r0.0. + */ + const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1), + brw_imm_uw(0x7ff)); + return idx; + } else { + /* Pre-SNB we only ever render into the first layer of the framebuffer + * since layered rendering is not implemented. + */ + return brw_imm_ud(0); + } +} + +/* Sample from the MCS surface attached to this multisample texture. 
*/ +static fs_reg +emit_mcs_fetch(nir_to_brw_state &ntb, const fs_reg &coordinate, unsigned components, + const fs_reg &texture, + const fs_reg &texture_handle) +{ + const fs_builder &bld = ntb.bld; + + const fs_reg dest = ntb.s.vgrf(glsl_uvec4_type()); + + fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; + srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate; + srcs[TEX_LOGICAL_SRC_SURFACE] = texture; + srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0); + srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle; + srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components); + srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0); + srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0); + + fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs, + ARRAY_SIZE(srcs)); + + /* We only care about one or two regs of response, but the sampler always + * writes 4/8. + */ + inst->size_written = 4 * dest.component_size(inst->exec_size); + + return dest; +} + +/** + * Fake non-coherent framebuffer read implemented using TXF to fetch from the + * framebuffer at the current fragment coordinates and sample index. + */ +static fs_inst * +emit_non_coherent_fb_read(nir_to_brw_state &ntb, const fs_builder &bld, const fs_reg &dst, + unsigned target) +{ + fs_visitor &s = ntb.s; + const struct intel_device_info *devinfo = s.devinfo; + + assert(bld.shader->stage == MESA_SHADER_FRAGMENT); + const brw_wm_prog_key *wm_key = + reinterpret_cast(s.key); + assert(!wm_key->coherent_fb_fetch); + + /* Calculate the fragment coordinates. */ + const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); + bld.MOV(offset(coords, bld, 0), s.pixel_x); + bld.MOV(offset(coords, bld, 1), s.pixel_y); + bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld)); + + /* Calculate the sample index and MCS payload when multisampling. Luckily + * the MCS fetch message behaves deterministically for UMS surfaces, so it + * shouldn't be necessary to recompile based on whether the framebuffer is + * CMS or UMS. 
+ */ + assert(wm_key->multisample_fbo == BRW_ALWAYS || + wm_key->multisample_fbo == BRW_NEVER); + if (wm_key->multisample_fbo && + ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) + ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb); + + const fs_reg sample = ntb.system_values[SYSTEM_VALUE_SAMPLE_ID]; + const fs_reg mcs = wm_key->multisample_fbo ? + emit_mcs_fetch(ntb, coords, 3, brw_imm_ud(target), fs_reg()) : fs_reg(); + + /* Use either a normal or a CMS texel fetch message depending on whether + * the framebuffer is single or multisample. On SKL+ use the wide CMS + * message just in case the framebuffer uses 16x multisampling, it should + * be equivalent to the normal CMS fetch for lower multisampling modes. + */ + opcode op; + if (wm_key->multisample_fbo) { + /* On SKL+ use the wide CMS message just in case the framebuffer uses 16x + * multisampling, it should be equivalent to the normal CMS fetch for + * lower multisampling modes. + * + * On Gfx12HP, there is only CMS_W variant available. + */ + if (devinfo->verx10 >= 125) + op = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL; + else if (devinfo->ver >= 9) + op = SHADER_OPCODE_TXF_CMS_W_LOGICAL; + else + op = SHADER_OPCODE_TXF_CMS_LOGICAL; + } else { + op = SHADER_OPCODE_TXF_LOGICAL; + } + + /* Emit the instruction. 
*/ + fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; + srcs[TEX_LOGICAL_SRC_COORDINATE] = coords; + srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0); + srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = sample; + srcs[TEX_LOGICAL_SRC_MCS] = mcs; + srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(target); + srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0); + srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3); + srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0); + srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(0); + + fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs)); + inst->size_written = 4 * inst->dst.component_size(inst->exec_size); + + return inst; +} + +/** + * Actual coherent framebuffer read implemented using the native render target + * read message. Requires SKL+. + */ +static fs_inst * +emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target) +{ + assert(bld.shader->devinfo->ver >= 9); + fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst); + inst->target = target; + inst->size_written = 4 * inst->dst.component_size(inst->exec_size); + + return inst; +} + +static fs_reg +alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n) +{ + if (n && regs[0].file != BAD_FILE) { + return regs[0]; + + } else { + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size); + + for (unsigned i = 0; i < n; i++) + regs[i] = tmp; + + return tmp; + } +} + +static fs_reg +alloc_frag_output(nir_to_brw_state &ntb, unsigned location) +{ + fs_visitor &s = ntb.s; + + assert(s.stage == MESA_SHADER_FRAGMENT); + const brw_wm_prog_key *const key = + reinterpret_cast(s.key); + const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION); + const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX); + + if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1)) + return alloc_temporary(ntb.bld, 4, &s.dual_src_output, 1); + + else if (l == FRAG_RESULT_COLOR) + return alloc_temporary(ntb.bld, 4, s.outputs, + MAX2(key->nr_color_regions, 1)); + + 
else if (l == FRAG_RESULT_DEPTH) + return alloc_temporary(ntb.bld, 1, &s.frag_depth, 1); + + else if (l == FRAG_RESULT_STENCIL) + return alloc_temporary(ntb.bld, 1, &s.frag_stencil, 1); + + else if (l == FRAG_RESULT_SAMPLE_MASK) + return alloc_temporary(ntb.bld, 1, &s.sample_mask, 1); + + else if (l >= FRAG_RESULT_DATA0 && + l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS) + return alloc_temporary(ntb.bld, 4, + &s.outputs[l - FRAG_RESULT_DATA0], 1); + + else + unreachable("Invalid location"); +} + +static void +emit_is_helper_invocation(nir_to_brw_state &ntb, fs_reg result) +{ + const fs_builder &bld = ntb.bld; + + /* Unlike the regular gl_HelperInvocation, that is defined at dispatch, + * the helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes into + * consideration demoted invocations. + */ + result.type = BRW_REGISTER_TYPE_UD; + + bld.MOV(result, brw_imm_ud(0)); + + /* See brw_sample_mask_reg() for why we split SIMD32 into SIMD16 here. */ + unsigned width = bld.dispatch_width(); + for (unsigned i = 0; i < DIV_ROUND_UP(width, 16); i++) { + const fs_builder b = bld.group(MIN2(width, 16), i); + + fs_inst *mov = b.MOV(offset(result, b, i), brw_imm_ud(~0)); + + /* The at() ensures that any code emitted to get the predicate happens + * before the mov right above. This is not an issue elsewhere because + * lowering code already set up the builder this way. 
       */
      brw_emit_predicate_on_sample_mask(b.at(NULL, mov), mov);
      mov->predicate_inverse = true;
   }
}

/**
 * Emit the MOVs that materialize gl_FragCoord (x, y, z, w) into the four
 * consecutive components starting at \p wpos.
 */
static void
emit_fragcoord_interpolation(nir_to_brw_state &ntb, fs_reg wpos)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   assert(s.stage == MESA_SHADER_FRAGMENT);

   /* gl_FragCoord.x */
   bld.MOV(wpos, s.pixel_x);
   wpos = offset(wpos, bld, 1);

   /* gl_FragCoord.y */
   bld.MOV(wpos, s.pixel_y);
   wpos = offset(wpos, bld, 1);

   /* gl_FragCoord.z */
   if (devinfo->ver >= 6) {
      bld.MOV(wpos, s.pixel_z);
   } else {
      /* Pre-Gfx6: interpolate Z from the position varying's plane setup. */
      bld.emit(FS_OPCODE_LINTERP, wpos,
               s.delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
               s.interp_reg(bld, VARYING_SLOT_POS, 2, 0));
   }
   wpos = offset(wpos, bld, 1);

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   bld.MOV(wpos, s.wpos_w);
}

/**
 * Compute gl_FrontFacing as a boolean (~0 front-facing / 0 back-facing)
 * from the generation-specific payload bit that encodes facedness.
 */
static fs_reg
emit_frontfacing_interpolation(nir_to_brw_state &ntb)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   fs_reg ff = bld.vgrf(BRW_REGISTER_TYPE_D);

   if (devinfo->ver >= 20) {
      /* Gfx20+ has separate back-facing bits for each pair of
       * subspans in order to support multiple polygons, so we need to
       * use a <1;8,0> region in order to select the correct word for
       * each channel.
       */
      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);

      for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
         const fs_builder hbld = bld.group(16, i);
         const struct brw_reg gi_uw = retype(xe2_vec1_grf(i, 9),
                                             BRW_REGISTER_TYPE_UW);
         hbld.AND(offset(tmp, hbld, i), gi_uw, brw_imm_uw(0x800));
      }

      bld.CMP(ff, tmp, brw_imm_uw(0), BRW_CONDITIONAL_Z);

   } else if (devinfo->ver >= 12 && s.max_polygons == 2) {
      /* According to the BSpec "PS Thread Payload for Normal
       * Dispatch", the front/back facing interpolation bit is stored
       * as bit 15 of either the R1.1 or R1.6 poly info field, for the
       * first and second polygons respectively in multipolygon PS
       * dispatch mode.
       */
      assert(s.dispatch_width == 16);
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_W);

      for (unsigned i = 0; i < s.max_polygons; i++) {
         const fs_builder hbld = bld.group(8, i);
         const struct brw_reg g1 = retype(brw_vec1_grf(1, 1 + 5 * i),
                                          BRW_REGISTER_TYPE_W);
         /* ASR 15 sign-extends bit 15 across the word: ~0 or 0. */
         hbld.ASR(offset(tmp, hbld, i), g1, brw_imm_d(15));
      }

      bld.NOT(ff, tmp);

   } else if (devinfo->ver >= 12) {
      fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));

      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_W);
      bld.ASR(tmp, g1, brw_imm_d(15));
      bld.NOT(ff, tmp);
   } else if (devinfo->ver >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (~0/true or 0/false).
       *
       * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
       * this task in only one instruction:
       *    - a negation source modifier will flip the bit; and
       *    - a W -> D type conversion will sign extend the bit into the high
       *      word of the destination.
       *
       * An ASR 15 fills the low word of the destination.
       */
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
      g0.negate = true;

      bld.ASR(ff, g0, brw_imm_d(15));
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (1/true or 0/false).
       *
       * Like in the above case, since the bit is the MSB of g1.6:UD we can use
       * the negation source modifier to flip it. Unfortunately the SHR
       * instruction only operates on UD (or D with an abs source modifier)
       * sources without negation.
       *
       * Instead, use ASR (which will give ~0/true or 0/false).
       */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
      g1_6.negate = true;

      bld.ASR(ff, g1_6, brw_imm_d(31));
   }

   return ff;
}

/**
 * Compute gl_SamplePosition as a float vec2 in [0, 1], read from the
 * sub-pixel sample offsets in the thread payload.  Falls back to the
 * pixel center (0.5, 0.5) when not dispatched per-sample.
 */
static fs_reg
emit_samplepos_setup(nir_to_brw_state &ntb)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   assert(s.stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
   assert(devinfo->ver >= 6);

   const fs_builder abld = bld.annotate("compute sample position");
   fs_reg pos = abld.vgrf(BRW_REGISTER_TYPE_F, 2);

   if (wm_prog_data->persample_dispatch == BRW_NEVER) {
      /* From ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SamplePosition will always be
       *  (0.5, 0.5).
       */
      bld.MOV(offset(pos, bld, 0), brw_imm_f(0.5f));
      bld.MOV(offset(pos, bld, 1), brw_imm_f(0.5f));
      return pos;
   }

   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
    * mode will be enabled.
    *
    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
    * R31.1:0         Position Offset X/Y for Slot[3:0]
    * R31.3:2         Position Offset X/Y for Slot[7:4]
    * .....
    *
    * The X, Y sample positions come in as bytes in  thread payload. So, read
    * the positions using vstride=16, width=8, hstride=2.
    */
   const fs_reg sample_pos_reg =
      fetch_payload_reg(abld, s.fs_payload().sample_pos_reg, BRW_REGISTER_TYPE_W);

   for (unsigned i = 0; i < 2; i++) {
      fs_reg tmp_d = bld.vgrf(BRW_REGISTER_TYPE_D);
      abld.MOV(tmp_d, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, i));
      /* Convert int_sample_pos to floating point */
      fs_reg tmp_f = bld.vgrf(BRW_REGISTER_TYPE_F);
      abld.MOV(tmp_f, tmp_d);
      /* Scale to the range [0, 1] */
      abld.MUL(offset(pos, abld, i), tmp_f, brw_imm_f(1 / 16.0f));
   }

   if (wm_prog_data->persample_dispatch == BRW_SOMETIMES) {
      /* Dynamic per-sample dispatch: select the payload position when the
       * MSAA flag is set, the pixel center otherwise.
       */
      check_dynamic_msaa_flag(abld, wm_prog_data,
                              INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
      for (unsigned i = 0; i < 2; i++) {
         set_predicate(BRW_PREDICATE_NORMAL,
                       bld.SEL(offset(pos, abld, i), offset(pos, abld, i),
                               brw_imm_f(0.5f)));
      }
   }

   return pos;
}

/**
 * Compute gl_SampleID for every channel from the packed sample-id fields
 * in the thread payload (generation-specific layout; see the per-branch
 * comments below).
 */
static fs_reg
emit_sampleid_setup(nir_to_brw_state &ntb)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   assert(s.stage == MESA_SHADER_FRAGMENT);
   ASSERTED brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
   assert(devinfo->ver >= 6);

   const fs_builder abld = bld.annotate("compute sample id");
   fs_reg sample_id = abld.vgrf(BRW_REGISTER_TYPE_UD);

   assert(key->multisample_fbo != BRW_NEVER);

   if (devinfo->ver >= 8) {
      /* Sample ID comes in as 4-bit numbers in g1.0:
       *
       *    15:12 Slot 3 SampleID (only used in SIMD16)
       *     11:8 Slot 2 SampleID (only used in SIMD16)
       *      7:4 Slot 1 SampleID
       *      3:0 Slot 0 SampleID
       *
       * Each slot corresponds to four channels, so we want to replicate each
       * half-byte value to 4 channels in a row:
       *
       *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
       *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
       *
       *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0  (if SIMD16)
       *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
       *
       * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
       * channels to read the first byte (7:0), and the second group of 8
       * channels to read the second byte (15:8).  Then, we shift right by
       * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
       * values into place.  Finally, we AND with 0xf to keep the low nibble.
       *
       *    shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
       *    and(16) dst<1>D tmp<8,8,1>W  0xf:W
       *
       * TODO: These payload bits exist on Gfx7 too, but they appear to always
       *       be zero, so this code fails to work.  We should find out why.
       */
      const fs_reg tmp = abld.vgrf(BRW_REGISTER_TYPE_UW);

      for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
         const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
         /* According to the "PS Thread Payload for Normal Dispatch"
          * pages on the BSpec, the sample ids are stored in R0.8/R1.8
          * on gfx20+ and in R1.0/R2.0 on gfx8+.
          */
         const struct brw_reg id_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) :
                                       brw_vec1_grf(i + 1, 0);
         hbld.SHR(offset(tmp, hbld, i),
                  stride(retype(id_reg, BRW_REGISTER_TYPE_UB), 1, 8, 0),
                  brw_imm_v(0x44440000));
      }

      abld.AND(sample_id, tmp, brw_imm_w(0xf));
   } else {
      const fs_reg t1 = component(abld.vgrf(BRW_REGISTER_TYPE_UD), 0);
      const fs_reg t2 = abld.vgrf(BRW_REGISTER_TYPE_UW);

      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
       * 8x multisampling, subspan 0 will represent sample N (where N
       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
       * 7. We can find the value of N by looking at R0.0 bits 7:6
       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
       * (since samples are always delivered in pairs). That is, we
       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
       * populating a temporary variable with the sequence (0, 1, 2, 3),
       * and then reading from it using vstride=1, width=4, hstride=0.
       * These computations hold good for 4x multisampling as well.
       *
       * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
       * the first four slots are sample 0 of subspan 0; the next four
       * are sample 1 of subspan 0; the third group is sample 0 of
       * subspan 1, and finally sample 1 of subspan 1.
       */

      /* SKL+ has an extra bit for the Starting Sample Pair Index to
       * accommodate 16x MSAA.
       */
      abld.exec_all().group(1, 0)
          .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
               brw_imm_ud(0xc0));
      abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));

      /* This works for SIMD8-SIMD16.  It also works for SIMD32 but only if we
       * can assume 4x MSAA.  Disallow it on IVB+
       *
       * FINISHME: One day, we could come up with a way to do this that
       * actually works on gfx7.
       */
      if (devinfo->ver >= 7)
         s.limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gfx7");
      abld.exec_all().group(8, 0).MOV(t2, brw_imm_v(0x32103210));

      /* This special instruction takes care of setting vstride=1,
       * width=4, hstride=0 of t2 during an ADD instruction.
       */
      abld.emit(FS_OPCODE_SET_SAMPLE_ID, sample_id, t1, t2);
   }

   if (key->multisample_fbo == BRW_SOMETIMES) {
      /* Dynamic MSAA: force sample id 0 when the FBO is single-sampled. */
      check_dynamic_msaa_flag(abld, wm_prog_data,
                              INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
      set_predicate(BRW_PREDICATE_NORMAL,
                    abld.SEL(sample_id, sample_id, brw_imm_ud(0)));
   }

   return sample_id;
}

/**
 * Compute gl_SampleMaskIn: the payload coverage mask, restricted to the
 * current sample's bit when dispatching per-sample.
 */
static fs_reg
emit_samplemaskin_setup(nir_to_brw_state &ntb)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   assert(s.stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
   assert(devinfo->ver >= 6);

   /* The HW doesn't provide us with expected values.
    */
   assert(wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS);

   fs_reg coverage_mask =
      fetch_payload_reg(bld, s.fs_payload().sample_mask_in_reg, BRW_REGISTER_TYPE_D);

   if (wm_prog_data->persample_dispatch == BRW_NEVER)
      return coverage_mask;

   /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
    * and a mask representing which sample is being processed by the
    * current shader invocation.
    *
    * From the OES_sample_variables specification:
    * "When per-sample shading is active due to the use of a fragment input
    *  qualified by "sample" or due to the use of the gl_SampleID or
    *  gl_SamplePosition variables, only the bit for the current sample is
    *  set in gl_SampleMaskIn."
    */
   const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");

   if (ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
      ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);

   /* mask = coverage & (1 << gl_SampleID) */
   fs_reg one = s.vgrf(glsl_int_type());
   fs_reg enabled_mask = s.vgrf(glsl_int_type());
   abld.MOV(one, brw_imm_d(1));
   abld.SHL(enabled_mask, one, ntb.system_values[SYSTEM_VALUE_SAMPLE_ID]);
   fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_D);
   abld.AND(mask, enabled_mask, coverage_mask);

   if (wm_prog_data->persample_dispatch == BRW_ALWAYS)
      return mask;

   /* Dynamic case: fall back to the raw coverage mask when not actually
    * dispatched per-sample.
    */
   check_dynamic_msaa_flag(abld, wm_prog_data,
                           INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
   set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask));

   return mask;
}

/**
 * Compute gl_ShadingRateEXT from the actual coarse-pixel size fields in the
 * thread payload, encoded as the SPIR-V bitfield (rate/2, shifted).
 * Returns 0 when not in coarse pixel dispatch mode.
 */
static fs_reg
emit_shading_rate_setup(nir_to_brw_state &ntb)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;

   assert(devinfo->ver >= 11);

   struct brw_wm_prog_data *wm_prog_data =
      brw_wm_prog_data(bld.shader->stage_prog_data);

   /* Coarse pixel shading size fields overlap with other fields of not in
    * coarse pixel dispatch mode, so report 0 when that's not the case.
    */
   if (wm_prog_data->coarse_pixel_dispatch == BRW_NEVER)
      return brw_imm_ud(0);

   const fs_builder abld = bld.annotate("compute fragment shading rate");

   /* The shading rates provided in the shader are the actual 2D shading
    * rate while the SPIR-V built-in is the enum value that has the shading
    * rate encoded as a bitfield.  Fortunately, the bitfield value is just
    * the shading rate divided by two and shifted.
    */

   /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
   fs_reg actual_x = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB));
   /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
   fs_reg actual_y = byte_offset(actual_x, 1);

   fs_reg int_rate_x = bld.vgrf(BRW_REGISTER_TYPE_UD);
   fs_reg int_rate_y = bld.vgrf(BRW_REGISTER_TYPE_UD);

   abld.SHR(int_rate_y, actual_y, brw_imm_ud(1));
   abld.SHR(int_rate_x, actual_x, brw_imm_ud(1));
   abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2));

   fs_reg rate = abld.vgrf(BRW_REGISTER_TYPE_UD);
   abld.OR(rate, int_rate_x, int_rate_y);

   if (wm_prog_data->coarse_pixel_dispatch == BRW_ALWAYS)
      return rate;

   /* Dynamic case: report 0 unless coarse RT writes are actually enabled. */
   check_dynamic_msaa_flag(abld, wm_prog_data,
                           INTEL_MSAA_FLAG_COARSE_RT_WRITES);
   set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(rate, rate, brw_imm_ud(0)));

   return rate;
}

static void
fs_nir_emit_fs_intrinsic(nir_to_brw_state &ntb,
                         nir_intrinsic_instr *instr)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   assert(s.stage == MESA_SHADER_FRAGMENT);

   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_def(ntb, instr->def);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_front_face:
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
              emit_frontfacing_interpolation(ntb));
      break;

   case nir_intrinsic_load_sample_pos:
   case nir_intrinsic_load_sample_pos_or_center: {
      fs_reg sample_pos = ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
      assert(sample_pos.file != BAD_FILE);
dest.type = sample_pos.type; + bld.MOV(dest, sample_pos); + bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1)); + break; + } + + case nir_intrinsic_load_layer_id: + dest.type = BRW_REGISTER_TYPE_UD; + bld.MOV(dest, fetch_render_target_array_index(bld)); + break; + + case nir_intrinsic_is_helper_invocation: + emit_is_helper_invocation(ntb, dest); + break; + + case nir_intrinsic_load_helper_invocation: + case nir_intrinsic_load_sample_mask_in: + case nir_intrinsic_load_sample_id: + case nir_intrinsic_load_frag_shading_rate: { + gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); + fs_reg val = ntb.system_values[sv]; + assert(val.file != BAD_FILE); + dest.type = val.type; + bld.MOV(dest, val); + break; + } + + case nir_intrinsic_store_output: { + const fs_reg src = get_nir_src(ntb, instr->src[0]); + const unsigned store_offset = nir_src_as_uint(instr->src[1]); + const unsigned location = nir_intrinsic_base(instr) + + SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION); + const fs_reg new_dest = retype(alloc_frag_output(ntb, location), + src.type); + + for (unsigned j = 0; j < instr->num_components; j++) + bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j), + offset(src, bld, j)); + + break; + } + + case nir_intrinsic_load_output: { + const unsigned l = GET_FIELD(nir_intrinsic_base(instr), + BRW_NIR_FRAG_OUTPUT_LOCATION); + assert(l >= FRAG_RESULT_DATA0); + const unsigned load_offset = nir_src_as_uint(instr->src[0]); + const unsigned target = l - FRAG_RESULT_DATA0 + load_offset; + const fs_reg tmp = bld.vgrf(dest.type, 4); + + if (reinterpret_cast(s.key)->coherent_fb_fetch) + emit_coherent_fb_read(bld, tmp, target); + else + emit_non_coherent_fb_read(ntb, bld, tmp, target); + + for (unsigned j = 0; j < instr->num_components; j++) { + bld.MOV(offset(dest, bld, j), + offset(tmp, bld, nir_intrinsic_component(instr) + j)); + } + + break; + } + + case nir_intrinsic_demote: + case nir_intrinsic_discard: + case 
nir_intrinsic_terminate: + case nir_intrinsic_demote_if: + case nir_intrinsic_discard_if: + case nir_intrinsic_terminate_if: { + /* We track our discarded pixels in f0.1/f1.0. By predicating on it, we + * can update just the flag bits that aren't yet discarded. If there's + * no condition, we emit a CMP of g0 != g0, so all currently executing + * channels will get turned off. + */ + fs_inst *cmp = NULL; + if (instr->intrinsic == nir_intrinsic_demote_if || + instr->intrinsic == nir_intrinsic_discard_if || + instr->intrinsic == nir_intrinsic_terminate_if) { + nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]); + + if (alu != NULL && + alu->op != nir_op_bcsel && + (devinfo->ver > 5 || + (alu->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) != BRW_NIR_BOOLEAN_NEEDS_RESOLVE || + alu->op == nir_op_fneu32 || alu->op == nir_op_feq32 || + alu->op == nir_op_flt32 || alu->op == nir_op_fge32 || + alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 || + alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 || + alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) { + /* Re-emit the instruction that generated the Boolean value, but + * do not store it. Since this instruction will be conditional, + * other instructions that want to use the real Boolean value may + * get garbage. This was a problem for piglit's fs-discard-exit-2 + * test. + * + * Ideally we'd detect that the instruction cannot have a + * conditional modifier before emitting the instructions. Alas, + * that is nigh impossible. Instead, we're going to assume the + * instruction (or last instruction) generated can have a + * conditional modifier. If it cannot, fallback to the old-style + * compare, and hope dead code elimination will clean up the + * extra instructions generated. 
+ */ + fs_nir_emit_alu(ntb, alu, false); + + cmp = (fs_inst *) s.instructions.get_tail(); + if (cmp->conditional_mod == BRW_CONDITIONAL_NONE) { + if (cmp->can_do_cmod()) + cmp->conditional_mod = BRW_CONDITIONAL_Z; + else + cmp = NULL; + } else { + /* The old sequence that would have been generated is, + * basically, bool_result == false. This is equivalent to + * !bool_result, so negate the old modifier. + */ + cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod); + } + } + + if (cmp == NULL) { + cmp = bld.CMP(bld.null_reg_f(), get_nir_src(ntb, instr->src[0]), + brw_imm_d(0), BRW_CONDITIONAL_Z); + } + } else { + fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), + BRW_REGISTER_TYPE_UW)); + cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ); + } + + cmp->predicate = BRW_PREDICATE_NORMAL; + cmp->flag_subreg = sample_mask_flag_subreg(s); + + fs_inst *jump = bld.emit(BRW_OPCODE_HALT); + jump->flag_subreg = sample_mask_flag_subreg(s); + jump->predicate_inverse = true; + + if (instr->intrinsic == nir_intrinsic_terminate || + instr->intrinsic == nir_intrinsic_terminate_if) { + jump->predicate = BRW_PREDICATE_NORMAL; + } else { + /* Only jump when the whole quad is demoted. For historical + * reasons this is also used for discard. + */ + jump->predicate = (devinfo->ver >= 20 ? XE2_PREDICATE_ANY : + BRW_PREDICATE_ALIGN1_ANY4H); + } + + if (devinfo->ver < 7) + s.limit_dispatch_width( + 16, "Fragment discard/demote not implemented in SIMD32 mode.\n"); + break; + } + + case nir_intrinsic_load_input: { + /* In Fragment Shaders load_input is used either for flat inputs or + * per-primitive inputs. 
+ */ + assert(instr->def.bit_size == 32); + unsigned base = nir_intrinsic_base(instr); + unsigned comp = nir_intrinsic_component(instr); + unsigned num_components = instr->num_components; + + const struct brw_wm_prog_key *wm_key = (brw_wm_prog_key*) s.key; + + if (wm_key->mesh_input == BRW_SOMETIMES) { + assert(devinfo->verx10 >= 125); + /* The FS payload gives us the viewport and layer clamped to valid + * ranges, but the spec for gl_ViewportIndex and gl_Layer includes + * the language: + * the fragment stage will read the same value written by the + * geometry stage, even if that value is out of range. + * + * Which is why these are normally passed as regular attributes. + * This isn't tested anywhere except some GL-only piglit tests + * though, so for the case where the FS may be used against either a + * traditional pipeline or a mesh one, where the position of these + * will change depending on the previous stage, read them from the + * payload to simplify things until the requisite magic is in place. + */ + if (base == VARYING_SLOT_LAYER || base == VARYING_SLOT_VIEWPORT) { + assert(num_components == 1); + fs_reg g1(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD)); + + unsigned mask, shift_count; + if (base == VARYING_SLOT_LAYER) { + shift_count = 16; + mask = 0x7ff << shift_count; + } else { + shift_count = 27; + mask = 0xf << shift_count; + } + + fs_reg vp_or_layer = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(vp_or_layer, g1, brw_imm_ud(mask)); + fs_reg shifted_value = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.SHR(shifted_value, vp_or_layer, brw_imm_ud(shift_count)); + bld.MOV(offset(dest, bld, 0), retype(shifted_value, dest.type)); + break; + } + } + + /* TODO(mesh): Multiview. Verify and handle these special cases for Mesh. 
*/ + + /* Special case fields in the VUE header */ + if (base == VARYING_SLOT_LAYER) + comp = 1; + else if (base == VARYING_SLOT_VIEWPORT) + comp = 2; + + if (BITFIELD64_BIT(base) & s.nir->info.per_primitive_inputs) { + assert(base != VARYING_SLOT_PRIMITIVE_INDICES); + for (unsigned int i = 0; i < num_components; i++) { + bld.MOV(offset(dest, bld, i), + retype(s.per_primitive_reg(bld, base, comp + i), dest.type)); + } + } else { + /* Gfx20+ packs the plane parameters of a single logical + * input in a vec3 format instead of the previously used vec4 + * format. + */ + const unsigned k = devinfo->ver >= 20 ? 0 : 3; + for (unsigned int i = 0; i < num_components; i++) { + bld.MOV(offset(dest, bld, i), + retype(s.interp_reg(bld, base, comp + i, k), dest.type)); + } + } + break; + } + + case nir_intrinsic_load_fs_input_interp_deltas: { + assert(s.stage == MESA_SHADER_FRAGMENT); + assert(nir_src_as_uint(instr->src[0]) == 0); + const unsigned base = nir_intrinsic_base(instr); + const unsigned comp = nir_intrinsic_component(instr); + dest.type = BRW_REGISTER_TYPE_F; + + /* Gfx20+ packs the plane parameters of a single logical + * input in a vec3 format instead of the previously used vec4 + * format. 
+ */ + if (devinfo->ver >= 20) { + bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 0)); + bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 2)); + bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 1)); + } else { + bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 3)); + bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 1)); + bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 0)); + } + + break; + } + + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: { + /* Use the delta_xy values computed from the payload */ + enum brw_barycentric_mode bary = brw_barycentric_mode(instr); + const fs_reg srcs[] = { offset(s.delta_xy[bary], bld, 0), + offset(s.delta_xy[bary], bld, 1) }; + bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); + break; + } + + case nir_intrinsic_load_barycentric_at_sample: { + const glsl_interp_mode interpolation = + (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); + + fs_reg msg_data; + if (nir_src_is_const(instr->src[0])) { + msg_data = brw_imm_ud(nir_src_as_uint(instr->src[0]) << 4); + } else { + const fs_reg sample_src = retype(get_nir_src(ntb, instr->src[0]), + BRW_REGISTER_TYPE_UD); + const fs_reg sample_id = bld.emit_uniformize(sample_src); + msg_data = component(bld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD), 0); + bld.exec_all().group(1, 0).SHL(msg_data, sample_id, brw_imm_ud(4u)); + } + + fs_reg flag_reg; + struct brw_wm_prog_key *wm_prog_key = (struct brw_wm_prog_key *) s.key; + if (wm_prog_key->multisample_fbo == BRW_SOMETIMES) { + struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data); + + check_dynamic_msaa_flag(bld.exec_all().group(8, 0), + wm_prog_data, + INTEL_MSAA_FLAG_MULTISAMPLE_FBO); + flag_reg = brw_flag_reg(0, 0); + } + + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + dest, + fs_reg(), /* src */ + msg_data, + flag_reg, + interpolation); + 
break; + } + + case nir_intrinsic_load_barycentric_at_offset: { + const glsl_interp_mode interpolation = + (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); + + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + + if (const_offset) { + assert(nir_src_bit_size(instr->src[0]) == 32); + unsigned off_x = const_offset[0].u32 & 0xf; + unsigned off_y = const_offset[1].u32 & 0xf; + + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, + dest, + fs_reg(), /* src */ + brw_imm_ud(off_x | (off_y << 4)), + fs_reg(), /* flag_reg */ + interpolation); + } else { + fs_reg src = retype(get_nir_src(ntb, instr->src[0]), BRW_REGISTER_TYPE_D); + const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; + emit_pixel_interpolater_send(bld, + opcode, + dest, + src, + brw_imm_ud(0u), + fs_reg(), /* flag_reg */ + interpolation); + } + break; + } + + case nir_intrinsic_load_frag_coord: + emit_fragcoord_interpolation(ntb, dest); + break; + + case nir_intrinsic_load_interpolated_input: { + assert(instr->src[0].ssa && + instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic); + nir_intrinsic_instr *bary_intrinsic = + nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); + nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic; + enum glsl_interp_mode interp_mode = + (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic); + fs_reg dst_xy; + + if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || + bary_intrin == nir_intrinsic_load_barycentric_at_sample) { + /* Use the result of the PI message. 
*/ + dst_xy = retype(get_nir_src(ntb, instr->src[0]), BRW_REGISTER_TYPE_F); + } else { + /* Use the delta_xy values computed from the payload */ + enum brw_barycentric_mode bary = brw_barycentric_mode(bary_intrinsic); + dst_xy = s.delta_xy[bary]; + } + + for (unsigned int i = 0; i < instr->num_components; i++) { + fs_reg interp = + s.interp_reg(bld, nir_intrinsic_base(instr), + nir_intrinsic_component(instr) + i, 0); + interp.type = BRW_REGISTER_TYPE_F; + dest.type = BRW_REGISTER_TYPE_F; + + if (devinfo->ver < 6 && interp_mode == INTERP_MODE_SMOOTH) { + fs_reg tmp = s.vgrf(glsl_float_type()); + bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp); + bld.MUL(offset(dest, bld, i), tmp, s.pixel_w); + } else { + bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp); + } + } + break; + } + + default: + fs_nir_emit_intrinsic(ntb, bld, instr); + break; + } +} + +static void +fs_nir_emit_cs_intrinsic(nir_to_brw_state &ntb, + nir_intrinsic_instr *instr) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + assert(gl_shader_stage_uses_workgroup(s.stage)); + struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(s.prog_data); + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_def(ntb, instr->def); + + switch (instr->intrinsic) { + case nir_intrinsic_barrier: + if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE) + fs_nir_emit_intrinsic(ntb, bld, instr); + if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) { + /* The whole workgroup fits in a single HW thread, so all the + * invocations are already executed lock-step. Instead of an actual + * barrier just emit a scheduling fence, that will generate no code. 
+ */ + if (!s.nir->info.workgroup_size_variable && + s.workgroup_size() <= s.dispatch_width) { + bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE); + break; + } + + emit_barrier(ntb); + cs_prog_data->uses_barrier = true; + } + break; + + case nir_intrinsic_load_subgroup_id: + s.cs_payload().load_subgroup_id(bld, dest); + break; + + case nir_intrinsic_load_local_invocation_id: + /* This is only used for hardware generated local IDs. */ + assert(cs_prog_data->generate_local_id); + + dest.type = BRW_REGISTER_TYPE_UD; + + for (unsigned i = 0; i < 3; i++) + bld.MOV(offset(dest, bld, i), s.cs_payload().local_invocation_id[i]); + break; + + case nir_intrinsic_load_workgroup_id: + case nir_intrinsic_load_workgroup_id_zero_base: { + fs_reg val = ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID]; + assert(val.file != BAD_FILE); + dest.type = val.type; + for (unsigned i = 0; i < 3; i++) + bld.MOV(offset(dest, bld, i), offset(val, bld, i)); + break; + } + + case nir_intrinsic_load_num_workgroups: { + assert(instr->def.bit_size == 32); + + cs_prog_data->uses_num_work_groups = true; + + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(0); + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(3); /* num components */ + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(0); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + fs_inst *inst = + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + inst->size_written = 3 * s.dispatch_width * 4; + break; + } + + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + fs_nir_emit_surface_atomic(ntb, bld, instr, brw_imm_ud(GFX7_BTI_SLM), + false /* bindless */); + break; + + case nir_intrinsic_load_shared: { + assert(devinfo->ver >= 7); + + const unsigned bit_size = instr->def.bit_size; + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[SURFACE_LOGICAL_SRC_SURFACE] = 
brw_imm_ud(GFX7_BTI_SLM); + + fs_reg addr = get_nir_src(ntb, instr->src[0]); + int base = nir_intrinsic_base(instr); + if (base) { + fs_reg addr_off = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.ADD(addr_off, addr, brw_imm_d(base)); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off; + } else { + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr; + } + + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + + /* Make dest unsigned because that's what the temporary will be */ + dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + /* Read the vector */ + assert(bit_size <= 32); + assert(nir_intrinsic_align(instr) > 0); + if (bit_size == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(instr->def.num_components <= 4); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + fs_inst *inst = + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + inst->size_written = instr->num_components * s.dispatch_width * 4; + } else { + assert(instr->def.num_components == 1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + + fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, + read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); + bld.MOV(dest, subscript(read_result, dest.type, 0)); + } + break; + } + + case nir_intrinsic_store_shared: { + assert(devinfo->ver >= 7); + + const unsigned bit_size = nir_src_bit_size(instr->src[0]); + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM); + + fs_reg addr = get_nir_src(ntb, instr->src[1]); + int base = nir_intrinsic_base(instr); + if (base) { + fs_reg addr_off = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.ADD(addr_off, addr, brw_imm_d(base)); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off; + } else { + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr; + } + + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = 
brw_imm_ud(1); + /* No point in masking with sample mask, here we're handling compute + * intrinsics. + */ + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + + fs_reg data = get_nir_src(ntb, instr->src[0]); + data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + assert(bit_size <= 32); + assert(nir_intrinsic_write_mask(instr) == + (1u << instr->num_components) - 1); + assert(nir_intrinsic_align(instr) > 0); + if (bit_size == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(nir_src_num_components(instr->src[0]) <= 4); + srcs[SURFACE_LOGICAL_SRC_DATA] = data; + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } else { + assert(nir_src_num_components(instr->src[0]) == 1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + + srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); + + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } + break; + } + + case nir_intrinsic_load_workgroup_size: { + /* Should have been lowered by brw_nir_lower_cs_intrinsics() or + * crocus/iris_setup_uniforms() for the variable group size case. 
+ */ + unreachable("Should have been lowered"); + break; + } + + case nir_intrinsic_dpas_intel: { + const unsigned sdepth = nir_intrinsic_systolic_depth(instr); + const unsigned rcount = nir_intrinsic_repeat_count(instr); + + const brw_reg_type dest_type = + brw_type_for_nir_type(devinfo, nir_intrinsic_dest_type(instr)); + const brw_reg_type src_type = + brw_type_for_nir_type(devinfo, nir_intrinsic_src_type(instr)); + + dest = retype(dest, dest_type); + fs_reg src2 = retype(get_nir_src(ntb, instr->src[2]), dest_type); + const fs_reg dest_hf = dest; + + fs_builder bld8 = bld.exec_all().group(8, 0); + fs_builder bld16 = bld.exec_all().group(16, 0); + + /* DG2 cannot have the destination or source 0 of DPAS be float16. It is + * still advantageous to support these formats for memory and bandwidth + * savings. + * + * The float16 source must be expanded to float32. + */ + if (devinfo->verx10 == 125 && dest_type == BRW_REGISTER_TYPE_HF && + !s.compiler->lower_dpas) { + dest = bld8.vgrf(BRW_REGISTER_TYPE_F, rcount); + + if (src2.file != ARF) { + const fs_reg src2_hf = src2; + + src2 = bld8.vgrf(BRW_REGISTER_TYPE_F, rcount); + + for (unsigned i = 0; i < 4; i++) { + bld16.MOV(byte_offset(src2, REG_SIZE * i * 2), + byte_offset(src2_hf, REG_SIZE * i)); + } + } else { + src2 = retype(src2, BRW_REGISTER_TYPE_F); + } + } + + bld8.DPAS(dest, + src2, + retype(get_nir_src(ntb, instr->src[1]), src_type), + retype(get_nir_src(ntb, instr->src[0]), src_type), + sdepth, + rcount) + ->saturate = nir_intrinsic_saturate(instr); + + /* Compact the destination to float16 (from float32). 
*/ + if (!dest.equals(dest_hf)) { + for (unsigned i = 0; i < 4; i++) { + bld16.MOV(byte_offset(dest_hf, REG_SIZE * i), + byte_offset(dest, REG_SIZE * i * 2)); + } + } + + cs_prog_data->uses_systolic = true; + break; + } + + default: + fs_nir_emit_intrinsic(ntb, bld, instr); + break; + } +} + +static void +emit_rt_lsc_fence(const fs_builder &bld, + enum lsc_fence_scope scope, + enum lsc_flush_type flush_type) +{ + const intel_device_info *devinfo = bld.shader->devinfo; + + const fs_builder ubld = bld.exec_all().group(8, 0); + fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); + fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, tmp, + brw_imm_ud(0) /* desc */, + brw_imm_ud(0) /* ex_desc */, + brw_vec8_grf(0, 0) /* payload */); + send->sfid = GFX12_SFID_UGM; + send->desc = lsc_fence_msg_desc(devinfo, scope, flush_type, true); + send->mlen = reg_unit(devinfo); /* g0 header */ + send->ex_mlen = 0; + /* Temp write for scheduling */ + send->size_written = REG_SIZE * reg_unit(devinfo); + send->send_has_side_effects = true; + + ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), tmp); +} + + +static void +fs_nir_emit_bs_intrinsic(nir_to_brw_state &ntb, + nir_intrinsic_instr *instr) +{ + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + assert(brw_shader_stage_is_bindless(s.stage)); + const bs_thread_payload &payload = s.bs_payload(); + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_def(ntb, instr->def); + + switch (instr->intrinsic) { + case nir_intrinsic_load_btd_global_arg_addr_intel: + bld.MOV(dest, retype(payload.global_arg_ptr, dest.type)); + break; + + case nir_intrinsic_load_btd_local_arg_addr_intel: + bld.MOV(dest, retype(payload.local_arg_ptr, dest.type)); + break; + + case nir_intrinsic_load_btd_shader_type_intel: + payload.load_shader_type(bld, dest); + break; + + default: + fs_nir_emit_intrinsic(ntb, bld, instr); + break; + } +} + +static fs_reg +brw_nir_reduction_op_identity(const fs_builder &bld, + nir_op op, 
brw_reg_type type) +{ + nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8); + switch (type_sz(type)) { + case 1: + if (type == BRW_REGISTER_TYPE_UB) { + return brw_imm_uw(value.u8); + } else { + assert(type == BRW_REGISTER_TYPE_B); + return brw_imm_w(value.i8); + } + case 2: + return retype(brw_imm_uw(value.u16), type); + case 4: + return retype(brw_imm_ud(value.u32), type); + case 8: + if (type == BRW_REGISTER_TYPE_DF) + return setup_imm_df(bld, value.f64); + else + return retype(brw_imm_u64(value.u64), type); + default: + unreachable("Invalid type size"); + } +} + +static opcode +brw_op_for_nir_reduction_op(nir_op op) +{ + switch (op) { + case nir_op_iadd: return BRW_OPCODE_ADD; + case nir_op_fadd: return BRW_OPCODE_ADD; + case nir_op_imul: return BRW_OPCODE_MUL; + case nir_op_fmul: return BRW_OPCODE_MUL; + case nir_op_imin: return BRW_OPCODE_SEL; + case nir_op_umin: return BRW_OPCODE_SEL; + case nir_op_fmin: return BRW_OPCODE_SEL; + case nir_op_imax: return BRW_OPCODE_SEL; + case nir_op_umax: return BRW_OPCODE_SEL; + case nir_op_fmax: return BRW_OPCODE_SEL; + case nir_op_iand: return BRW_OPCODE_AND; + case nir_op_ior: return BRW_OPCODE_OR; + case nir_op_ixor: return BRW_OPCODE_XOR; + default: + unreachable("Invalid reduction operation"); + } +} + +static brw_conditional_mod +brw_cond_mod_for_nir_reduction_op(nir_op op) +{ + switch (op) { + case nir_op_iadd: return BRW_CONDITIONAL_NONE; + case nir_op_fadd: return BRW_CONDITIONAL_NONE; + case nir_op_imul: return BRW_CONDITIONAL_NONE; + case nir_op_fmul: return BRW_CONDITIONAL_NONE; + case nir_op_imin: return BRW_CONDITIONAL_L; + case nir_op_umin: return BRW_CONDITIONAL_L; + case nir_op_fmin: return BRW_CONDITIONAL_L; + case nir_op_imax: return BRW_CONDITIONAL_GE; + case nir_op_umax: return BRW_CONDITIONAL_GE; + case nir_op_fmax: return BRW_CONDITIONAL_GE; + case nir_op_iand: return BRW_CONDITIONAL_NONE; + case nir_op_ior: return BRW_CONDITIONAL_NONE; + case nir_op_ixor: return 
BRW_CONDITIONAL_NONE; + default: + unreachable("Invalid reduction operation"); + } +} + +struct rebuild_resource { + unsigned idx; + std::vector array; +}; + +static bool +add_rebuild_src(nir_src *src, void *state) +{ + struct rebuild_resource *res = (struct rebuild_resource *) state; + + for (nir_def *def : res->array) { + if (def == src->ssa) + return true; + } + + nir_foreach_src(src->ssa->parent_instr, add_rebuild_src, state); + res->array.push_back(src->ssa); + return true; +} + +static fs_reg +try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def *resource_def) +{ + /* Create a build at the location of the resource_intel intrinsic */ + fs_builder ubld8 = bld.exec_all().group(8, 0); + + struct rebuild_resource resources = {}; + resources.idx = 0; + + if (!nir_foreach_src(resource_def->parent_instr, + add_rebuild_src, &resources)) + return fs_reg(); + resources.array.push_back(resource_def); + + if (resources.array.size() == 1) { + nir_def *def = resources.array[0]; + + if (def->parent_instr->type == nir_instr_type_load_const) { + nir_load_const_instr *load_const = + nir_instr_as_load_const(def->parent_instr); + return brw_imm_ud(load_const->value[0].i32); + } else { + assert(def->parent_instr->type == nir_instr_type_intrinsic && + (nir_instr_as_intrinsic(def->parent_instr)->intrinsic == + nir_intrinsic_load_uniform)); + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr); + unsigned base_offset = nir_intrinsic_base(intrin); + unsigned load_offset = nir_src_as_uint(intrin->src[0]); + fs_reg src(UNIFORM, base_offset / 4, BRW_REGISTER_TYPE_UD); + src.offset = load_offset + base_offset % 4; + return src; + } + } + + for (unsigned i = 0; i < resources.array.size(); i++) { + nir_def *def = resources.array[i]; + + nir_instr *instr = def->parent_instr; + switch (instr->type) { + case nir_instr_type_load_const: { + nir_load_const_instr *load_const = + nir_instr_as_load_const(instr); + fs_reg dst = 
ubld8.vgrf(BRW_REGISTER_TYPE_UD); + ntb.resource_insts[def->index] = + ubld8.MOV(dst, brw_imm_ud(load_const->value[0].i32)); + break; + } + + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(instr); + + if (nir_op_infos[alu->op].num_inputs == 2) { + if (alu->src[0].swizzle[0] != 0 || + alu->src[1].swizzle[0] != 0) + break; + } else if (nir_op_infos[alu->op].num_inputs == 3) { + if (alu->src[0].swizzle[0] != 0 || + alu->src[1].swizzle[0] != 0 || + alu->src[2].swizzle[0] != 0) + break; + } else { + /* Not supported ALU input count */ + break; + } + + switch (alu->op) { + case nir_op_iadd: { + fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst; + fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst; + assert(src0.file != BAD_FILE && src1.file != BAD_FILE); + assert(src0.type == BRW_REGISTER_TYPE_UD); + ntb.resource_insts[def->index] = + ubld8.ADD(dst, + src0.file != IMM ? src0 : src1, + src0.file != IMM ? src1 : src0); + break; + } + case nir_op_iadd3: { + fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst; + fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst; + fs_reg src2 = ntb.resource_insts[alu->src[2].src.ssa->index]->dst; + assert(src0.file != BAD_FILE && src1.file != BAD_FILE && src2.file != BAD_FILE); + assert(src0.type == BRW_REGISTER_TYPE_UD); + ntb.resource_insts[def->index] = + ubld8.ADD3(dst, + src1.file == IMM ? src1 : src0, + src1.file == IMM ? 
src0 : src1, + src2); + break; + } + case nir_op_ushr: { + fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst; + fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst; + assert(src0.file != BAD_FILE && src1.file != BAD_FILE); + assert(src0.type == BRW_REGISTER_TYPE_UD); + ntb.resource_insts[def->index] = ubld8.SHR(dst, src0, src1); + break; + } + case nir_op_ishl: { + fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst; + fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst; + assert(src0.file != BAD_FILE && src1.file != BAD_FILE); + assert(src0.type == BRW_REGISTER_TYPE_UD); + ntb.resource_insts[def->index] = ubld8.SHL(dst, src0, src1); + break; + } + case nir_op_mov: { + break; + } + default: + break; + } + break; + } + + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_resource_intel: + ntb.resource_insts[def->index] = + ntb.resource_insts[intrin->src[1].ssa->index]; + break; + + case nir_intrinsic_load_uniform: { + if (!nir_src_is_const(intrin->src[0])) + break; + + unsigned base_offset = nir_intrinsic_base(intrin); + unsigned load_offset = nir_src_as_uint(intrin->src[0]); + fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg src(UNIFORM, base_offset / 4, BRW_REGISTER_TYPE_UD); + src.offset = load_offset + base_offset % 4; + ntb.resource_insts[def->index] = ubld8.MOV(dst, src); + break; + } + + default: + break; + } + break; + } + + default: + break; + } + + if (ntb.resource_insts[def->index] == NULL) + return fs_reg(); + } + + assert(ntb.resource_insts[resource_def->index] != NULL); + return component(ntb.resource_insts[resource_def->index]->dst, 0); +} + +static fs_reg +get_nir_image_intrinsic_image(nir_to_brw_state &ntb, const brw::fs_builder &bld, + nir_intrinsic_instr *instr) +{ + if 
(is_resource_src(instr->src[0])) { + fs_reg surf_index = get_resource_nir_src(ntb, instr->src[0]); + if (surf_index.file != BAD_FILE) + return surf_index; + } + + fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), BRW_REGISTER_TYPE_UD); + fs_reg surf_index = image; + + return bld.emit_uniformize(surf_index); +} + +static fs_reg +get_nir_buffer_intrinsic_index(nir_to_brw_state &ntb, const brw::fs_builder &bld, + nir_intrinsic_instr *instr) +{ + /* SSBO stores are weird in that their index is in src[1] */ + const bool is_store = + instr->intrinsic == nir_intrinsic_store_ssbo || + instr->intrinsic == nir_intrinsic_store_ssbo_block_intel; + nir_src src = is_store ? instr->src[1] : instr->src[0]; + + if (nir_src_is_const(src)) { + return brw_imm_ud(nir_src_as_uint(src)); + } else if (is_resource_src(src)) { + fs_reg surf_index = get_resource_nir_src(ntb, src); + if (surf_index.file != BAD_FILE) + return surf_index; + } + return bld.emit_uniformize(get_nir_src(ntb, src)); +} + +/** + * The offsets we get from NIR act as if each SIMD channel has it's own blob + * of contiguous space. However, if we actually place each SIMD channel in + * it's own space, we end up with terrible cache performance because each SIMD + * channel accesses a different cache line even when they're all accessing the + * same byte offset. To deal with this problem, we swizzle the address using + * a simple algorithm which ensures that any time a SIMD message reads or + * writes the same address, it's all in the same cache line. We have to keep + * the bottom two bits fixed so that we can read/write up to a dword at a time + * and the individual element is contiguous. 
We do this by splitting the + * address as follows: + * + * 31 4-6 2 0 + * +-------------------------------+------------+----------+ + * | Hi address bits | chan index | addr low | + * +-------------------------------+------------+----------+ + * + * In other words, the bottom two address bits stay, and the top 30 get + * shifted up so that we can stick the SIMD channel index in the middle. This + * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit + * at the same logical offset, the scratch read/write instruction acts on + * continuous elements and we get good cache locality. + */ +static fs_reg +swizzle_nir_scratch_addr(nir_to_brw_state &ntb, + const brw::fs_builder &bld, + const fs_reg &nir_addr, + bool in_dwords) +{ + fs_visitor &s = ntb.s; + + const fs_reg &chan_index = + ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; + const unsigned chan_index_bits = ffs(s.dispatch_width) - 1; + + fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD); + if (in_dwords) { + /* In this case, we know the address is aligned to a DWORD and we want + * the final address in DWORDs. + */ + bld.SHL(addr, nir_addr, brw_imm_ud(chan_index_bits - 2)); + bld.OR(addr, addr, chan_index); + } else { + /* This case substantially more annoying because we have to pay + * attention to those pesky two bottom bits. 
+ */ + fs_reg addr_hi = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(addr_hi, nir_addr, brw_imm_ud(~0x3u)); + bld.SHL(addr_hi, addr_hi, brw_imm_ud(chan_index_bits)); + fs_reg chan_addr = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.SHL(chan_addr, chan_index, brw_imm_ud(2)); + bld.AND(addr, nir_addr, brw_imm_ud(0x3u)); + bld.OR(addr, addr, addr_hi); + bld.OR(addr, addr, chan_addr); + } + return addr; +} + +static unsigned +choose_oword_block_size_dwords(const struct intel_device_info *devinfo, + unsigned dwords) +{ + unsigned block; + if (devinfo->has_lsc && dwords >= 64) { + block = 64; + } else if (dwords >= 32) { + block = 32; + } else if (dwords >= 16) { + block = 16; + } else { + block = 8; + } + assert(block <= dwords); + return block; +} + +static void +increment_a64_address(const fs_builder &bld, fs_reg address, uint32_t v) +{ + if (bld.shader->devinfo->has_64bit_int) { + bld.ADD(address, address, brw_imm_ud(v)); + } else { + fs_reg low = retype(address, BRW_REGISTER_TYPE_UD); + fs_reg high = offset(low, bld, 1); + + /* Add low and if that overflows, add carry to high. 
*/ + bld.ADD(low, low, brw_imm_ud(v))->conditional_mod = BRW_CONDITIONAL_O; + bld.ADD(high, high, brw_imm_ud(0x1))->predicate = BRW_PREDICATE_NORMAL; + } +} + +static fs_reg +emit_fence(const fs_builder &bld, enum opcode opcode, + uint8_t sfid, uint32_t desc, + bool commit_enable, uint8_t bti) +{ + assert(opcode == SHADER_OPCODE_INTERLOCK || + opcode == SHADER_OPCODE_MEMORY_FENCE); + + fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD); + fs_inst *fence = bld.emit(opcode, dst, brw_vec8_grf(0, 0), + brw_imm_ud(commit_enable), + brw_imm_ud(bti)); + fence->sfid = sfid; + fence->desc = desc; + + return dst; +} + +static uint32_t +lsc_fence_descriptor_for_intrinsic(const struct intel_device_info *devinfo, + nir_intrinsic_instr *instr) +{ + assert(devinfo->has_lsc); + + enum lsc_fence_scope scope = LSC_FENCE_LOCAL; + enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE; + + if (nir_intrinsic_has_memory_scope(instr)) { + switch (nir_intrinsic_memory_scope(instr)) { + case SCOPE_DEVICE: + case SCOPE_QUEUE_FAMILY: + scope = LSC_FENCE_TILE; + flush_type = LSC_FLUSH_TYPE_EVICT; + break; + case SCOPE_WORKGROUP: + scope = LSC_FENCE_THREADGROUP; + break; + case SCOPE_SHADER_CALL: + case SCOPE_INVOCATION: + case SCOPE_SUBGROUP: + case SCOPE_NONE: + break; + } + } else { + /* No scope defined. */ + scope = LSC_FENCE_TILE; + flush_type = LSC_FLUSH_TYPE_EVICT; + } + return lsc_fence_msg_desc(devinfo, scope, flush_type, true); +} + +/** + * Create a MOV to read the timestamp register. + */ +static fs_reg +get_timestamp(const fs_builder &bld) +{ + fs_visitor &s = *bld.shader; + const intel_device_info *devinfo = s.devinfo; + + assert(devinfo->ver >= 7); + + fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_TIMESTAMP, + 0), + BRW_REGISTER_TYPE_UD)); + + fs_reg dst = fs_reg(VGRF, s.alloc.allocate(1), BRW_REGISTER_TYPE_UD); + + /* We want to read the 3 fields we care about even if it's not enabled in + * the dispatch. 
+ */ + bld.group(4, 0).exec_all().MOV(dst, ts); + + return dst; +} + +static unsigned +component_from_intrinsic(nir_intrinsic_instr *instr) +{ + if (nir_intrinsic_has_component(instr)) + return nir_intrinsic_component(instr); + else + return 0; +} + +static void +adjust_handle_and_offset(const fs_builder &bld, + fs_reg &urb_handle, + unsigned &urb_global_offset) +{ + /* Make sure that URB global offset is below 2048 (2^11), because + * that's the maximum possible value encoded in Message Descriptor. + */ + unsigned adjustment = (urb_global_offset >> 11) << 11; + + if (adjustment) { + fs_builder ubld8 = bld.group(8, 0).exec_all(); + /* Allocate new register to not overwrite the shared URB handle. */ + fs_reg new_handle = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + ubld8.ADD(new_handle, urb_handle, brw_imm_ud(adjustment)); + urb_handle = new_handle; + urb_global_offset -= adjustment; + } +} + +static void +emit_urb_direct_vec4_write(const fs_builder &bld, + unsigned urb_global_offset, + const fs_reg &src, + fs_reg urb_handle, + unsigned dst_comp_offset, + unsigned comps, + unsigned mask) +{ + for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { + fs_builder bld8 = bld.group(8, q); + + fs_reg payload_srcs[8]; + unsigned length = 0; + + for (unsigned i = 0; i < dst_comp_offset; i++) + payload_srcs[length++] = reg_undef; + + for (unsigned c = 0; c < comps; c++) + payload_srcs[length++] = quarter(offset(src, bld, c), q); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16); + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length), + BRW_REGISTER_TYPE_F); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length); + bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0); + + fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + inst->offset = urb_global_offset; + assert(inst->offset < 2048); + } +} + 
+static void +emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &src, fs_reg urb_handle) +{ + assert(nir_src_bit_size(instr->src[0]) == 32); + + nir_src *offset_nir_src = nir_get_io_offset_src(instr); + assert(nir_src_is_const(*offset_nir_src)); + + const unsigned comps = nir_src_num_components(instr->src[0]); + assert(comps <= 4); + + const unsigned offset_in_dwords = nir_intrinsic_base(instr) + + nir_src_as_uint(*offset_nir_src) + + component_from_intrinsic(instr); + + /* URB writes are vec4 aligned but the intrinsic offsets are in dwords. + * We can write up to 8 dwords, so single vec4 write is enough. + */ + const unsigned comp_shift = offset_in_dwords % 4; + const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift; + + unsigned urb_global_offset = offset_in_dwords / 4; + adjust_handle_and_offset(bld, urb_handle, urb_global_offset); + + emit_urb_direct_vec4_write(bld, urb_global_offset, src, urb_handle, + comp_shift, comps, mask); +} + +static void +emit_urb_direct_vec4_write_xe2(const fs_builder &bld, + unsigned offset_in_bytes, + const fs_reg &src, + fs_reg urb_handle, + unsigned comps, + unsigned mask) +{ + const struct intel_device_info *devinfo = bld.shader->devinfo; + const unsigned runit = reg_unit(devinfo); + const unsigned write_size = 8 * runit; + + if (offset_in_bytes > 0) { + fs_builder bldall = bld.group(write_size, 0).exec_all(); + fs_reg new_handle = bldall.vgrf(BRW_REGISTER_TYPE_UD); + bldall.ADD(new_handle, urb_handle, brw_imm_ud(offset_in_bytes)); + urb_handle = new_handle; + } + + for (unsigned q = 0; q < bld.dispatch_width() / write_size; q++) { + fs_builder hbld = bld.group(write_size, q); + + fs_reg payload_srcs[comps]; + + for (unsigned c = 0; c < comps; c++) + payload_srcs[c] = horiz_offset(offset(src, bld, c), write_size * q); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16); + int nr = 
bld.shader->alloc.allocate(comps * runit); + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, nr, BRW_REGISTER_TYPE_F); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(comps); + hbld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, comps, 0); + + hbld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + } +} + +static void +emit_urb_direct_writes_xe2(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &src, fs_reg urb_handle) +{ + assert(nir_src_bit_size(instr->src[0]) == 32); + + nir_src *offset_nir_src = nir_get_io_offset_src(instr); + assert(nir_src_is_const(*offset_nir_src)); + + const unsigned comps = nir_src_num_components(instr->src[0]); + assert(comps <= 4); + + const unsigned offset_in_dwords = nir_intrinsic_base(instr) + + nir_src_as_uint(*offset_nir_src) + + component_from_intrinsic(instr); + + const unsigned mask = nir_intrinsic_write_mask(instr); + + emit_urb_direct_vec4_write_xe2(bld, offset_in_dwords * 4, src, + urb_handle, comps, mask); +} + +static void +emit_urb_indirect_vec4_write(const fs_builder &bld, + const fs_reg &offset_src, + unsigned base, + const fs_reg &src, + fs_reg urb_handle, + unsigned dst_comp_offset, + unsigned comps, + unsigned mask) +{ + for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { + fs_builder bld8 = bld.group(8, q); + + /* offset is always positive, so signedness doesn't matter */ + assert(offset_src.type == BRW_REGISTER_TYPE_D || + offset_src.type == BRW_REGISTER_TYPE_UD); + fs_reg off = bld8.vgrf(offset_src.type, 1); + bld8.MOV(off, quarter(offset_src, q)); + bld8.ADD(off, off, brw_imm_ud(base)); + bld8.SHR(off, off, brw_imm_ud(2)); + + fs_reg payload_srcs[8]; + unsigned length = 0; + + for (unsigned i = 0; i < dst_comp_offset; i++) + payload_srcs[length++] = reg_undef; + + for (unsigned c = 0; c < comps; c++) + payload_srcs[length++] = quarter(offset(src, bld, c), q); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + 
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16); + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length), + BRW_REGISTER_TYPE_F); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length); + bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0); + + fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + inst->offset = 0; + } +} + +static void +emit_urb_indirect_writes_mod(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &src, const fs_reg &offset_src, + fs_reg urb_handle, unsigned mod) +{ + assert(nir_src_bit_size(instr->src[0]) == 32); + + const unsigned comps = nir_src_num_components(instr->src[0]); + assert(comps <= 4); + + const unsigned base_in_dwords = nir_intrinsic_base(instr) + + component_from_intrinsic(instr); + + const unsigned comp_shift = mod; + const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift; + + emit_urb_indirect_vec4_write(bld, offset_src, base_in_dwords, src, + urb_handle, comp_shift, comps, mask); +} + +static void +emit_urb_indirect_writes_xe2(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &src, const fs_reg &offset_src, + fs_reg urb_handle) +{ + assert(nir_src_bit_size(instr->src[0]) == 32); + + const struct intel_device_info *devinfo = bld.shader->devinfo; + const unsigned runit = reg_unit(devinfo); + const unsigned write_size = 8 * runit; + + const unsigned comps = nir_src_num_components(instr->src[0]); + assert(comps <= 4); + + const unsigned base_in_dwords = nir_intrinsic_base(instr) + + component_from_intrinsic(instr); + + if (base_in_dwords > 0) { + fs_builder bldall = bld.group(write_size, 0).exec_all(); + fs_reg new_handle = bldall.vgrf(BRW_REGISTER_TYPE_UD); + bldall.ADD(new_handle, urb_handle, brw_imm_ud(base_in_dwords * 4)); + urb_handle = new_handle; + } + + const unsigned mask = nir_intrinsic_write_mask(instr); + + for (unsigned q = 0; q 
< bld.dispatch_width() / write_size; q++) { + fs_builder wbld = bld.group(write_size, q); + + fs_reg payload_srcs[comps]; + + for (unsigned c = 0; c < comps; c++) + payload_srcs[c] = horiz_offset(offset(src, bld, c), write_size * q); + + fs_reg addr = wbld.vgrf(BRW_REGISTER_TYPE_UD); + wbld.SHL(addr, horiz_offset(offset_src, write_size * q), brw_imm_ud(2)); + wbld.ADD(addr, addr, urb_handle); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = addr; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16); + int nr = bld.shader->alloc.allocate(comps * runit); + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, nr, BRW_REGISTER_TYPE_F); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(comps); + wbld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, comps, 0); + + wbld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + } +} + +static void +emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &src, const fs_reg &offset_src, + fs_reg urb_handle) +{ + assert(nir_src_bit_size(instr->src[0]) == 32); + + const unsigned comps = nir_src_num_components(instr->src[0]); + assert(comps <= 4); + + const unsigned base_in_dwords = nir_intrinsic_base(instr) + + component_from_intrinsic(instr); + + /* Use URB write messages that allow different offsets per-slot. The offset + * is in units of vec4s (128 bits), so we use a write for each component, + * replicating it in the sources and applying the appropriate mask based on + * the dword offset. 
+ */ + + for (unsigned c = 0; c < comps; c++) { + if (((1 << c) & nir_intrinsic_write_mask(instr)) == 0) + continue; + + fs_reg src_comp = offset(src, bld, c); + + for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { + fs_builder bld8 = bld.group(8, q); + + /* offset is always positive, so signedness doesn't matter */ + assert(offset_src.type == BRW_REGISTER_TYPE_D || + offset_src.type == BRW_REGISTER_TYPE_UD); + fs_reg off = bld8.vgrf(offset_src.type, 1); + bld8.MOV(off, quarter(offset_src, q)); + bld8.ADD(off, off, brw_imm_ud(c + base_in_dwords)); + + fs_reg mask = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld8.AND(mask, off, brw_imm_ud(0x3)); + + fs_reg one = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld8.MOV(one, brw_imm_ud(1)); + bld8.SHL(mask, one, mask); + bld8.SHL(mask, mask, brw_imm_ud(16)); + + bld8.SHR(off, off, brw_imm_ud(2)); + + fs_reg payload_srcs[4]; + unsigned length = 0; + + for (unsigned j = 0; j < 4; j++) + payload_srcs[length++] = quarter(src_comp, q); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask; + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length), + BRW_REGISTER_TYPE_F); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length); + bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0); + + fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + inst->offset = 0; + } + } +} + +static void +emit_urb_direct_reads(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &dest, fs_reg urb_handle) +{ + assert(instr->def.bit_size == 32); + + unsigned comps = instr->def.num_components; + if (comps == 0) + return; + + nir_src *offset_nir_src = nir_get_io_offset_src(instr); + assert(nir_src_is_const(*offset_nir_src)); + + const unsigned offset_in_dwords = nir_intrinsic_base(instr) + + nir_src_as_uint(*offset_nir_src) + + 
component_from_intrinsic(instr); + + unsigned urb_global_offset = offset_in_dwords / 4; + adjust_handle_and_offset(bld, urb_handle, urb_global_offset); + + const unsigned comp_offset = offset_in_dwords % 4; + const unsigned num_regs = comp_offset + comps; + + fs_builder ubld8 = bld.group(8, 0).exec_all(); + fs_reg data = ubld8.vgrf(BRW_REGISTER_TYPE_UD, num_regs); + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + + fs_inst *inst = ubld8.emit(SHADER_OPCODE_URB_READ_LOGICAL, data, + srcs, ARRAY_SIZE(srcs)); + inst->offset = urb_global_offset; + assert(inst->offset < 2048); + inst->size_written = num_regs * REG_SIZE; + + for (unsigned c = 0; c < comps; c++) { + fs_reg dest_comp = offset(dest, bld, c); + fs_reg data_comp = horiz_stride(offset(data, ubld8, comp_offset + c), 0); + bld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp); + } +} + +static void +emit_urb_direct_reads_xe2(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &dest, fs_reg urb_handle) +{ + assert(instr->def.bit_size == 32); + + unsigned comps = instr->def.num_components; + if (comps == 0) + return; + + nir_src *offset_nir_src = nir_get_io_offset_src(instr); + assert(nir_src_is_const(*offset_nir_src)); + + fs_builder ubld16 = bld.group(16, 0).exec_all(); + + const unsigned offset_in_dwords = nir_intrinsic_base(instr) + + nir_src_as_uint(*offset_nir_src) + + component_from_intrinsic(instr); + + if (offset_in_dwords > 0) { + fs_reg new_handle = ubld16.vgrf(BRW_REGISTER_TYPE_UD); + ubld16.ADD(new_handle, urb_handle, brw_imm_ud(offset_in_dwords * 4)); + urb_handle = new_handle; + } + + fs_reg data = ubld16.vgrf(BRW_REGISTER_TYPE_UD, comps); + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + + fs_inst *inst = ubld16.emit(SHADER_OPCODE_URB_READ_LOGICAL, + data, srcs, ARRAY_SIZE(srcs)); + inst->size_written = 2 * comps * REG_SIZE; + + for (unsigned c = 0; c < comps; c++) { + fs_reg dest_comp = offset(dest, bld, c); + 
fs_reg data_comp = horiz_stride(offset(data, ubld16, c), 0); + bld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp); + } +} + +static void +emit_urb_indirect_reads(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &dest, const fs_reg &offset_src, fs_reg urb_handle) +{ + assert(instr->def.bit_size == 32); + + unsigned comps = instr->def.num_components; + if (comps == 0) + return; + + fs_reg seq_ud; + { + fs_builder ubld8 = bld.group(8, 0).exec_all(); + seq_ud = ubld8.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg seq_uw = ubld8.vgrf(BRW_REGISTER_TYPE_UW, 1); + ubld8.MOV(seq_uw, fs_reg(brw_imm_v(0x76543210))); + ubld8.MOV(seq_ud, seq_uw); + ubld8.SHL(seq_ud, seq_ud, brw_imm_ud(2)); + } + + const unsigned base_in_dwords = nir_intrinsic_base(instr) + + component_from_intrinsic(instr); + + for (unsigned c = 0; c < comps; c++) { + for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { + fs_builder bld8 = bld.group(8, q); + + /* offset is always positive, so signedness doesn't matter */ + assert(offset_src.type == BRW_REGISTER_TYPE_D || + offset_src.type == BRW_REGISTER_TYPE_UD); + fs_reg off = bld8.vgrf(offset_src.type, 1); + bld8.MOV(off, quarter(offset_src, q)); + bld8.ADD(off, off, brw_imm_ud(base_in_dwords + c)); + + STATIC_ASSERT(IS_POT(REG_SIZE) && REG_SIZE > 1); + + fs_reg comp = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld8.AND(comp, off, brw_imm_ud(0x3)); + bld8.SHL(comp, comp, brw_imm_ud(ffs(REG_SIZE) - 1)); + bld8.ADD(comp, comp, seq_ud); + + bld8.SHR(off, off, brw_imm_ud(2)); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off; + + fs_reg data = bld8.vgrf(BRW_REGISTER_TYPE_UD, 4); + + fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_READ_LOGICAL, + data, srcs, ARRAY_SIZE(srcs)); + inst->offset = 0; + inst->size_written = 4 * REG_SIZE; + + fs_reg dest_comp = offset(dest, bld, c); + bld8.emit(SHADER_OPCODE_MOV_INDIRECT, + retype(quarter(dest_comp, q), 
BRW_REGISTER_TYPE_UD), + data, + comp, + brw_imm_ud(4 * REG_SIZE)); + } + } +} + +static void +emit_urb_indirect_reads_xe2(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &dest, const fs_reg &offset_src, + fs_reg urb_handle) +{ + assert(instr->def.bit_size == 32); + + unsigned comps = instr->def.num_components; + if (comps == 0) + return; + + fs_builder ubld16 = bld.group(16, 0).exec_all(); + + const unsigned offset_in_dwords = nir_intrinsic_base(instr) + + component_from_intrinsic(instr); + + if (offset_in_dwords > 0) { + fs_reg new_handle = ubld16.vgrf(BRW_REGISTER_TYPE_UD); + ubld16.ADD(new_handle, urb_handle, brw_imm_ud(offset_in_dwords * 4)); + urb_handle = new_handle; + } + + fs_reg data = ubld16.vgrf(BRW_REGISTER_TYPE_UD, comps); + + + for (unsigned q = 0; q < bld.dispatch_width() / 16; q++) { + fs_builder wbld = bld.group(16, q); + + fs_reg addr = wbld.vgrf(BRW_REGISTER_TYPE_UD); + wbld.SHL(addr, horiz_offset(offset_src, 16 * q), brw_imm_ud(2)); + wbld.ADD(addr, addr, urb_handle); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = addr; + + fs_inst *inst = wbld.emit(SHADER_OPCODE_URB_READ_LOGICAL, + data, srcs, ARRAY_SIZE(srcs)); + inst->size_written = 2 * comps * REG_SIZE; + + for (unsigned c = 0; c < comps; c++) { + fs_reg dest_comp = horiz_offset(offset(dest, bld, c), 16 * q); + fs_reg data_comp = offset(data, wbld, c); + wbld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp); + } + } +} + +static void +emit_task_mesh_store(nir_to_brw_state &ntb, + const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &urb_handle) +{ + fs_reg src = get_nir_src(ntb, instr->src[0]); + nir_src *offset_nir_src = nir_get_io_offset_src(instr); + + if (nir_src_is_const(*offset_nir_src)) { + if (bld.shader->devinfo->ver >= 20) + emit_urb_direct_writes_xe2(bld, instr, src, urb_handle); + else + emit_urb_direct_writes(bld, instr, src, urb_handle); + } else { + if (bld.shader->devinfo->ver >= 20) { + 
emit_urb_indirect_writes_xe2(bld, instr, src, get_nir_src(ntb, *offset_nir_src), urb_handle); + return; + } + bool use_mod = false; + unsigned mod; + + /* Try to calculate the value of (offset + base) % 4. If we can do + * this, then we can do indirect writes using only 1 URB write. + */ + use_mod = nir_mod_analysis(nir_get_scalar(offset_nir_src->ssa, 0), nir_type_uint, 4, &mod); + if (use_mod) { + mod += nir_intrinsic_base(instr) + component_from_intrinsic(instr); + mod %= 4; + } + + if (use_mod) { + emit_urb_indirect_writes_mod(bld, instr, src, get_nir_src(ntb, *offset_nir_src), urb_handle, mod); + } else { + emit_urb_indirect_writes(bld, instr, src, get_nir_src(ntb, *offset_nir_src), urb_handle); + } + } +} + +static void +emit_task_mesh_load(nir_to_brw_state &ntb, + const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &urb_handle) +{ + fs_reg dest = get_nir_def(ntb, instr->def); + nir_src *offset_nir_src = nir_get_io_offset_src(instr); + + /* TODO(mesh): for per_vertex and per_primitive, if we could keep around + * the non-array-index offset, we could use it to decide if we can perform + * a single large aligned read instead of one per component. 
+ */ + + if (nir_src_is_const(*offset_nir_src)) { + if (bld.shader->devinfo->ver >= 20) + emit_urb_direct_reads_xe2(bld, instr, dest, urb_handle); + else + emit_urb_direct_reads(bld, instr, dest, urb_handle); + } else { + if (bld.shader->devinfo->ver >= 20) + emit_urb_indirect_reads_xe2(bld, instr, dest, get_nir_src(ntb, *offset_nir_src), urb_handle); + else + emit_urb_indirect_reads(bld, instr, dest, get_nir_src(ntb, *offset_nir_src), urb_handle); + } +} + +static void +fs_nir_emit_task_mesh_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + fs_visitor &s = ntb.s; + + assert(s.stage == MESA_SHADER_MESH || s.stage == MESA_SHADER_TASK); + const task_mesh_thread_payload &payload = s.task_mesh_payload(); + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_def(ntb, instr->def); + + switch (instr->intrinsic) { + case nir_intrinsic_load_mesh_inline_data_intel: { + fs_reg data = offset(payload.inline_parameter, 1, nir_intrinsic_align_offset(instr)); + bld.MOV(dest, retype(data, dest.type)); + break; + } + + case nir_intrinsic_load_draw_id: + dest = retype(dest, BRW_REGISTER_TYPE_UD); + bld.MOV(dest, payload.extended_parameter_0); + break; + + case nir_intrinsic_load_local_invocation_id: + unreachable("local invocation id should have been lowered earlier"); + break; + + case nir_intrinsic_load_local_invocation_index: + dest = retype(dest, BRW_REGISTER_TYPE_UD); + bld.MOV(dest, payload.local_index); + break; + + case nir_intrinsic_load_num_workgroups: + dest = retype(dest, BRW_REGISTER_TYPE_UD); + bld.MOV(offset(dest, bld, 0), brw_uw1_grf(0, 13)); /* g0.6 >> 16 */ + bld.MOV(offset(dest, bld, 1), brw_uw1_grf(0, 8)); /* g0.4 & 0xffff */ + bld.MOV(offset(dest, bld, 2), brw_uw1_grf(0, 9)); /* g0.4 >> 16 */ + break; + + case nir_intrinsic_load_workgroup_index: + dest = retype(dest, BRW_REGISTER_TYPE_UD); + bld.MOV(dest, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD)); + break; + + default: + 
fs_nir_emit_cs_intrinsic(ntb, instr); + break; + } +} + +static void +fs_nir_emit_task_intrinsic(nir_to_brw_state &ntb, + nir_intrinsic_instr *instr) +{ + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + assert(s.stage == MESA_SHADER_TASK); + const task_mesh_thread_payload &payload = s.task_mesh_payload(); + + switch (instr->intrinsic) { + case nir_intrinsic_store_output: + case nir_intrinsic_store_task_payload: + emit_task_mesh_store(ntb, bld, instr, payload.urb_output); + break; + + case nir_intrinsic_load_output: + case nir_intrinsic_load_task_payload: + emit_task_mesh_load(ntb, bld, instr, payload.urb_output); + break; + + default: + fs_nir_emit_task_mesh_intrinsic(ntb, bld, instr); + break; + } +} + +static void +fs_nir_emit_mesh_intrinsic(nir_to_brw_state &ntb, + nir_intrinsic_instr *instr) +{ + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + assert(s.stage == MESA_SHADER_MESH); + const task_mesh_thread_payload &payload = s.task_mesh_payload(); + + switch (instr->intrinsic) { + case nir_intrinsic_store_per_primitive_output: + case nir_intrinsic_store_per_vertex_output: + case nir_intrinsic_store_output: + emit_task_mesh_store(ntb, bld, instr, payload.urb_output); + break; + + case nir_intrinsic_load_per_vertex_output: + case nir_intrinsic_load_per_primitive_output: + case nir_intrinsic_load_output: + emit_task_mesh_load(ntb, bld, instr, payload.urb_output); + break; + + case nir_intrinsic_load_task_payload: + emit_task_mesh_load(ntb, bld, instr, payload.task_urb_input); + break; + + default: + fs_nir_emit_task_mesh_intrinsic(ntb, bld, instr); + break; + } +} + +static void +fs_nir_emit_intrinsic(nir_to_brw_state &ntb, + const fs_builder &bld, nir_intrinsic_instr *instr) +{ + const intel_device_info *devinfo = ntb.devinfo; + fs_visitor &s = ntb.s; + + /* We handle this as a special case */ + if (instr->intrinsic == nir_intrinsic_decl_reg) { + assert(nir_intrinsic_num_array_elems(instr) == 0); + unsigned bit_size = 
nir_intrinsic_bit_size(instr); + unsigned num_components = nir_intrinsic_num_components(instr); + const brw_reg_type reg_type = + brw_reg_type_from_bit_size(bit_size, bit_size == 8 ? + BRW_REGISTER_TYPE_D : + BRW_REGISTER_TYPE_F); + + /* Re-use the destination's slot in the table for the register */ + ntb.ssa_values[instr->def.index] = + bld.vgrf(reg_type, num_components); + return; + } + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_def(ntb, instr->def); + + switch (instr->intrinsic) { + case nir_intrinsic_resource_intel: + ntb.ssa_bind_infos[instr->def.index].valid = true; + ntb.ssa_bind_infos[instr->def.index].bindless = + (nir_intrinsic_resource_access_intel(instr) & + nir_resource_intel_bindless) != 0; + ntb.ssa_bind_infos[instr->def.index].block = + nir_intrinsic_resource_block_intel(instr); + ntb.ssa_bind_infos[instr->def.index].set = + nir_intrinsic_desc_set(instr); + ntb.ssa_bind_infos[instr->def.index].binding = + nir_intrinsic_binding(instr); + + if (nir_intrinsic_resource_access_intel(instr) & + nir_resource_intel_non_uniform) { + ntb.resource_values[instr->def.index] = fs_reg(); + } else { + ntb.resource_values[instr->def.index] = + try_rebuild_resource(ntb, bld, instr->src[1].ssa); + } + ntb.ssa_values[instr->def.index] = + ntb.ssa_values[instr->src[1].ssa->index]; + break; + + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + /* Nothing to do with these. */ + break; + + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + case nir_intrinsic_bindless_image_load: + case nir_intrinsic_bindless_image_store: + case nir_intrinsic_bindless_image_atomic: + case nir_intrinsic_bindless_image_atomic_swap: { + /* Get some metadata from the image intrinsic. 
*/ + const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; + + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + + switch (instr->intrinsic) { + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + srcs[SURFACE_LOGICAL_SRC_SURFACE] = + get_nir_image_intrinsic_image(ntb, bld, instr); + break; + + default: + /* Bindless */ + srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = + get_nir_image_intrinsic_image(ntb, bld, instr); + break; + } + + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]); + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = + brw_imm_ud(nir_image_intrinsic_coord_components(instr)); + + /* Emit an image load, store or atomic op. */ + if (instr->intrinsic == nir_intrinsic_image_load || + instr->intrinsic == nir_intrinsic_bindless_image_load) { + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + fs_inst *inst = + bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + inst->size_written = instr->num_components * s.dispatch_width * 4; + } else if (instr->intrinsic == nir_intrinsic_image_store || + instr->intrinsic == nir_intrinsic_bindless_image_store) { + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[3]); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); + bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } else { + unsigned num_srcs = info->num_srcs; + enum lsc_opcode op = lsc_aop_for_nir_intrinsic(instr); + if (op == LSC_OP_ATOMIC_INC || op == LSC_OP_ATOMIC_DEC) { + assert(num_srcs == 4); + num_srcs = 3; + } + + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); + + fs_reg data; + if (num_srcs >= 4) + data = get_nir_src(ntb, instr->src[3]); + if (num_srcs >= 5) { + fs_reg tmp = 
bld.vgrf(data.type, 2); + fs_reg sources[2] = { data, get_nir_src(ntb, instr->src[4]) }; + bld.LOAD_PAYLOAD(tmp, sources, 2, 0); + data = tmp; + } + srcs[SURFACE_LOGICAL_SRC_DATA] = data; + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); + + bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + } + break; + } + + case nir_intrinsic_image_size: + case nir_intrinsic_bindless_image_size: { + /* Cube image sizes should have previously been lowered to a 2D array */ + assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE); + + /* Unlike the [un]typed load and store opcodes, the TXS that this turns + * into will handle the binding table index for us in the generator. + * Incidentally, this means that we can handle bindless with exactly the + * same code. + */ + fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), + BRW_REGISTER_TYPE_UD); + image = bld.emit_uniformize(image); + + assert(nir_src_as_uint(instr->src[1]) == 0); + + fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; + if (instr->intrinsic == nir_intrinsic_image_size) + srcs[TEX_LOGICAL_SRC_SURFACE] = image; + else + srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image; + srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0); + srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0); + srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0); + srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0); + + /* Since the image size is always uniform, we can just emit a SIMD8 + * query instruction and splat the result out. 
+ */ + const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0); + + fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4); + fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL, + tmp, srcs, ARRAY_SIZE(srcs)); + inst->size_written = 4 * REG_SIZE * reg_unit(devinfo); + + for (unsigned c = 0; c < instr->def.num_components; ++c) { + bld.MOV(offset(retype(dest, tmp.type), bld, c), + component(offset(tmp, ubld, c), 0)); + } + break; + } + + case nir_intrinsic_image_load_raw_intel: { + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[SURFACE_LOGICAL_SRC_SURFACE] = + get_nir_image_intrinsic_image(ntb, bld, instr); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]); + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + + fs_inst *inst = + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + inst->size_written = instr->num_components * s.dispatch_width * 4; + break; + } + + case nir_intrinsic_image_store_raw_intel: { + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[SURFACE_LOGICAL_SRC_SURFACE] = + get_nir_image_intrinsic_image(ntb, bld, instr); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]); + srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[2]); + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); + + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + break; + } + + case nir_intrinsic_barrier: + case nir_intrinsic_begin_invocation_interlock: + case nir_intrinsic_end_invocation_interlock: { + bool ugm_fence, slm_fence, tgm_fence, urb_fence; + enum opcode opcode = BRW_OPCODE_NOP; + + /* Handling interlock intrinsics here will allow the logic for IVB 
+ * render cache (see below) to be reused. + */ + + switch (instr->intrinsic) { + case nir_intrinsic_barrier: { + /* Note we only care about the memory part of the + * barrier. The execution part will be taken care + * of by the stage specific intrinsic handler functions. + */ + nir_variable_mode modes = nir_intrinsic_memory_modes(instr); + ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global); + slm_fence = modes & nir_var_mem_shared; + tgm_fence = modes & nir_var_image; + urb_fence = modes & (nir_var_shader_out | nir_var_mem_task_payload); + if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE) + opcode = SHADER_OPCODE_MEMORY_FENCE; + break; + } + + case nir_intrinsic_begin_invocation_interlock: + /* For beginInvocationInterlockARB(), we will generate a memory fence + * but with a different opcode so that generator can pick SENDC + * instead of SEND. + */ + assert(s.stage == MESA_SHADER_FRAGMENT); + ugm_fence = tgm_fence = true; + slm_fence = urb_fence = false; + opcode = SHADER_OPCODE_INTERLOCK; + break; + + case nir_intrinsic_end_invocation_interlock: + /* For endInvocationInterlockARB(), we need to insert a memory fence which + * stalls in the shader until the memory transactions prior to that + * fence are complete. This ensures that the shader does not end before + * any writes from its critical section have landed. Otherwise, you can + * end up with a case where the next invocation on that pixel properly + * stalls for previous FS invocation on its pixel to complete but + * doesn't actually wait for the dataport memory transactions from that + * thread to land before submitting its own. 
+ */ + assert(s.stage == MESA_SHADER_FRAGMENT); + ugm_fence = tgm_fence = true; + slm_fence = urb_fence = false; + opcode = SHADER_OPCODE_MEMORY_FENCE; + break; + + default: + unreachable("invalid intrinsic"); + } + + if (opcode == BRW_OPCODE_NOP) + break; + + if (s.nir->info.shared_size > 0) { + assert(gl_shader_stage_uses_workgroup(s.stage)); + } else { + slm_fence = false; + } + + /* If the workgroup fits in a single HW thread, the messages for SLM are + * processed in-order and the shader itself is already synchronized so + * the memory fence is not necessary. + * + * TODO: Check if applies for many HW threads sharing same Data Port. + */ + if (!s.nir->info.workgroup_size_variable && + slm_fence && s.workgroup_size() <= s.dispatch_width) + slm_fence = false; + + switch (s.stage) { + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_TASK: + case MESA_SHADER_MESH: + break; + default: + urb_fence = false; + break; + } + + unsigned fence_regs_count = 0; + fs_reg fence_regs[4] = {}; + + const fs_builder ubld = bld.group(8, 0); + + /* A memory barrier with acquire semantics requires us to + * guarantee that memory operations of the specified storage + * class sequenced-after the barrier aren't reordered before the + * barrier, nor before any previous atomic operation + * sequenced-before the barrier which may be synchronizing this + * acquire barrier with a prior release sequence. + * + * In order to guarantee the latter we must make sure that any + * such previous operation has completed execution before + * invalidating the relevant caches, since otherwise some cache + * could be polluted by a concurrent thread after its + * invalidation but before the previous atomic completes, which + * could lead to a violation of the expected memory ordering if + * a subsequent memory read hits the polluted cacheline, which + * would return a stale value read from memory before the + * completion of the atomic sequenced-before the barrier. 
+ * + * This ordering inversion can be avoided trivially if the + * operations we need to order are all handled by a single + * in-order cache, since the flush implied by the memory fence + * occurs after any pending operations have completed, however + * that doesn't help us when dealing with multiple caches + * processing requests out of order, in which case we need to + * explicitly stall the EU until any pending memory operations + * have executed. + * + * Note that that might be somewhat heavy handed in some cases. + * In particular when this memory fence was inserted by + * spirv_to_nir() lowering an atomic with acquire semantics into + * an atomic+barrier sequence we could do a better job by + * synchronizing with respect to that one atomic *only*, but + * that would require additional information not currently + * available to the backend. + * + * XXX - Use an alternative workaround on IVB and ICL, since + * SYNC.ALLWR is only available on Gfx12+. + */ + if (devinfo->ver >= 12 && + (!nir_intrinsic_has_memory_scope(instr) || + (nir_intrinsic_memory_semantics(instr) & NIR_MEMORY_ACQUIRE))) { + ubld.exec_all().group(1, 0).emit( + BRW_OPCODE_SYNC, ubld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR)); + } + + if (devinfo->has_lsc) { + assert(devinfo->verx10 >= 125); + uint32_t desc = + lsc_fence_descriptor_for_intrinsic(devinfo, instr); + if (ugm_fence) { + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, GFX12_SFID_UGM, desc, + true /* commit_enable */, + 0 /* bti; ignored for LSC */); + } + + if (tgm_fence) { + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, GFX12_SFID_TGM, desc, + true /* commit_enable */, + 0 /* bti; ignored for LSC */); + } + + if (slm_fence) { + assert(opcode == SHADER_OPCODE_MEMORY_FENCE); + if (intel_needs_workaround(devinfo, 14014063774)) { + /* Wa_14014063774 + * + * Before SLM fence compiler needs to insert SYNC.ALLWR in order + * to avoid the SLM data race. 
+ */ + ubld.exec_all().group(1, 0).emit( + BRW_OPCODE_SYNC, ubld.null_reg_ud(), + brw_imm_ud(TGL_SYNC_ALLWR)); + } + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, GFX12_SFID_SLM, desc, + true /* commit_enable */, + 0 /* BTI; ignored for LSC */); + } + + if (urb_fence) { + assert(opcode == SHADER_OPCODE_MEMORY_FENCE); + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, BRW_SFID_URB, desc, + true /* commit_enable */, + 0 /* BTI; ignored for LSC */); + } + } else if (devinfo->ver >= 11) { + if (tgm_fence || ugm_fence || urb_fence) { + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0, + true /* commit_enable HSD ES # 1404612949 */, + 0 /* BTI = 0 means data cache */); + } + + if (slm_fence) { + assert(opcode == SHADER_OPCODE_MEMORY_FENCE); + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0, + true /* commit_enable HSD ES # 1404612949 */, + GFX7_BTI_SLM); + } + } else { + /* Prior to Icelake, they're all lumped into a single cache except on + * Ivy Bridge and Bay Trail where typed messages actually go through + * the render cache. There, we need both fences because we may + * access storage images as either typed or untyped. + */ + const bool render_fence = tgm_fence && devinfo->verx10 == 70; + + /* Simulation also complains on Gfx9 if we do not enable commit. + */ + const bool commit_enable = render_fence || + instr->intrinsic == nir_intrinsic_end_invocation_interlock || + devinfo->ver == 9; + + if (tgm_fence || ugm_fence || slm_fence || urb_fence) { + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0, + commit_enable, 0 /* BTI */); + } + + if (render_fence) { + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE, 0, + commit_enable, /* bti */ 0); + } + } + + assert(fence_regs_count <= ARRAY_SIZE(fence_regs)); + + /* Be conservative in Gen11+ and always stall in a fence. 
Since + * there are two different fences, and shader might want to + * synchronize between them. + * + * TODO: Use scope and visibility information for the barriers from NIR + * to make a better decision on whether we need to stall. + */ + bool force_stall = devinfo->ver >= 11; + + /* There are four cases where we want to insert a stall: + * + * 1. If we're a nir_intrinsic_end_invocation_interlock. This is + * required to ensure that the shader EOT doesn't happen until + * after the fence returns. Otherwise, we might end up with the + * next shader invocation for that pixel not respecting our fence + * because it may happen on a different HW thread. + * + * 2. If we have multiple fences. This is required to ensure that + * they all complete and nothing gets weirdly out-of-order. + * + * 3. If we have no fences. In this case, we need at least a + * scheduling barrier to keep the compiler from moving things + * around in an invalid way. + * + * 4. On Gen11+ and platforms with LSC, we have multiple fence types, + * without further information about the fence, we need to force a + * stall. + */ + if (instr->intrinsic == nir_intrinsic_end_invocation_interlock || + fence_regs_count != 1 || devinfo->has_lsc || force_stall) { + ubld.exec_all().group(1, 0).emit( + FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), + fence_regs, fence_regs_count); + } + + break; + } + + case nir_intrinsic_shader_clock: { + /* We cannot do anything if there is an event, so ignore it for now */ + const fs_reg shader_clock = get_timestamp(bld); + const fs_reg srcs[] = { component(shader_clock, 0), + component(shader_clock, 1) }; + bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); + break; + } + + case nir_intrinsic_load_reloc_const_intel: { + uint32_t id = nir_intrinsic_param_idx(instr); + + /* Emit the reloc in the smallest SIMD size to limit register usage. 
*/ + const fs_builder ubld = bld.exec_all().group(1, 0); + fs_reg small_dest = ubld.vgrf(dest.type); + ubld.UNDEF(small_dest); + ubld.exec_all().group(1, 0).emit(SHADER_OPCODE_MOV_RELOC_IMM, + small_dest, brw_imm_ud(id)); + + /* Copy propagation will get rid of this MOV. */ + bld.MOV(dest, component(small_dest, 0)); + break; + } + + case nir_intrinsic_load_uniform: { + /* Offsets are in bytes but they should always aligned to + * the type size + */ + unsigned base_offset = nir_intrinsic_base(instr); + assert(base_offset % 4 == 0 || base_offset % type_sz(dest.type) == 0); + + fs_reg src(UNIFORM, base_offset / 4, dest.type); + + if (nir_src_is_const(instr->src[0])) { + unsigned load_offset = nir_src_as_uint(instr->src[0]); + assert(load_offset % type_sz(dest.type) == 0); + /* The base offset can only handle 32-bit units, so for 16-bit + * data take the modulo of the offset with 4 bytes and add it to + * the offset to read from within the source register. + */ + src.offset = load_offset + base_offset % 4; + + for (unsigned j = 0; j < instr->num_components; j++) { + bld.MOV(offset(dest, bld, j), offset(src, bld, j)); + } + } else { + fs_reg indirect = retype(get_nir_src(ntb, instr->src[0]), + BRW_REGISTER_TYPE_UD); + + /* We need to pass a size to the MOV_INDIRECT but we don't want it to + * go past the end of the uniform. In order to keep the n'th + * component from running past, we subtract off the size of all but + * one component of the vector. 
+ */ + assert(nir_intrinsic_range(instr) >= + instr->num_components * type_sz(dest.type)); + unsigned read_size = nir_intrinsic_range(instr) - + (instr->num_components - 1) * type_sz(dest.type); + + bool supports_64bit_indirects = + devinfo->platform != INTEL_PLATFORM_CHV && !intel_device_info_is_9lp(devinfo); + + if (type_sz(dest.type) != 8 || supports_64bit_indirects) { + for (unsigned j = 0; j < instr->num_components; j++) { + bld.emit(SHADER_OPCODE_MOV_INDIRECT, + offset(dest, bld, j), offset(src, bld, j), + indirect, brw_imm_ud(read_size)); + } + } else { + const unsigned num_mov_indirects = + type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD); + /* We read a little bit less per MOV INDIRECT, as they are now + * 32-bits ones instead of 64-bit. Fix read_size then. + */ + const unsigned read_size_32bit = read_size - + (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD); + for (unsigned j = 0; j < instr->num_components; j++) { + for (unsigned i = 0; i < num_mov_indirects; i++) { + bld.emit(SHADER_OPCODE_MOV_INDIRECT, + subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i), + subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i), + indirect, brw_imm_ud(read_size_32bit)); + } + } + } + } + break; + } + + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ubo_uniform_block_intel: { + fs_reg surface, surface_handle; + + if (get_nir_src_bindless(ntb, instr->src[0])) + surface_handle = get_nir_buffer_intrinsic_index(ntb, bld, instr); + else + surface = get_nir_buffer_intrinsic_index(ntb, bld, instr); + + if (!nir_src_is_const(instr->src[1])) { + if (instr->intrinsic == nir_intrinsic_load_ubo) { + /* load_ubo with non-uniform offset */ + fs_reg base_offset = retype(get_nir_src(ntb, instr->src[1]), + BRW_REGISTER_TYPE_UD); + + const unsigned comps_per_load = type_sz(dest.type) == 8 ? 
2 : 4; + + for (int i = 0; i < instr->num_components; i += comps_per_load) { + const unsigned remaining = instr->num_components - i; + s.VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), + surface, surface_handle, + base_offset, + i * type_sz(dest.type), + instr->def.bit_size / 8, + MIN2(remaining, comps_per_load)); + } + + s.prog_data->has_ubo_pull = true; + } else { + /* load_ubo with uniform offset */ + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + + srcs[SURFACE_LOGICAL_SRC_SURFACE] = surface; + srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = surface_handle; + + const nir_src load_offset = instr->src[1]; + if (nir_src_is_const(load_offset)) { + fs_reg addr = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + ubld8.MOV(addr, brw_imm_ud(nir_src_as_uint(load_offset))); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0); + } else { + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + bld.emit_uniformize(get_nir_src(ntb, load_offset)); + } + + const unsigned total_dwords = + ALIGN(instr->num_components, REG_SIZE * reg_unit(devinfo) / 4); + unsigned loaded_dwords = 0; + + const fs_reg packed_consts = + ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords); + + while (loaded_dwords < total_dwords) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, + total_dwords - loaded_dwords); + const unsigned block_bytes = block * 4; + + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block); + + const fs_builder &ubld = block <= 8 ? 
ubld8 : ubld16; + ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD), + srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = + align(block_bytes, REG_SIZE * reg_unit(devinfo)); + + loaded_dwords += block; + + ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS], + srcs[SURFACE_LOGICAL_SRC_ADDRESS], + brw_imm_ud(block_bytes)); + } + + for (unsigned c = 0; c < instr->num_components; c++) { + bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD), + component(packed_consts, c)); + } + + s.prog_data->has_ubo_pull = true; + } + } else { + /* Even if we are loading doubles, a pull constant load will load + * a 32-bit vec4, so should only reserve vgrf space for that. If we + * need to load a full dvec4 we will have to emit 2 loads. This is + * similar to demote_pull_constants(), except that in that case we + * see individual accesses to each component of the vector and then + * we let CSE deal with duplicate loads. Here we see a vector access + * and we have to split it if necessary. 
+ */ + const unsigned type_size = type_sz(dest.type); + const unsigned load_offset = nir_src_as_uint(instr->src[1]); + const unsigned ubo_block = + brw_nir_ubo_surface_index_get_push_block(instr->src[0]); + const unsigned offset_256b = load_offset / 32; + const unsigned end_256b = + DIV_ROUND_UP(load_offset + type_size * instr->num_components, 32); + + /* See if we've selected this as a push constant candidate */ + fs_reg push_reg; + for (int i = 0; i < 4; i++) { + const struct brw_ubo_range *range = &s.prog_data->ubo_ranges[i]; + if (range->block == ubo_block && + offset_256b >= range->start && + end_256b <= range->start + range->length) { + + push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type); + push_reg.offset = load_offset - 32 * range->start; + break; + } + } + + if (push_reg.file != BAD_FILE) { + for (unsigned i = 0; i < instr->num_components; i++) { + bld.MOV(offset(dest, bld, i), + byte_offset(push_reg, i * type_size)); + } + break; + } + + s.prog_data->has_ubo_pull = true; + + const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ + const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0); + + for (unsigned c = 0; c < instr->num_components;) { + const unsigned base = load_offset + c * type_size; + /* Number of usable components in the next block-aligned load. 
*/ + const unsigned count = MIN2(instr->num_components - c, + (block_sz - base % block_sz) / type_size); + + const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surface; + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle; + srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); + srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); + + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts, + srcs, PULL_UNIFORM_CONSTANT_SRCS); + + const fs_reg consts = + retype(byte_offset(packed_consts, base & (block_sz - 1)), + dest.type); + + for (unsigned d = 0; d < count; d++) + bld.MOV(offset(dest, bld, c + d), component(consts, d)); + + c += count; + } + } + break; + } + + case nir_intrinsic_load_global: + case nir_intrinsic_load_global_constant: { + assert(devinfo->ver >= 8); + + assert(instr->def.bit_size <= 32); + assert(nir_intrinsic_align(instr) > 0); + fs_reg srcs[A64_LOGICAL_NUM_SRCS]; + srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[0]); + srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ + srcs[A64_LOGICAL_ENABLE_HELPERS] = + brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS); + + if (instr->def.bit_size == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(instr->def.num_components <= 4); + + srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components); + + fs_inst *inst = + bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, dest, + srcs, A64_LOGICAL_NUM_SRCS); + inst->size_written = instr->num_components * + inst->dst.component_size(inst->exec_size); + } else { + const unsigned bit_size = instr->def.bit_size; + assert(instr->def.num_components == 1); + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); + + srcs[A64_LOGICAL_ARG] = brw_imm_ud(bit_size); + + bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, tmp, + srcs, A64_LOGICAL_NUM_SRCS); + bld.MOV(dest, subscript(tmp, dest.type, 0)); + } + 
break; + } + + case nir_intrinsic_store_global: { + assert(devinfo->ver >= 8); + + assert(nir_src_bit_size(instr->src[0]) <= 32); + assert(nir_intrinsic_write_mask(instr) == + (1u << instr->num_components) - 1); + assert(nir_intrinsic_align(instr) > 0); + + fs_reg srcs[A64_LOGICAL_NUM_SRCS]; + srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[1]); + srcs[A64_LOGICAL_ENABLE_HELPERS] = + brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS); + + if (nir_src_bit_size(instr->src[0]) == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(nir_src_num_components(instr->src[0]) <= 4); + + srcs[A64_LOGICAL_SRC] = get_nir_src(ntb, instr->src[0]); /* Data */ + srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components); + + bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, fs_reg(), + srcs, A64_LOGICAL_NUM_SRCS); + } else { + assert(nir_src_num_components(instr->src[0]) == 1); + const unsigned bit_size = nir_src_bit_size(instr->src[0]); + brw_reg_type data_type = + brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(tmp, retype(get_nir_src(ntb, instr->src[0]), data_type)); + + srcs[A64_LOGICAL_SRC] = tmp; + srcs[A64_LOGICAL_ARG] = brw_imm_ud(bit_size); + + bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, fs_reg(), + srcs, A64_LOGICAL_NUM_SRCS); + } + break; + } + + case nir_intrinsic_global_atomic: + case nir_intrinsic_global_atomic_swap: + fs_nir_emit_global_atomic(ntb, bld, instr); + break; + + case nir_intrinsic_load_global_const_block_intel: { + assert(instr->def.bit_size == 32); + assert(instr->num_components == 8 || instr->num_components == 16); + + const fs_builder ubld = bld.exec_all().group(instr->num_components, 0); + fs_reg load_val; + + bool is_pred_const = nir_src_is_const(instr->src[1]); + if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) { + /* In this case, we don't want the UBO load at all. We really + * shouldn't get here but it's possible. 
+ */ + load_val = brw_imm_ud(0); + } else { + /* The uniform process may stomp the flag so do this first */ + fs_reg addr = bld.emit_uniformize(get_nir_src(ntb, instr->src[0])); + + load_val = ubld.vgrf(BRW_REGISTER_TYPE_UD); + + /* If the predicate is constant and we got here, then it's non-zero + * and we don't need the predicate at all. + */ + if (!is_pred_const) { + /* Load the predicate */ + fs_reg pred = bld.emit_uniformize(get_nir_src(ntb, instr->src[1])); + fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred); + mov->conditional_mod = BRW_CONDITIONAL_NZ; + + /* Stomp the destination with 0 if we're OOB */ + mov = ubld.MOV(load_val, brw_imm_ud(0)); + mov->predicate = BRW_PREDICATE_NORMAL; + mov->predicate_inverse = true; + } + + fs_reg srcs[A64_LOGICAL_NUM_SRCS]; + srcs[A64_LOGICAL_ADDRESS] = addr; + srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ + srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components); + /* This intrinsic loads memory from a uniform address, sometimes + * shared across lanes. We never need to mask it. + */ + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); + + fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL, + load_val, srcs, A64_LOGICAL_NUM_SRCS); + if (!is_pred_const) + load->predicate = BRW_PREDICATE_NORMAL; + } + + /* From the HW perspective, we just did a single SIMD16 instruction + * which loaded a dword in each SIMD channel. From NIR's perspective, + * this instruction returns a vec16. Any users of this data in the + * back-end will expect a vec16 per SIMD channel so we have to emit a + * pile of MOVs to resolve this discrepancy. Fortunately, copy-prop + * will generally clean them up for us. 
+ */ + for (unsigned i = 0; i < instr->num_components; i++) { + bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD), + component(load_val, i)); + } + break; + } + + case nir_intrinsic_load_global_constant_uniform_block_intel: { + const unsigned total_dwords = ALIGN(instr->num_components, + REG_SIZE * reg_unit(devinfo) / 4); + unsigned loaded_dwords = 0; + + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + const fs_reg packed_consts = + ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords); + fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0])); + + while (loaded_dwords < total_dwords) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, + total_dwords - loaded_dwords); + const unsigned block_bytes = block * 4; + + const fs_builder &ubld = block <= 8 ? ubld8 : ubld16; + + fs_reg srcs[A64_LOGICAL_NUM_SRCS]; + srcs[A64_LOGICAL_ADDRESS] = address; + srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ + srcs[A64_LOGICAL_ARG] = brw_imm_ud(block); + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); + ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD), + srcs, A64_LOGICAL_NUM_SRCS)->size_written = + align(block_bytes, REG_SIZE * reg_unit(devinfo)); + + increment_a64_address(ubld1, address, block_bytes); + loaded_dwords += block; + } + + for (unsigned c = 0; c < instr->num_components; c++) + bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD), + component(packed_consts, c)); + + break; + } + + case nir_intrinsic_load_ssbo: { + assert(devinfo->ver >= 7); + + const unsigned bit_size = instr->def.bit_size; + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[get_nir_src_bindless(ntb, instr->src[0]) ? 
+ SURFACE_LOGICAL_SRC_SURFACE_HANDLE : + SURFACE_LOGICAL_SRC_SURFACE] = + get_nir_buffer_intrinsic_index(ntb, bld, instr); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]); + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + + /* Make dest unsigned because that's what the temporary will be */ + dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + /* Read the vector */ + assert(bit_size <= 32); + assert(nir_intrinsic_align(instr) > 0); + if (bit_size == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(instr->def.num_components <= 4); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + fs_inst *inst = + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + inst->size_written = instr->num_components * s.dispatch_width * 4; + } else { + assert(instr->def.num_components == 1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + + fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, + read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); + bld.MOV(dest, subscript(read_result, dest.type, 0)); + } + break; + } + + case nir_intrinsic_store_ssbo: { + assert(devinfo->ver >= 7); + + const unsigned bit_size = nir_src_bit_size(instr->src[0]); + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[get_nir_src_bindless(ntb, instr->src[1]) ? 
+ SURFACE_LOGICAL_SRC_SURFACE_HANDLE : + SURFACE_LOGICAL_SRC_SURFACE] = + get_nir_buffer_intrinsic_index(ntb, bld, instr); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[2]); + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); + + fs_reg data = get_nir_src(ntb, instr->src[0]); + data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + assert(bit_size <= 32); + assert(nir_intrinsic_write_mask(instr) == + (1u << instr->num_components) - 1); + assert(nir_intrinsic_align(instr) > 0); + if (bit_size == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(nir_src_num_components(instr->src[0]) <= 4); + srcs[SURFACE_LOGICAL_SRC_DATA] = data; + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } else { + assert(nir_src_num_components(instr->src[0]) == 1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + + srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); + + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } + break; + } + + case nir_intrinsic_load_ssbo_uniform_block_intel: + case nir_intrinsic_load_shared_uniform_block_intel: { + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + + const bool is_ssbo = + instr->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel; + if (is_ssbo) { + srcs[get_nir_src_bindless(ntb, instr->src[0]) ? 
+ SURFACE_LOGICAL_SRC_SURFACE_HANDLE : + SURFACE_LOGICAL_SRC_SURFACE] = + get_nir_buffer_intrinsic_index(ntb, bld, instr); + } else { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = fs_reg(brw_imm_ud(GFX7_BTI_SLM)); + } + + const unsigned total_dwords = ALIGN(instr->num_components, + REG_SIZE * reg_unit(devinfo) / 4); + unsigned loaded_dwords = 0; + + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + const fs_reg packed_consts = + ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords); + + const nir_src load_offset = is_ssbo ? instr->src[1] : instr->src[0]; + if (nir_src_is_const(load_offset)) { + fs_reg addr = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + ubld8.MOV(addr, brw_imm_ud(nir_src_as_uint(load_offset))); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0); + } else { + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + bld.emit_uniformize(get_nir_src(ntb, load_offset)); + } + + while (loaded_dwords < total_dwords) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, + total_dwords - loaded_dwords); + const unsigned block_bytes = block * 4; + + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block); + + const fs_builder &ubld = block <= 8 ? 
ubld8 : ubld16; + ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD), + srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = + align(block_bytes, REG_SIZE * reg_unit(devinfo)); + + loaded_dwords += block; + + ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS], + srcs[SURFACE_LOGICAL_SRC_ADDRESS], + brw_imm_ud(block_bytes)); + } + + for (unsigned c = 0; c < instr->num_components; c++) + bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD), + component(packed_consts, c)); + + break; + } + + case nir_intrinsic_store_output: { + assert(nir_src_bit_size(instr->src[0]) == 32); + fs_reg src = get_nir_src(ntb, instr->src[0]); + + unsigned store_offset = nir_src_as_uint(instr->src[1]); + unsigned num_components = instr->num_components; + unsigned first_component = nir_intrinsic_component(instr); + + fs_reg new_dest = retype(offset(s.outputs[instr->const_index[0]], bld, + 4 * store_offset), src.type); + for (unsigned j = 0; j < num_components; j++) { + bld.MOV(offset(new_dest, bld, j + first_component), + offset(src, bld, j)); + } + break; + } + + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + fs_nir_emit_surface_atomic(ntb, bld, instr, + get_nir_buffer_intrinsic_index(ntb, bld, instr), + get_nir_src_bindless(ntb, instr->src[0])); + break; + + case nir_intrinsic_get_ssbo_size: { + assert(nir_src_num_components(instr->src[0]) == 1); + + /* A resinfo's sampler message is used to get the buffer size. The + * SIMD8's writeback message consists of four registers and SIMD16's + * writeback message consists of 8 destination registers (two per each + * component). Because we are only interested on the first channel of + * the first returned component, where resinfo returns the buffer size + * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of + * the dispatch width. 
+ */ + const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0); + fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4); + + /* Set LOD = 0 */ + ubld.MOV(src_payload, brw_imm_d(0)); + + fs_reg srcs[GET_BUFFER_SIZE_SRCS]; + srcs[get_nir_src_bindless(ntb, instr->src[0]) ? + GET_BUFFER_SIZE_SRC_SURFACE_HANDLE : + GET_BUFFER_SIZE_SRC_SURFACE] = + get_nir_buffer_intrinsic_index(ntb, bld, instr); + srcs[GET_BUFFER_SIZE_SRC_LOD] = src_payload; + fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload, + srcs, GET_BUFFER_SIZE_SRCS); + inst->header_size = 0; + inst->mlen = reg_unit(devinfo); + inst->size_written = 4 * REG_SIZE * reg_unit(devinfo); + + /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting: + * + * "Out-of-bounds checking is always performed at a DWord granularity. If + * any part of the DWord is out-of-bounds then the whole DWord is + * considered out-of-bounds." + * + * This implies that types with size smaller than 4-bytes need to be + * padded if they don't complete the last dword of the buffer. But as we + * need to maintain the original size we need to reverse the padding + * calculation to return the correct size to know the number of elements + * of an unsized array. 
As we stored in the last two bits of the surface + * size the needed padding for the buffer, we calculate here the + * original buffer_size reversing the surface_size calculation: + * + * surface_size = isl_align(buffer_size, 4) + + * (isl_align(buffer_size) - buffer_size) + * + * buffer_size = surface_size & ~3 - surface_size & 3 + */ + + fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD); + + ubld.AND(size_padding, ret_payload, brw_imm_ud(3)); + ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3)); + ubld.ADD(buffer_size, size_aligned4, negate(size_padding)); + + bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0)); + break; + } + + case nir_intrinsic_load_scratch: { + assert(devinfo->ver >= 7); + + assert(instr->def.num_components == 1); + const unsigned bit_size = instr->def.bit_size; + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + + if (devinfo->verx10 >= 125) { + const fs_builder ubld = bld.exec_all().group(1, 0); + fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0); + ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(31, 10))); + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX125_NON_BINDLESS); + srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle; + } else if (devinfo->ver >= 8) { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = + brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT); + } else { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS); + } + + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + const fs_reg nir_addr = get_nir_src(ntb, instr->src[0]); + + /* Make dest unsigned because that's what the temporary will be */ + dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + /* Read the vector */ + assert(instr->def.num_components 
== 1); + assert(bit_size <= 32); + assert(nir_intrinsic_align(instr) > 0); + if (bit_size == 32 && + nir_intrinsic_align(instr) >= 4) { + if (devinfo->verx10 >= 125) { + assert(bit_size == 32 && + nir_intrinsic_align(instr) >= 4); + + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(ntb, bld, nir_addr, false); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); + + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + } else { + /* The offset for a DWORD scattered message is in dwords. */ + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(ntb, bld, nir_addr, true); + + bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + } + } else { + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(ntb, bld, nir_addr, false); + + fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, + read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); + bld.MOV(dest, read_result); + } + + s.shader_stats.fill_count += DIV_ROUND_UP(s.dispatch_width, 16); + break; + } + + case nir_intrinsic_store_scratch: { + assert(devinfo->ver >= 7); + + assert(nir_src_num_components(instr->src[0]) == 1); + const unsigned bit_size = nir_src_bit_size(instr->src[0]); + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + + if (devinfo->verx10 >= 125) { + const fs_builder ubld = bld.exec_all().group(1, 0); + fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0); + ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(31, 10))); + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX125_NON_BINDLESS); + srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle; + } else if (devinfo->ver >= 8) { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = + brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT); + } else { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS); + } + + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + 
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + /** + * While this instruction has side-effects, it should not be predicated + * on sample mask, because otherwise fs helper invocations would + * load undefined values from scratch memory. And scratch memory + * load-stores are produced from operations without side-effects, thus + * they should not have different behaviour in the helper invocations. + */ + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + const fs_reg nir_addr = get_nir_src(ntb, instr->src[1]); + + fs_reg data = get_nir_src(ntb, instr->src[0]); + data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + assert(nir_src_num_components(instr->src[0]) == 1); + assert(bit_size <= 32); + assert(nir_intrinsic_write_mask(instr) == 1); + assert(nir_intrinsic_align(instr) > 0); + if (bit_size == 32 && + nir_intrinsic_align(instr) >= 4) { + if (devinfo->verx10 >= 125) { + srcs[SURFACE_LOGICAL_SRC_DATA] = data; + + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(ntb, bld, nir_addr, false); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); + + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + } else { + srcs[SURFACE_LOGICAL_SRC_DATA] = data; + + /* The offset for a DWORD scattered message is in dwords. 
*/ + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(ntb, bld, nir_addr, true); + + bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } + } else { + srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); + + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(ntb, bld, nir_addr, false); + + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } + s.shader_stats.spill_count += DIV_ROUND_UP(s.dispatch_width, 16); + break; + } + + case nir_intrinsic_load_subgroup_size: + /* This should only happen for fragment shaders because every other case + * is lowered in NIR so we can optimize on it. + */ + assert(s.stage == MESA_SHADER_FRAGMENT); + bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(s.dispatch_width)); + break; + + case nir_intrinsic_load_subgroup_invocation: + bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), + ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]); + break; + + case nir_intrinsic_load_subgroup_eq_mask: + case nir_intrinsic_load_subgroup_ge_mask: + case nir_intrinsic_load_subgroup_gt_mask: + case nir_intrinsic_load_subgroup_le_mask: + case nir_intrinsic_load_subgroup_lt_mask: + unreachable("not reached"); + + case nir_intrinsic_vote_any: { + const fs_builder ubld1 = bld.exec_all().group(1, 0); + + /* The any/all predicates do not consider channel enables. To prevent + * dead channels from affecting the result, we initialize the flag with + * with the identity value for the logical operation. + */ + if (s.dispatch_width == 32) { + /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. 
*/ + ubld1.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0)); + } else { + ubld1.MOV(brw_flag_reg(0, 0), brw_imm_uw(0)); + } + bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ); + + /* For some reason, the any/all predicates don't work properly with + * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H + * doesn't read the correct subset of the flag register and you end up + * getting garbage in the second half. Work around this by using a pair + * of 1-wide MOVs and scattering the result. + */ + const fs_builder ubld = devinfo->ver >= 20 ? bld.exec_all() : ubld1; + fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D); + ubld.MOV(res1, brw_imm_d(0)); + set_predicate(devinfo->ver >= 20 ? XE2_PREDICATE_ANY : + s.dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ANY8H : + s.dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H : + BRW_PREDICATE_ALIGN1_ANY32H, + ubld.MOV(res1, brw_imm_d(-1))); + + bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); + break; + } + case nir_intrinsic_vote_all: { + const fs_builder ubld1 = bld.exec_all().group(1, 0); + + /* The any/all predicates do not consider channel enables. To prevent + * dead channels from affecting the result, we initialize the flag with + * with the identity value for the logical operation. + */ + if (s.dispatch_width == 32) { + /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */ + ubld1.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0xffffffff)); + } else { + ubld1.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff)); + } + bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ); + + /* For some reason, the any/all predicates don't work properly with + * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H + * doesn't read the correct subset of the flag register and you end up + * getting garbage in the second half. 
Work around this by using a pair + * of 1-wide MOVs and scattering the result. + */ + const fs_builder ubld = devinfo->ver >= 20 ? bld.exec_all() : ubld1; + fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D); + ubld.MOV(res1, brw_imm_d(0)); + set_predicate(devinfo->ver >= 20 ? XE2_PREDICATE_ALL : + s.dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H : + s.dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H : + BRW_PREDICATE_ALIGN1_ALL32H, + ubld.MOV(res1, brw_imm_d(-1))); + + bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); + break; + } + case nir_intrinsic_vote_feq: + case nir_intrinsic_vote_ieq: { + fs_reg value = get_nir_src(ntb, instr->src[0]); + if (instr->intrinsic == nir_intrinsic_vote_feq) { + const unsigned bit_size = nir_src_bit_size(instr->src[0]); + value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B : + brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F); + } + + fs_reg uniformized = bld.emit_uniformize(value); + const fs_builder ubld1 = bld.exec_all().group(1, 0); + + /* The any/all predicates do not consider channel enables. To prevent + * dead channels from affecting the result, we initialize the flag with + * with the identity value for the logical operation. + */ + if (s.dispatch_width == 32) { + /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */ + ubld1.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0xffffffff)); + } else { + ubld1.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff)); + } + bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z); + + /* For some reason, the any/all predicates don't work properly with + * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H + * doesn't read the correct subset of the flag register and you end up + * getting garbage in the second half. Work around this by using a pair + * of 1-wide MOVs and scattering the result. + */ + const fs_builder ubld = devinfo->ver >= 20 ? 
bld.exec_all() : ubld1; + fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D); + ubld.MOV(res1, brw_imm_d(0)); + set_predicate(devinfo->ver >= 20 ? XE2_PREDICATE_ALL : + s.dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H : + s.dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H : + BRW_PREDICATE_ALIGN1_ALL32H, + ubld.MOV(res1, brw_imm_d(-1))); + + bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); + break; + } + + case nir_intrinsic_ballot: { + const fs_reg value = retype(get_nir_src(ntb, instr->src[0]), + BRW_REGISTER_TYPE_UD); + struct brw_reg flag = brw_flag_reg(0, 0); + /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well + * as f0.0. This is a problem for fragment programs as we currently use + * f0.1 for discards. Fortunately, we don't support SIMD32 fragment + * programs yet so this isn't a problem. When we do, something will + * have to change. + */ + if (s.dispatch_width == 32) + flag.type = BRW_REGISTER_TYPE_UD; + + bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u)); + bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ); + + if (instr->def.bit_size > 32) { + dest.type = BRW_REGISTER_TYPE_UQ; + } else { + dest.type = BRW_REGISTER_TYPE_UD; + } + bld.MOV(dest, flag); + break; + } + + case nir_intrinsic_read_invocation: { + const fs_reg value = get_nir_src(ntb, instr->src[0]); + const fs_reg invocation = get_nir_src(ntb, instr->src[1]); + + fs_reg tmp = bld.vgrf(value.type); + + /* When for some reason the subgroup_size picked by NIR is larger than + * the dispatch size picked by the backend (this could happen in RT, + * FS), bound the invocation to the dispatch size. 
+ */ + fs_reg bound_invocation; + if (s.api_subgroup_size == 0 || + bld.dispatch_width() < s.api_subgroup_size) { + bound_invocation = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(bound_invocation, invocation, brw_imm_ud(s.dispatch_width - 1)); + } else { + bound_invocation = invocation; + } + bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value, + bld.emit_uniformize(bound_invocation)); + + bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0))); + break; + } + + case nir_intrinsic_read_first_invocation: { + const fs_reg value = get_nir_src(ntb, instr->src[0]); + bld.MOV(retype(dest, value.type), bld.emit_uniformize(value)); + break; + } + + case nir_intrinsic_shuffle: { + const fs_reg value = get_nir_src(ntb, instr->src[0]); + const fs_reg index = get_nir_src(ntb, instr->src[1]); + + bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index); + break; + } + + case nir_intrinsic_first_invocation: { + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp); + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), + fs_reg(component(tmp, 0))); + break; + } + + case nir_intrinsic_last_invocation: { + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.exec_all().emit(SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, tmp); + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), + fs_reg(component(tmp, 0))); + break; + } + + case nir_intrinsic_quad_broadcast: { + const fs_reg value = get_nir_src(ntb, instr->src[0]); + const unsigned index = nir_src_as_uint(instr->src[1]); + + bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type), + value, brw_imm_ud(index), brw_imm_ud(4)); + break; + } + + case nir_intrinsic_quad_swap_horizontal: { + const fs_reg value = get_nir_src(ntb, instr->src[0]); + const fs_reg tmp = bld.vgrf(value.type); + if (devinfo->ver <= 7) { + /* The hardware doesn't seem to support these crazy regions with + * compressed instructions on gfx7 and earlier so we fall back to + * using quad swizzles. 
Fortunately, we don't support 64-bit + * anything in Vulkan on gfx7. + */ + assert(nir_src_bit_size(instr->src[0]) == 32); + const fs_builder ubld = bld.exec_all(); + ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, + brw_imm_ud(BRW_SWIZZLE4(1,0,3,2))); + bld.MOV(retype(dest, value.type), tmp); + } else { + const fs_builder ubld = bld.exec_all().group(s.dispatch_width / 2, 0); + + const fs_reg src_left = horiz_stride(value, 2); + const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2); + const fs_reg tmp_left = horiz_stride(tmp, 2); + const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2); + + ubld.MOV(tmp_left, src_right); + ubld.MOV(tmp_right, src_left); + + } + bld.MOV(retype(dest, value.type), tmp); + break; + } + + case nir_intrinsic_quad_swap_vertical: { + const fs_reg value = get_nir_src(ntb, instr->src[0]); + if (nir_src_bit_size(instr->src[0]) == 32) { + /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ + const fs_reg tmp = bld.vgrf(value.type); + const fs_builder ubld = bld.exec_all(); + ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, + brw_imm_ud(BRW_SWIZZLE4(2,3,0,1))); + bld.MOV(retype(dest, value.type), tmp); + } else { + /* For larger data types, we have to either emit dispatch_width many + * MOVs or else fall back to doing indirects. 
+ */ + fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); + bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], + brw_imm_w(0x2)); + bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx); + } + break; + } + + case nir_intrinsic_quad_swap_diagonal: { + const fs_reg value = get_nir_src(ntb, instr->src[0]); + if (nir_src_bit_size(instr->src[0]) == 32) { + /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ + const fs_reg tmp = bld.vgrf(value.type); + const fs_builder ubld = bld.exec_all(); + ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, + brw_imm_ud(BRW_SWIZZLE4(3,2,1,0))); + bld.MOV(retype(dest, value.type), tmp); + } else { + /* For larger data types, we have to either emit dispatch_width many + * MOVs or else fall back to doing indirects. + */ + fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); + bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], + brw_imm_w(0x3)); + bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx); + } + break; + } + + case nir_intrinsic_reduce: { + fs_reg src = get_nir_src(ntb, instr->src[0]); + nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr); + unsigned cluster_size = nir_intrinsic_cluster_size(instr); + if (cluster_size == 0 || cluster_size > s.dispatch_width) + cluster_size = s.dispatch_width; + + /* Figure out the source type */ + src.type = brw_type_for_nir_type(devinfo, + (nir_alu_type)(nir_op_infos[redop].input_types[0] | + nir_src_bit_size(instr->src[0]))); + + fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type); + opcode brw_op = brw_op_for_nir_reduction_op(redop); + brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); + + /* Set up a register for all of our scratching around and initialize it + * to reduction operation's identity value. 
+ */ + fs_reg scan = bld.vgrf(src.type); + bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); + + bld.emit_scan(brw_op, scan, cluster_size, cond_mod); + + dest.type = src.type; + if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) { + /* In this case, CLUSTER_BROADCAST instruction isn't needed because + * the distance between clusters is at least 2 GRFs. In this case, + * we don't need the weird striding of the CLUSTER_BROADCAST + * instruction and can just do regular MOVs. + */ + assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0); + const unsigned groups = + (s.dispatch_width * type_sz(src.type)) / (REG_SIZE * 2); + const unsigned group_size = s.dispatch_width / groups; + for (unsigned i = 0; i < groups; i++) { + const unsigned cluster = (i * group_size) / cluster_size; + const unsigned comp = cluster * cluster_size + (cluster_size - 1); + bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size), + component(scan, comp)); + } + } else { + bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan, + brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size)); + } + break; + } + + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_exclusive_scan: { + fs_reg src = get_nir_src(ntb, instr->src[0]); + nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr); + + /* Figure out the source type */ + src.type = brw_type_for_nir_type(devinfo, + (nir_alu_type)(nir_op_infos[redop].input_types[0] | + nir_src_bit_size(instr->src[0]))); + + fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type); + opcode brw_op = brw_op_for_nir_reduction_op(redop); + brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); + + /* Set up a register for all of our scratching around and initialize it + * to reduction operation's identity value. 
+ */ + fs_reg scan = bld.vgrf(src.type); + const fs_builder allbld = bld.exec_all(); + allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); + + if (instr->intrinsic == nir_intrinsic_exclusive_scan) { + /* Exclusive scan is a bit harder because we have to do an annoying + * shift of the contents before we can begin. To make things worse, + * we can't do this with a normal stride; we have to use indirects. + */ + fs_reg shifted = bld.vgrf(src.type); + fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); + allbld.ADD(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], + brw_imm_w(-1)); + allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx); + allbld.group(1, 0).MOV(component(shifted, 0), identity); + scan = shifted; + } + + bld.emit_scan(brw_op, scan, s.dispatch_width, cond_mod); + + bld.MOV(retype(dest, src.type), scan); + break; + } + + case nir_intrinsic_load_global_block_intel: { + assert(instr->def.bit_size == 32); + + fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0])); + + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + const unsigned total = instr->num_components * s.dispatch_width; + unsigned loaded = 0; + + while (loaded < total) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, total - loaded); + const unsigned block_bytes = block * 4; + + const fs_builder &ubld = block == 8 ? 
ubld8 : ubld16; + + fs_reg srcs[A64_LOGICAL_NUM_SRCS]; + srcs[A64_LOGICAL_ADDRESS] = address; + srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ + srcs[A64_LOGICAL_ARG] = brw_imm_ud(block); + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(1); + ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD), + srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes; + + increment_a64_address(ubld1, address, block_bytes); + loaded += block; + } + + assert(loaded == total); + break; + } + + case nir_intrinsic_store_global_block_intel: { + assert(nir_src_bit_size(instr->src[0]) == 32); + + fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[1])); + fs_reg src = get_nir_src(ntb, instr->src[0]); + + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + const unsigned total = instr->num_components * s.dispatch_width; + unsigned written = 0; + + while (written < total) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, total - written); + + fs_reg srcs[A64_LOGICAL_NUM_SRCS]; + srcs[A64_LOGICAL_ADDRESS] = address; + srcs[A64_LOGICAL_SRC] = retype(byte_offset(src, written * 4), + BRW_REGISTER_TYPE_UD); + srcs[A64_LOGICAL_ARG] = brw_imm_ud(block); + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); + + const fs_builder &ubld = block == 8 ? 
ubld8 : ubld16; + ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, fs_reg(), + srcs, A64_LOGICAL_NUM_SRCS); + + const unsigned block_bytes = block * 4; + increment_a64_address(ubld1, address, block_bytes); + written += block; + } + + assert(written == total); + break; + } + + case nir_intrinsic_load_shared_block_intel: + case nir_intrinsic_load_ssbo_block_intel: { + assert(instr->def.bit_size == 32); + + const bool is_ssbo = + instr->intrinsic == nir_intrinsic_load_ssbo_block_intel; + fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 1 : 0])); + + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ? + get_nir_buffer_intrinsic_index(ntb, bld, instr) : + fs_reg(brw_imm_ud(GFX7_BTI_SLM)); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address; + + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + const unsigned total = instr->num_components * s.dispatch_width; + unsigned loaded = 0; + + while (loaded < total) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, total - loaded); + const unsigned block_bytes = block * 4; + + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block); + + const fs_builder &ubld = block == 8 ? ubld8 : ubld16; + ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD), + srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = block_bytes; + + ubld1.ADD(address, address, brw_imm_ud(block_bytes)); + loaded += block; + } + + assert(loaded == total); + break; + } + + case nir_intrinsic_store_shared_block_intel: + case nir_intrinsic_store_ssbo_block_intel: { + assert(nir_src_bit_size(instr->src[0]) == 32); + + const bool is_ssbo = + instr->intrinsic == nir_intrinsic_store_ssbo_block_intel; + + fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 
2 : 1])); + fs_reg src = get_nir_src(ntb, instr->src[0]); + + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ? + get_nir_buffer_intrinsic_index(ntb, bld, instr) : + fs_reg(brw_imm_ud(GFX7_BTI_SLM)); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address; + + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + const unsigned total = instr->num_components * s.dispatch_width; + unsigned written = 0; + + while (written < total) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, total - written); + + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block); + srcs[SURFACE_LOGICAL_SRC_DATA] = + retype(byte_offset(src, written * 4), BRW_REGISTER_TYPE_UD); + + const fs_builder &ubld = block == 8 ? ubld8 : ubld16; + ubld.emit(SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + + const unsigned block_bytes = block * 4; + ubld1.ADD(address, address, brw_imm_ud(block_bytes)); + written += block; + } + + assert(written == total); + break; + } + + case nir_intrinsic_load_topology_id_intel: { + /* These move around basically every hardware generation, so don't + * do any unbounded checks and fail if the platform hasn't explicitly + * been enabled here. + */ + assert(devinfo->ver >= 12 && devinfo->ver <= 20); + + /* Here is what the layout of SR0 looks like on Gfx12 + * https://gfxspecs.intel.com/Predator/Home/Index/47256 + * [13:11] : Slice ID. + * [10:9] : Dual-SubSlice ID + * [8] : SubSlice ID + * [7] : EUID[2] (aka EU Row ID) + * [6] : Reserved + * [5:4] : EUID[1:0] + * [2:0] : Thread ID + * + * Xe2: Engine 3D and GPGPU Programs, EU Overview, Registers and + * Register Regions, ARF Registers, State Register, + * https://gfxspecs.intel.com/Predator/Home/Index/56623 + * [15:11] : Slice ID. 
+ * [9:8] : SubSlice ID + * [6:4] : EUID + * [2:0] : Thread ID + */ + fs_reg raw_id = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.emit(SHADER_OPCODE_READ_SR_REG, raw_id, brw_imm_ud(0)); + switch (nir_intrinsic_base(instr)) { + case BRW_TOPOLOGY_ID_DSS: + if (devinfo->ver >= 20) { + /* Xe2+: 3D and GPGPU Programs, Shared Functions, Ray Tracing: + * https://gfxspecs.intel.com/Predator/Home/Index/56936 + * + * Note: DSSID in all formulas below is a logical identifier of an + * XeCore (a value that goes from 0 to (number_of_slices * + * number_of_XeCores_per_slice -1). SW can get this value from + * either: + * + * - Message Control Register LogicalSSID field (only in shaders + * eligible for Mid-Thread Preemption). + * - Calculated based of State Register with the following formula: + * DSSID = StateRegister.SliceID * GT_ARCH_SS_PER_SLICE + + * StateRRegister.SubSliceID where GT_SS_PER_SLICE is an + * architectural parameter defined per product SKU. + * + * We are using the state register to calculate the DSSID. + */ + fs_reg slice_id = bld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg subslice_id = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(slice_id, raw_id, brw_imm_ud(INTEL_MASK(15, 11))); + bld.SHR(slice_id, slice_id, brw_imm_ud(11)); + + /* Assert that max subslices covers at least 2 bits that we use for + * subslices. 
+ */ + assert(devinfo->max_subslices_per_slice >= (1 << 2)); + bld.MUL(slice_id, slice_id, + brw_imm_ud(devinfo->max_subslices_per_slice)); + bld.AND(subslice_id, raw_id, brw_imm_ud(INTEL_MASK(9, 8))); + bld.SHR(subslice_id, subslice_id, brw_imm_ud(8)); + bld.ADD(retype(dest, BRW_REGISTER_TYPE_UD), slice_id, + subslice_id); + } else { + bld.AND(raw_id, raw_id, brw_imm_ud(0x3fff)); + /* Get rid of anything below dualsubslice */ + bld.SHR(retype(dest, BRW_REGISTER_TYPE_UD), raw_id, brw_imm_ud(9)); + } + break; + case BRW_TOPOLOGY_ID_EU_THREAD_SIMD: { + s.limit_dispatch_width(16, "Topology helper for Ray queries, " + "not supported in SIMD32 mode."); + fs_reg dst = retype(dest, BRW_REGISTER_TYPE_UD); + + if (devinfo->ver >= 20) { + /* Xe2+: Graphics Engine, 3D and GPGPU Programs, Shared Functions + * Ray Tracing, + * https://gfxspecs.intel.com/Predator/Home/Index/56936 + * + * SyncStackID = (EUID[2:0] << 8) | (ThreadID[2:0] << 4) | + * SIMDLaneID[3:0]; + * + * This section just deals with the EUID part. 
+ * + * The 3bit EU[2:0] we need to build for ray query memory addresses + * computations is a bit odd : + * + * EU[2:0] = raw_id[6:4] (identified as EUID[2:0]) + */ + bld.AND(dst, raw_id, brw_imm_ud(INTEL_MASK(6, 4))); + bld.SHL(dst, dst, brw_imm_ud(4)); + } else { + /* EU[3:0] << 7 + * + * The 4bit EU[3:0] we need to build for ray query memory addresses + * computations is a bit odd : + * + * EU[1:0] = raw_id[5:4] (identified as EUID[1:0]) + * EU[2] = raw_id[8] (identified as SubSlice ID) + * EU[3] = raw_id[7] (identified as EUID[2] or Row ID) + */ + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(7, 7))); + bld.SHL(dst, tmp, brw_imm_ud(3)); + bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(8, 8))); + bld.SHL(tmp, tmp, brw_imm_ud(1)); + bld.OR(dst, dst, tmp); + bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(5, 4))); + bld.SHL(tmp, tmp, brw_imm_ud(3)); + bld.OR(dst, dst, tmp); + } + + /* ThreadID[2:0] << 4 (ThreadID comes from raw_id[2:0]) */ + { + bld.AND(raw_id, raw_id, brw_imm_ud(INTEL_MASK(2, 0))); + bld.SHL(raw_id, raw_id, brw_imm_ud(4)); + bld.OR(dst, dst, raw_id); + } + + /* LaneID[0:3] << 0 (Use nir SYSTEM_VALUE_SUBGROUP_INVOCATION) */ + assert(bld.dispatch_width() <= 16); /* Limit to 4 bits */ + bld.ADD(dst, dst, + ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]); + break; + } + default: + unreachable("Invalid topology id type"); + } + break; + } + + case nir_intrinsic_load_btd_stack_id_intel: + if (s.stage == MESA_SHADER_COMPUTE) { + assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids); + } else { + assert(brw_shader_stage_is_bindless(s.stage)); + } + /* Stack IDs are always in R1 regardless of whether we're coming from a + * bindless shader or a regular compute shader. 
+ */ + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), + retype(brw_vec8_grf(1 * reg_unit(devinfo), 0), BRW_REGISTER_TYPE_UW)); + break; + + case nir_intrinsic_btd_spawn_intel: + if (s.stage == MESA_SHADER_COMPUTE) { + assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids); + } else { + assert(brw_shader_stage_is_bindless(s.stage)); + } + /* Make sure all the pointers to resume shaders have landed where other + * threads can see them. + */ + emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE); + + bld.emit(SHADER_OPCODE_BTD_SPAWN_LOGICAL, bld.null_reg_ud(), + bld.emit_uniformize(get_nir_src(ntb, instr->src[0])), + get_nir_src(ntb, instr->src[1])); + break; + + case nir_intrinsic_btd_retire_intel: + if (s.stage == MESA_SHADER_COMPUTE) { + assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids); + } else { + assert(brw_shader_stage_is_bindless(s.stage)); + } + /* Make sure all the pointers to resume shaders have landed where other + * threads can see them. + */ + emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE); + bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL); + break; + + case nir_intrinsic_trace_ray_intel: { + const bool synchronous = nir_intrinsic_synchronous(instr); + assert(brw_shader_stage_is_bindless(s.stage) || synchronous); + + /* Make sure all the previous RT structure writes are visible to the RT + * fixed function within the DSS, as well as stack pointers to resume + * shaders. 
+ */ + emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE); + + fs_reg srcs[RT_LOGICAL_NUM_SRCS]; + + fs_reg globals = get_nir_src(ntb, instr->src[0]); + srcs[RT_LOGICAL_SRC_GLOBALS] = bld.emit_uniformize(globals); + srcs[RT_LOGICAL_SRC_BVH_LEVEL] = get_nir_src(ntb, instr->src[1]); + srcs[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] = get_nir_src(ntb, instr->src[2]); + srcs[RT_LOGICAL_SRC_SYNCHRONOUS] = brw_imm_ud(synchronous); + bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, bld.null_reg_ud(), + srcs, RT_LOGICAL_NUM_SRCS); + + /* There is no actual value to use in the destination register of the + * synchronous trace instruction. All of the communication with the HW + * unit happens through memory reads/writes. So to ensure that the + * operation has completed before we go read the results in memory, we + * need a barrier followed by an invalidate before accessing memory. + */ + if (synchronous) { + bld.emit(BRW_OPCODE_SYNC, bld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR)); + emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_INVALIDATE); + } + break; + } + + default: +#ifndef NDEBUG + assert(instr->intrinsic < nir_num_intrinsics); + fprintf(stderr, "intrinsic: %s\n", nir_intrinsic_infos[instr->intrinsic].name); +#endif + unreachable("unknown intrinsic"); + } +} + +static fs_reg +expand_to_32bit(const fs_builder &bld, const fs_reg &src) +{ + if (type_sz(src.type) == 2) { + fs_reg src32 = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(src32, retype(src, BRW_REGISTER_TYPE_UW)); + return src32; + } else { + return src; + } +} + +static void +fs_nir_emit_surface_atomic(nir_to_brw_state &ntb, const fs_builder &bld, + nir_intrinsic_instr *instr, + fs_reg surface, + bool bindless) +{ + const intel_device_info *devinfo = ntb.devinfo; + fs_visitor &s = ntb.s; + + enum lsc_opcode op = lsc_aop_for_nir_intrinsic(instr); + int num_data = lsc_op_num_data_values(op); + + bool shared = surface.file == IMM && surface.ud == GFX7_BTI_SLM; + + /* The BTI untyped atomic messages only support 32-bit 
atomics. If you + * just look at the big table of messages in the Vol 7 of the SKL PRM, they + * appear to exist. However, if you look at Vol 2a, there are no message + * descriptors provided for Qword atomic ops except for A64 messages. + * + * 16-bit float atomics are supported, however. + */ + assert(instr->def.bit_size == 32 || + (instr->def.bit_size == 64 && devinfo->has_lsc) || + (instr->def.bit_size == 16 && + (devinfo->has_lsc || lsc_opcode_is_atomic_float(op)))); + + fs_reg dest = get_nir_def(ntb, instr->def); + + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[bindless ? + SURFACE_LOGICAL_SRC_SURFACE_HANDLE : + SURFACE_LOGICAL_SRC_SURFACE] = surface; + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); + + if (shared) { + /* SLM - Get the offset */ + if (nir_src_is_const(instr->src[0])) { + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + brw_imm_ud(nir_intrinsic_base(instr) + + nir_src_as_uint(instr->src[0])); + } else { + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = s.vgrf(glsl_uint_type()); + bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS], + retype(get_nir_src(ntb, instr->src[0]), BRW_REGISTER_TYPE_UD), + brw_imm_ud(nir_intrinsic_base(instr))); + } + } else { + /* SSBOs */ + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]); + } + + fs_reg data; + if (num_data >= 1) + data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 1 : 2])); + + if (num_data >= 2) { + fs_reg tmp = bld.vgrf(data.type, 2); + fs_reg sources[2] = { + data, + expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 
2 : 3])) + }; + bld.LOAD_PAYLOAD(tmp, sources, 2, 0); + data = tmp; + } + srcs[SURFACE_LOGICAL_SRC_DATA] = data; + + /* Emit the actual atomic operation */ + + switch (instr->def.bit_size) { + case 16: { + fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, + retype(dest32, dest.type), + srcs, SURFACE_LOGICAL_NUM_SRCS); + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), + retype(dest32, BRW_REGISTER_TYPE_UD)); + break; + } + + case 32: + case 64: + bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + break; + default: + unreachable("Unsupported bit size"); + } +} + +static void +fs_nir_emit_global_atomic(nir_to_brw_state &ntb, const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + enum lsc_opcode op = lsc_aop_for_nir_intrinsic(instr); + int num_data = lsc_op_num_data_values(op); + + fs_reg dest = get_nir_def(ntb, instr->def); + + fs_reg addr = get_nir_src(ntb, instr->src[0]); + + fs_reg data; + if (num_data >= 1) + data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[1])); + + if (num_data >= 2) { + fs_reg tmp = bld.vgrf(data.type, 2); + fs_reg sources[2] = { + data, + expand_to_32bit(bld, get_nir_src(ntb, instr->src[2])) + }; + bld.LOAD_PAYLOAD(tmp, sources, 2, 0); + data = tmp; + } + + fs_reg srcs[A64_LOGICAL_NUM_SRCS]; + srcs[A64_LOGICAL_ADDRESS] = addr; + srcs[A64_LOGICAL_SRC] = data; + srcs[A64_LOGICAL_ARG] = brw_imm_ud(op); + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); + + switch (instr->def.bit_size) { + case 16: { + fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, + retype(dest32, dest.type), + srcs, A64_LOGICAL_NUM_SRCS); + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32); + break; + } + case 32: + case 64: + bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest, + srcs, A64_LOGICAL_NUM_SRCS); + break; + default: + unreachable("Unsupported bit size"); + } +} + +static void +fs_nir_emit_texture(nir_to_brw_state 
&ntb, + nir_tex_instr *instr) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; + + /* SKL PRMs: Volume 7: 3D-Media-GPGPU: + * + * "The Pixel Null Mask field, when enabled via the Pixel Null Mask + * Enable will be incorect for sample_c when applied to a surface with + * 64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask + * Enable may incorrectly report pixels as referencing a Null surface." + * + * We'll take care of this in NIR. + */ + assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE); + + srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(instr->is_sparse); + + int lod_components = 0; + + /* The hardware requires a LOD for buffer textures */ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) + srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0); + + ASSERTED bool got_lod = false; + ASSERTED bool got_bias = false; + uint32_t header_bits = 0; + for (unsigned i = 0; i < instr->num_srcs; i++) { + nir_src nir_src = instr->src[i].src; + fs_reg src = get_nir_src(ntb, nir_src); + switch (instr->src[i].src_type) { + case nir_tex_src_bias: + assert(!got_lod); + got_bias = true; + + srcs[TEX_LOGICAL_SRC_LOD] = + retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_F); + break; + case nir_tex_src_comparator: + srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F); + break; + case nir_tex_src_coord: + switch (instr->op) { + case nir_texop_txf: + case nir_texop_txf_ms: + case nir_texop_txf_ms_mcs_intel: + case nir_texop_samples_identical: + srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D); + break; + default: + srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F); + break; + } + break; + case nir_tex_src_ddx: + srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F); + lod_components = nir_tex_instr_src_size(instr, i); + break; + case nir_tex_src_ddy: + srcs[TEX_LOGICAL_SRC_LOD2] = 
retype(src, BRW_REGISTER_TYPE_F); + break; + case nir_tex_src_lod: + assert(!got_bias); + got_lod = true; + + switch (instr->op) { + case nir_texop_txs: + srcs[TEX_LOGICAL_SRC_LOD] = + retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_UD); + break; + case nir_texop_txf: + srcs[TEX_LOGICAL_SRC_LOD] = + retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_D); + break; + default: + srcs[TEX_LOGICAL_SRC_LOD] = + retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_F); + break; + } + break; + case nir_tex_src_min_lod: + srcs[TEX_LOGICAL_SRC_MIN_LOD] = + retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_F); + break; + case nir_tex_src_ms_index: + srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD); + break; + + case nir_tex_src_offset: { + uint32_t offset_bits = 0; + if (brw_texture_offset(instr, i, &offset_bits)) { + header_bits |= offset_bits; + } else { + /* On gfx12.5+, if the offsets are not both constant and in the + * {-8,7} range, nir_lower_tex() will have already lowered the + * source offset. So we should never reach this point. 
+ */ + assert(devinfo->verx10 < 125); + srcs[TEX_LOGICAL_SRC_TG4_OFFSET] = + retype(src, BRW_REGISTER_TYPE_D); + } + break; + } + + case nir_tex_src_projector: + unreachable("should be lowered"); + + case nir_tex_src_texture_offset: { + assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE); + /* Emit code to evaluate the actual indexing expression */ + if (instr->texture_index == 0 && is_resource_src(nir_src)) + srcs[TEX_LOGICAL_SRC_SURFACE] = get_resource_nir_src(ntb, nir_src); + if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE) { + fs_reg tmp = s.vgrf(glsl_uint_type()); + bld.ADD(tmp, src, brw_imm_ud(instr->texture_index)); + srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp); + } + assert(srcs[TEX_LOGICAL_SRC_SURFACE].file != BAD_FILE); + break; + } + + case nir_tex_src_sampler_offset: { + /* Emit code to evaluate the actual indexing expression */ + if (instr->sampler_index == 0 && is_resource_src(nir_src)) + srcs[TEX_LOGICAL_SRC_SAMPLER] = get_resource_nir_src(ntb, nir_src); + if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE) { + fs_reg tmp = s.vgrf(glsl_uint_type()); + bld.ADD(tmp, src, brw_imm_ud(instr->sampler_index)); + srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp); + } + break; + } + + case nir_tex_src_texture_handle: + assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1); + srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg(); + if (is_resource_src(nir_src)) + srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = get_resource_nir_src(ntb, nir_src); + if (srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE) + srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src); + break; + + case nir_tex_src_sampler_handle: + assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1); + srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg(); + if (is_resource_src(nir_src)) + srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = get_resource_nir_src(ntb, nir_src); + if (srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE) + 
srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src); + break; + + case nir_tex_src_ms_mcs_intel: + assert(instr->op == nir_texop_txf_ms); + srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D); + break; + + /* If this parameter is present, we are packing either the explicit LOD + * or LOD bias and the array index into a single (32-bit) value when + * 32-bit texture coordinates are used. + */ + case nir_tex_src_backend1: + assert(!got_lod && !got_bias); + got_lod = true; + + assert(instr->op == nir_texop_txl || instr->op == nir_texop_txb); + srcs[TEX_LOGICAL_SRC_LOD] = + retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_F); + break; + + default: + unreachable("unknown texture source"); + } + } + + /* If the surface or sampler were not specified through sources, use the + * instruction index. + */ + if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE && + srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE) + srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(instr->texture_index); + if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE && + srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE) + srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(instr->sampler_index); + + if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE && + (instr->op == nir_texop_txf_ms || + instr->op == nir_texop_samples_identical)) { + if (devinfo->ver >= 7) { + srcs[TEX_LOGICAL_SRC_MCS] = + emit_mcs_fetch(ntb, srcs[TEX_LOGICAL_SRC_COORDINATE], + instr->coord_components, + srcs[TEX_LOGICAL_SRC_SURFACE], + srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]); + } else { + srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u); + } + } + + srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components); + srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components); + + enum opcode opcode; + switch (instr->op) { + case nir_texop_tex: + opcode = SHADER_OPCODE_TEX_LOGICAL; + break; + case nir_texop_txb: + opcode = FS_OPCODE_TXB_LOGICAL; + break; + case nir_texop_txl: + opcode = 
SHADER_OPCODE_TXL_LOGICAL; + break; + case nir_texop_txd: + opcode = SHADER_OPCODE_TXD_LOGICAL; + break; + case nir_texop_txf: + opcode = SHADER_OPCODE_TXF_LOGICAL; + break; + case nir_texop_txf_ms: + /* On Gfx12HP there is only CMS_W available. From the Bspec: Shared + * Functions - 3D Sampler - Messages - Message Format: + * + * ld2dms REMOVEDBY(GEN:HAS:1406788836) + */ + if (devinfo->verx10 >= 125) + opcode = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL; + else if (devinfo->ver >= 9) + opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL; + else + opcode = SHADER_OPCODE_TXF_CMS_LOGICAL; + break; + case nir_texop_txf_ms_mcs_intel: + opcode = SHADER_OPCODE_TXF_MCS_LOGICAL; + break; + case nir_texop_query_levels: + case nir_texop_txs: + opcode = SHADER_OPCODE_TXS_LOGICAL; + break; + case nir_texop_lod: + opcode = SHADER_OPCODE_LOD_LOGICAL; + break; + case nir_texop_tg4: + if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE) + opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL; + else + opcode = SHADER_OPCODE_TG4_LOGICAL; + break; + case nir_texop_texture_samples: + opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL; + break; + case nir_texop_samples_identical: { + fs_reg dst = retype(get_nir_def(ntb, instr->def), BRW_REGISTER_TYPE_D); + + /* If mcs is an immediate value, it means there is no MCS. In that case + * just return false. 
+ */ + if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) { + bld.MOV(dst, brw_imm_ud(0u)); + } else if (devinfo->ver >= 9) { + fs_reg tmp = s.vgrf(glsl_uint_type()); + bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS], + offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1)); + bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ); + } else { + bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u), + BRW_CONDITIONAL_EQ); + } + return; + } + default: + unreachable("unknown texture opcode"); + } + + if (instr->op == nir_texop_tg4) { + if (instr->component == 1 && + s.key_tex->gather_channel_quirk_mask & (1 << instr->texture_index)) { + /* gather4 sampler is broken for green channel on RG32F -- + * we must ask for blue instead. + */ + header_bits |= 2 << 16; + } else { + header_bits |= instr->component << 16; + } + } + + fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse); + fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); + inst->offset = header_bits; + + const unsigned dest_size = nir_tex_instr_dest_size(instr); + if (devinfo->ver >= 9 && + instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) { + unsigned write_mask = nir_def_components_read(&instr->def); + assert(write_mask != 0); /* dead code should have been eliminated */ + if (instr->is_sparse) { + inst->size_written = (util_last_bit(write_mask) - 1) * + inst->dst.component_size(inst->exec_size) + + (reg_unit(devinfo) * REG_SIZE); + } else { + inst->size_written = util_last_bit(write_mask) * + inst->dst.component_size(inst->exec_size); + } + } else { + inst->size_written = 4 * inst->dst.component_size(inst->exec_size) + + (instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0); + } + + if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE) + inst->shadow_compare = true; + + /* Wa_14012688258: + * + * Don't trim zeros at the end of payload for sample operations + * in cube and cube arrays. 
+ */ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && + intel_needs_workaround(devinfo, 14012688258)) { + + /* Compiler should send U,V,R parameters even if V,R are 0. */ + if (srcs[TEX_LOGICAL_SRC_COORDINATE].file != BAD_FILE) + assert(instr->coord_components >= 3u); + + /* See opt_zero_samples(). */ + inst->keep_payload_trailing_zeros = true; + } + + fs_reg nir_dest[5]; + for (unsigned i = 0; i < dest_size; i++) + nir_dest[i] = offset(dst, bld, i); + + if (instr->op == nir_texop_query_levels) { + /* # levels is in .w */ + if (devinfo->ver <= 9) { + /** + * Wa_1940217: + * + * When a surface of type SURFTYPE_NULL is accessed by resinfo, the + * MIPCount returned is undefined instead of 0. + */ + fs_inst *mov = bld.MOV(bld.null_reg_d(), dst); + mov->conditional_mod = BRW_CONDITIONAL_NZ; + nir_dest[0] = bld.vgrf(BRW_REGISTER_TYPE_D); + fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), brw_imm_d(0)); + sel->predicate = BRW_PREDICATE_NORMAL; + } else { + nir_dest[0] = offset(dst, bld, 3); + } + } else if (instr->op == nir_texop_txs && + dest_size >= 3 && devinfo->ver < 7) { + /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */ + fs_reg depth = offset(dst, bld, 2); + nir_dest[2] = s.vgrf(glsl_int_type()); + bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE); + } + + /* The residency bits are only in the first component. 
*/ + if (instr->is_sparse) + nir_dest[dest_size - 1] = component(offset(dst, bld, dest_size - 1), 0); + + bld.LOAD_PAYLOAD(get_nir_def(ntb, instr->def), nir_dest, dest_size, 0); +} + +static void +fs_nir_emit_jump(nir_to_brw_state &ntb, nir_jump_instr *instr) +{ + switch (instr->type) { + case nir_jump_break: + ntb.bld.emit(BRW_OPCODE_BREAK); + break; + case nir_jump_continue: + ntb.bld.emit(BRW_OPCODE_CONTINUE); + break; + case nir_jump_halt: + ntb.bld.emit(BRW_OPCODE_HALT); + break; + case nir_jump_return: + default: + unreachable("unknown jump"); + } +} + +/* + * This helper takes a source register and un/shuffles it into the destination + * register. + * + * If source type size is smaller than destination type size the operation + * needed is a component shuffle. The opposite case would be an unshuffle. If + * source/destination type size is equal a shuffle is done that would be + * equivalent to a simple MOV. + * + * For example, if source is a 16-bit type and destination is 32-bit. A 3 + * components .xyz 16-bit vector on SIMD8 would be. + * + * |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8| + * |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | | + * + * This helper will return the following 2 32-bit components with the 16-bit + * values shuffled: + * + * |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8| + * |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 | + * + * For unshuffle, the example would be the opposite, a 64-bit type source + * and a 32-bit destination. 
A 2 component .xy 64-bit vector on SIMD8 + * would be: + * + * | x1l x1h | x2l x2h | x3l x3h | x4l x4h | + * | x5l x5h | x6l x6h | x7l x7h | x8l x8h | + * | y1l y1h | y2l y2h | y3l y3h | y4l y4h | + * | y5l y5h | y6l y6h | y7l y7h | y8l y8h | + * + * The returned result would be the following 4 32-bit components unshuffled: + * + * | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l | + * | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h | + * | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l | + * | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h | + * + * - Source and destination register must not be overlapped. + * - components units are measured in terms of the smaller type between + * source and destination because we are un/shuffling the smaller + * components from/into the bigger ones. + * - first_component parameter allows skipping source components. + */ +void +shuffle_src_to_dst(const fs_builder &bld, + const fs_reg &dst, + const fs_reg &src, + uint32_t first_component, + uint32_t components) +{ + if (type_sz(src.type) == type_sz(dst.type)) { + assert(!regions_overlap(dst, + type_sz(dst.type) * bld.dispatch_width() * components, + offset(src, bld, first_component), + type_sz(src.type) * bld.dispatch_width() * components)); + for (unsigned i = 0; i < components; i++) { + bld.MOV(retype(offset(dst, bld, i), src.type), + offset(src, bld, i + first_component)); + } + } else if (type_sz(src.type) < type_sz(dst.type)) { + /* Source is shuffled into destination */ + unsigned size_ratio = type_sz(dst.type) / type_sz(src.type); + assert(!regions_overlap(dst, + type_sz(dst.type) * bld.dispatch_width() * + DIV_ROUND_UP(components, size_ratio), + offset(src, bld, first_component), + type_sz(src.type) * bld.dispatch_width() * components)); + + brw_reg_type shuffle_type = + brw_reg_type_from_bit_size(8 * type_sz(src.type), + BRW_REGISTER_TYPE_D); + for (unsigned i = 0; i < components; i++) { + fs_reg shuffle_component_i = + subscript(offset(dst, bld, i / size_ratio), + 
shuffle_type, i % size_ratio); + bld.MOV(shuffle_component_i, + retype(offset(src, bld, i + first_component), shuffle_type)); + } + } else { + /* Source is unshuffled into destination */ + unsigned size_ratio = type_sz(src.type) / type_sz(dst.type); + assert(!regions_overlap(dst, + type_sz(dst.type) * bld.dispatch_width() * components, + offset(src, bld, first_component / size_ratio), + type_sz(src.type) * bld.dispatch_width() * + DIV_ROUND_UP(components + (first_component % size_ratio), + size_ratio))); + + brw_reg_type shuffle_type = + brw_reg_type_from_bit_size(8 * type_sz(dst.type), + BRW_REGISTER_TYPE_D); + for (unsigned i = 0; i < components; i++) { + fs_reg shuffle_component_i = + subscript(offset(src, bld, (first_component + i) / size_ratio), + shuffle_type, (first_component + i) % size_ratio); + bld.MOV(retype(offset(dst, bld, i), shuffle_type), + shuffle_component_i); + } + } +} + +void +shuffle_from_32bit_read(const fs_builder &bld, + const fs_reg &dst, + const fs_reg &src, + uint32_t first_component, + uint32_t components) +{ + assert(type_sz(src.type) == 4); + + /* This function takes components in units of the destination type while + * shuffle_src_to_dst takes components in units of the smallest type + */ + if (type_sz(dst.type) > 4) { + assert(type_sz(dst.type) == 8); + first_component *= 2; + components *= 2; + } + + shuffle_src_to_dst(bld, dst, src, first_component, components); +} + +fs_reg +setup_imm_df(const fs_builder &bld, double v) +{ + const struct intel_device_info *devinfo = bld.shader->devinfo; + assert(devinfo->ver >= 7); + + if (devinfo->ver >= 8) + return brw_imm_df(v); + + /* gfx7.5 does not support DF immediates straightforward but the DIM + * instruction allows to set the 64-bit immediate value. 
+ */ + if (devinfo->platform == INTEL_PLATFORM_HSW) { + const fs_builder ubld = bld.exec_all().group(1, 0); + fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1); + ubld.DIM(dst, brw_imm_df(v)); + return component(dst, 0); + } + + /* gfx7 does not support DF immediates, so we generate a 64-bit constant by + * writing the low 32-bit of the constant to suboffset 0 of a VGRF and + * the high 32-bit to suboffset 4 and then applying a stride of 0. + * + * Alternatively, we could also produce a normal VGRF (without stride 0) + * by writing to all the channels in the VGRF, however, that would hit the + * gfx7 bug where we have to split writes that span more than 1 register + * into instructions with a width of 4 (otherwise the write to the second + * register written runs into an execmask hardware bug) which isn't very + * nice. + */ + union { + double d; + struct { + uint32_t i1; + uint32_t i2; + }; + } di; + + di.d = v; + + const fs_builder ubld = bld.exec_all().group(1, 0); + const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); + ubld.MOV(tmp, brw_imm_ud(di.i1)); + ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2)); + + return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0); +} + +fs_reg +setup_imm_b(const fs_builder &bld, int8_t v) +{ + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B); + bld.MOV(tmp, brw_imm_w(v)); + return tmp; +} + +fs_reg +setup_imm_ub(const fs_builder &bld, uint8_t v) +{ + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB); + bld.MOV(tmp, brw_imm_uw(v)); + return tmp; +} + +static void +fs_nir_emit_instr(nir_to_brw_state &ntb, nir_instr *instr) +{ + ntb.bld = ntb.bld.annotate(NULL, instr); + + switch (instr->type) { + case nir_instr_type_alu: + fs_nir_emit_alu(ntb, nir_instr_as_alu(instr), true); + break; + + case nir_instr_type_deref: + unreachable("All derefs should've been lowered"); + break; + + case nir_instr_type_intrinsic: + switch (ntb.s.stage) { + case MESA_SHADER_VERTEX: + fs_nir_emit_vs_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + 
break; + case MESA_SHADER_TESS_CTRL: + fs_nir_emit_tcs_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_TESS_EVAL: + fs_nir_emit_tes_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_GEOMETRY: + fs_nir_emit_gs_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_FRAGMENT: + fs_nir_emit_fs_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_COMPUTE: + case MESA_SHADER_KERNEL: + fs_nir_emit_cs_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_RAYGEN: + case MESA_SHADER_ANY_HIT: + case MESA_SHADER_CLOSEST_HIT: + case MESA_SHADER_MISS: + case MESA_SHADER_INTERSECTION: + case MESA_SHADER_CALLABLE: + fs_nir_emit_bs_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_TASK: + fs_nir_emit_task_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_MESH: + fs_nir_emit_mesh_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + default: + unreachable("unsupported shader stage"); + } + break; + + case nir_instr_type_tex: + fs_nir_emit_texture(ntb, nir_instr_as_tex(instr)); + break; + + case nir_instr_type_load_const: + fs_nir_emit_load_const(ntb, nir_instr_as_load_const(instr)); + break; + + case nir_instr_type_undef: + /* We create a new VGRF for undefs on every use (by handling + * them in get_nir_src()), rather than for each definition. + * This helps register coalescing eliminate MOVs from undef. 
+ */ + break; + + case nir_instr_type_jump: + fs_nir_emit_jump(ntb, nir_instr_as_jump(instr)); + break; + + default: + unreachable("unknown instruction type"); + } +} + +static unsigned +brw_rnd_mode_from_nir(unsigned mode, unsigned *mask) +{ + unsigned brw_mode = 0; + *mask = 0; + + if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) & + mode) { + brw_mode |= BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT; + *mask |= BRW_CR0_RND_MODE_MASK; + } + if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) & + mode) { + brw_mode |= BRW_RND_MODE_RTNE << BRW_CR0_RND_MODE_SHIFT; + *mask |= BRW_CR0_RND_MODE_MASK; + } + if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) { + brw_mode |= BRW_CR0_FP16_DENORM_PRESERVE; + *mask |= BRW_CR0_FP16_DENORM_PRESERVE; + } + if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) { + brw_mode |= BRW_CR0_FP32_DENORM_PRESERVE; + *mask |= BRW_CR0_FP32_DENORM_PRESERVE; + } + if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) { + brw_mode |= BRW_CR0_FP64_DENORM_PRESERVE; + *mask |= BRW_CR0_FP64_DENORM_PRESERVE; + } + if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16) + *mask |= BRW_CR0_FP16_DENORM_PRESERVE; + if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) + *mask |= BRW_CR0_FP32_DENORM_PRESERVE; + if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64) + *mask |= BRW_CR0_FP64_DENORM_PRESERVE; + if (mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE) + *mask |= BRW_CR0_FP_MODE_MASK; + + if (*mask != 0) + assert((*mask & brw_mode) == brw_mode); + + return brw_mode; +} + +static void +emit_shader_float_controls_execution_mode(nir_to_brw_state &ntb) +{ + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + unsigned execution_mode = s.nir->info.float_controls_execution_mode; + if (execution_mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE) + return; + + fs_builder ubld = bld.exec_all().group(1, 0); + 
fs_builder abld = ubld.annotate("shader floats control execution mode"); + unsigned mask, mode = brw_rnd_mode_from_nir(execution_mode, &mask); + + if (mask == 0) + return; + + abld.emit(SHADER_OPCODE_FLOAT_CONTROL_MODE, bld.null_reg_ud(), + brw_imm_d(mode), brw_imm_d(mask)); +} + +void +nir_to_brw(fs_visitor *s) +{ + nir_to_brw_state ntb = { + .s = *s, + .nir = s->nir, + .devinfo = s->devinfo, + .mem_ctx = ralloc_context(NULL), + .bld = fs_builder(s).at_end(), + }; + + emit_shader_float_controls_execution_mode(ntb); + + /* emit the arrays used for inputs and outputs - load/store intrinsics will + * be converted to reads/writes of these arrays + */ + fs_nir_setup_outputs(ntb); + fs_nir_setup_uniforms(ntb.s); + fs_nir_emit_system_values(ntb); + ntb.s.last_scratch = ALIGN(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width; + + fs_nir_emit_impl(ntb, nir_shader_get_entrypoint((nir_shader *)ntb.nir)); + + ntb.bld.emit(SHADER_OPCODE_HALT_TARGET); + + ralloc_free(ntb.mem_ctx); +} + diff --git a/src/intel/compiler/elk/brw_fs_reg_allocate.cpp b/src/intel/compiler/elk/brw_fs_reg_allocate.cpp new file mode 100644 index 00000000000..cc0f4762bc6 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_reg_allocate.cpp @@ -0,0 +1,1412 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt + * + */ + +#include "brw_eu.h" +#include "brw_fs.h" +#include "brw_fs_builder.h" +#include "brw_cfg.h" +#include "util/set.h" +#include "util/register_allocate.h" + +using namespace brw; + +#define REG_CLASS_COUNT 20 + +static void +assign_reg(const struct intel_device_info *devinfo, + unsigned *reg_hw_locations, fs_reg *reg) +{ + if (reg->file == VGRF) { + reg->nr = reg_unit(devinfo) * reg_hw_locations[reg->nr] + reg->offset / REG_SIZE; + reg->offset %= REG_SIZE; + } +} + +void +fs_visitor::assign_regs_trivial() +{ + unsigned hw_reg_mapping[this->alloc.count + 1]; + unsigned i; + int reg_width = dispatch_width / 8; + + /* Note that compressed instructions require alignment to 2 registers. */ + hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width); + for (i = 1; i <= this->alloc.count; i++) { + hw_reg_mapping[i] = (hw_reg_mapping[i - 1] + + DIV_ROUND_UP(this->alloc.sizes[i - 1], + reg_unit(devinfo))); + } + this->grf_used = hw_reg_mapping[this->alloc.count]; + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + assign_reg(devinfo, hw_reg_mapping, &inst->dst); + for (i = 0; i < inst->sources; i++) { + assign_reg(devinfo, hw_reg_mapping, &inst->src[i]); + } + } + + if (this->grf_used >= max_grf) { + fail("Ran out of regs on trivial allocator (%d/%d)\n", + this->grf_used, max_grf); + } else { + this->alloc.count = this->grf_used; + } + +} + +/** + * Size of a register from the aligned_bary_class register class. 
+ */ +static unsigned +aligned_bary_size(unsigned dispatch_width) +{ + return (dispatch_width == 8 ? 2 : 4); +} + +static void +brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width) +{ + const struct intel_device_info *devinfo = compiler->devinfo; + int base_reg_count = BRW_MAX_GRF; + const int index = util_logbase2(dispatch_width / 8); + + if (dispatch_width > 8 && devinfo->ver >= 7) { + /* For IVB+, we don't need the PLN hacks or the even-reg alignment in + * SIMD16. Therefore, we can use the exact same register sets for + * SIMD16 as we do for SIMD8 and we don't need to recalculate them. + */ + compiler->fs_reg_sets[index] = compiler->fs_reg_sets[0]; + return; + } + + /* The registers used to make up almost all values handled in the compiler + * are a scalar value occupying a single register (or 2 registers in the + * case of SIMD16, which is handled by dividing base_reg_count by 2 and + * multiplying allocated register numbers by 2). Things that were + * aggregates of scalar values at the GLSL level were split to scalar + * values by split_virtual_grfs(). + * + * However, texture SEND messages return a series of contiguous registers + * to write into. We currently always ask for 4 registers, but we may + * convert that to use less some day. + * + * Additionally, on gfx5 we need aligned pairs of registers for the PLN + * instruction, and on gfx4 we need 8 contiguous regs for workaround simd16 + * texturing. 
+ */ + assert(REG_CLASS_COUNT == MAX_VGRF_SIZE(devinfo) / reg_unit(devinfo)); + int class_sizes[REG_CLASS_COUNT]; + for (unsigned i = 0; i < REG_CLASS_COUNT; i++) + class_sizes[i] = i + 1; + + struct ra_regs *regs = ra_alloc_reg_set(compiler, BRW_MAX_GRF, false); + if (devinfo->ver >= 6) + ra_set_allocate_round_robin(regs); + struct ra_class **classes = ralloc_array(compiler, struct ra_class *, + REG_CLASS_COUNT); + struct ra_class *aligned_bary_class = NULL; + + /* Now, make the register classes for each size of contiguous register + * allocation we might need to make. + */ + for (int i = 0; i < REG_CLASS_COUNT; i++) { + classes[i] = ra_alloc_contig_reg_class(regs, class_sizes[i]); + + if (devinfo->ver <= 5 && dispatch_width >= 16) { + /* From the G45 PRM: + * + * In order to reduce the hardware complexity, the following + * rules and restrictions apply to the compressed instruction: + * ... + * * Operand Alignment Rule: With the exceptions listed below, a + * source/destination operand in general should be aligned to + * even 256-bit physical register with a region size equal to + * two 256-bit physical register + */ + for (int reg = 0; reg <= base_reg_count - class_sizes[i]; reg += 2) + ra_class_add_reg(classes[i], reg); + } else { + for (int reg = 0; reg <= base_reg_count - class_sizes[i]; reg++) + ra_class_add_reg(classes[i], reg); + } + } + + /* Add a special class for aligned barycentrics, which we'll put the + * first source of LINTERP on so that we can do PLN on Gen <= 6. 
+ */ + if (devinfo->has_pln && (devinfo->ver == 6 || + (dispatch_width == 8 && devinfo->ver <= 5))) { + int contig_len = aligned_bary_size(dispatch_width); + aligned_bary_class = ra_alloc_contig_reg_class(regs, contig_len); + + for (int i = 0; i <= base_reg_count - contig_len; i += 2) + ra_class_add_reg(aligned_bary_class, i); + } + + ra_set_finalize(regs, NULL); + + compiler->fs_reg_sets[index].regs = regs; + for (unsigned i = 0; i < ARRAY_SIZE(compiler->fs_reg_sets[index].classes); i++) + compiler->fs_reg_sets[index].classes[i] = NULL; + for (int i = 0; i < REG_CLASS_COUNT; i++) + compiler->fs_reg_sets[index].classes[class_sizes[i] - 1] = classes[i]; + compiler->fs_reg_sets[index].aligned_bary_class = aligned_bary_class; +} + +void +brw_fs_alloc_reg_sets(struct brw_compiler *compiler) +{ + brw_alloc_reg_set(compiler, 8); + brw_alloc_reg_set(compiler, 16); + brw_alloc_reg_set(compiler, 32); +} + +static int +count_to_loop_end(const bblock_t *block) +{ + if (block->end()->opcode == BRW_OPCODE_WHILE) + return block->end_ip; + + int depth = 1; + /* Skip the first block, since we don't want to count the do the calling + * function found. + */ + for (block = block->next(); + depth > 0; + block = block->next()) { + if (block->start()->opcode == BRW_OPCODE_DO) + depth++; + if (block->end()->opcode == BRW_OPCODE_WHILE) { + depth--; + if (depth == 0) + return block->end_ip; + } + } + unreachable("not reached"); +} + +void fs_visitor::calculate_payload_ranges(unsigned payload_node_count, + int *payload_last_use_ip) const +{ + int loop_depth = 0; + int loop_end_ip = 0; + + for (unsigned i = 0; i < payload_node_count; i++) + payload_last_use_ip[i] = -1; + + int ip = 0; + foreach_block_and_inst(block, fs_inst, inst, cfg) { + switch (inst->opcode) { + case BRW_OPCODE_DO: + loop_depth++; + + /* Since payload regs are deffed only at the start of the shader + * execution, any uses of the payload within a loop mean the live + * interval extends to the end of the outermost loop. 
Find the ip of + * the end now. + */ + if (loop_depth == 1) + loop_end_ip = count_to_loop_end(block); + break; + case BRW_OPCODE_WHILE: + loop_depth--; + break; + default: + break; + } + + int use_ip; + if (loop_depth > 0) + use_ip = loop_end_ip; + else + use_ip = ip; + + /* Note that UNIFORM args have been turned into FIXED_GRF by + * assign_curbe_setup(), and interpolation uses fixed hardware regs from + * the start (see interp_reg()). + */ + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == FIXED_GRF) { + unsigned reg_nr = inst->src[i].nr; + if (reg_nr / reg_unit(devinfo) >= payload_node_count) + continue; + + for (unsigned j = reg_nr / reg_unit(devinfo); + j < DIV_ROUND_UP(reg_nr + regs_read(inst, i), + reg_unit(devinfo)); + j++) { + payload_last_use_ip[j] = use_ip; + assert(j < payload_node_count); + } + } + } + + if (inst->dst.file == FIXED_GRF) { + unsigned reg_nr = inst->dst.nr; + if (reg_nr / reg_unit(devinfo) < payload_node_count) { + for (unsigned j = reg_nr / reg_unit(devinfo); + j < DIV_ROUND_UP(reg_nr + regs_written(inst), + reg_unit(devinfo)); + j++) { + payload_last_use_ip[j] = use_ip; + assert(j < payload_node_count); + } + } + } + + /* Special case instructions which have extra implied registers used. */ + switch (inst->opcode) { + case CS_OPCODE_CS_TERMINATE: + payload_last_use_ip[0] = use_ip; + break; + + default: + if (inst->eot) { + /* We could omit this for the !inst->header_present case, except + * that the simulator apparently incorrectly reads from g0/g1 + * instead of sideband. It also really freaks out driver + * developers to see g0 used in unusual places, so just always + * reserve it. 
+ */ + payload_last_use_ip[0] = use_ip; + payload_last_use_ip[1] = use_ip; + } + break; + } + + ip++; + } +} + +class fs_reg_alloc { +public: + fs_reg_alloc(fs_visitor *fs): + fs(fs), devinfo(fs->devinfo), compiler(fs->compiler), + live(fs->live_analysis.require()), g(NULL), + have_spill_costs(false) + { + mem_ctx = ralloc_context(NULL); + + /* Stash the number of instructions so we can sanity check that our + * counts still match liveness. + */ + live_instr_count = fs->cfg->last_block()->end_ip + 1; + + spill_insts = _mesa_pointer_set_create(mem_ctx); + + /* Most of this allocation was written for a reg_width of 1 + * (dispatch_width == 8). In extending to SIMD16, the code was + * left in place and it was converted to have the hardware + * registers it's allocating be contiguous physical pairs of regs + * for reg_width == 2. + */ + int reg_width = fs->dispatch_width / 8; + rsi = util_logbase2(reg_width); + payload_node_count = ALIGN(fs->first_non_payload_grf, reg_width); + + /* Get payload IP information */ + payload_last_use_ip = ralloc_array(mem_ctx, int, payload_node_count); + + node_count = 0; + first_payload_node = 0; + first_mrf_hack_node = 0; + scratch_header_node = 0; + grf127_send_hack_node = 0; + first_vgrf_node = 0; + last_vgrf_node = 0; + first_spill_node = 0; + + spill_vgrf_ip = NULL; + spill_vgrf_ip_alloc = 0; + spill_node_count = 0; + } + + ~fs_reg_alloc() + { + ralloc_free(mem_ctx); + } + + bool assign_regs(bool allow_spilling, bool spill_all); + +private: + void setup_live_interference(unsigned node, + int node_start_ip, int node_end_ip); + void setup_inst_interference(const fs_inst *inst); + + void build_interference_graph(bool allow_spilling); + void discard_interference_graph(); + + fs_reg build_lane_offsets(const fs_builder &bld, + uint32_t spill_offset, int ip); + fs_reg build_single_offset(const fs_builder &bld, + uint32_t spill_offset, int ip); + + void emit_unspill(const fs_builder &bld, struct shader_stats *stats, + fs_reg dst, uint32_t 
spill_offset, unsigned count, int ip); + void emit_spill(const fs_builder &bld, struct shader_stats *stats, + fs_reg src, uint32_t spill_offset, unsigned count, int ip); + + void set_spill_costs(); + int choose_spill_reg(); + fs_reg alloc_scratch_header(); + fs_reg alloc_spill_reg(unsigned size, int ip); + void spill_reg(unsigned spill_reg); + + void *mem_ctx; + fs_visitor *fs; + const intel_device_info *devinfo; + const brw_compiler *compiler; + const fs_live_variables &live; + int live_instr_count; + + set *spill_insts; + + /* Which compiler->fs_reg_sets[] to use */ + int rsi; + + ra_graph *g; + bool have_spill_costs; + + int payload_node_count; + int *payload_last_use_ip; + + int node_count; + int first_payload_node; + int first_mrf_hack_node; + int scratch_header_node; + int grf127_send_hack_node; + int first_vgrf_node; + int last_vgrf_node; + int first_spill_node; + + int *spill_vgrf_ip; + int spill_vgrf_ip_alloc; + int spill_node_count; + + fs_reg scratch_header; +}; + +/** + * Sets the mrf_used array to indicate which MRFs are used by the shader IR + * + * This is used in assign_regs() to decide which of the GRFs that we use as + * MRFs on gfx7 get normally register allocated, and in register spilling to + * see if we can actually use MRFs to do spills without overwriting normal MRF + * contents. 
+ */ +static void +get_used_mrfs(const fs_visitor *v, bool *mrf_used) +{ + int reg_width = v->dispatch_width / 8; + + memset(mrf_used, 0, BRW_MAX_MRF(v->devinfo->ver) * sizeof(bool)); + + foreach_block_and_inst(block, fs_inst, inst, v->cfg) { + if (inst->dst.file == MRF) { + int reg = inst->dst.nr & ~BRW_MRF_COMPR4; + mrf_used[reg] = true; + if (reg_width == 2) { + if (inst->dst.nr & BRW_MRF_COMPR4) { + mrf_used[reg + 4] = true; + } else { + mrf_used[reg + 1] = true; + } + } + } + + if (inst->mlen > 0) { + for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { + mrf_used[inst->base_mrf + i] = true; + } + } + } +} + +namespace { + /** + * Maximum spill block size we expect to encounter in 32B units. + * + * This is somewhat arbitrary and doesn't necessarily limit the maximum + * variable size that can be spilled -- A higher value will allow a + * variable of a given size to be spilled more efficiently with a smaller + * number of scratch messages, but will increase the likelihood of a + * collision between the MRFs reserved for spilling and other MRFs used by + * the program (and possibly increase GRF register pressure on platforms + * without hardware MRFs), what could cause register allocation to fail. + * + * For the moment reserve just enough space so a register of 32 bit + * component type and natural region width can be spilled without splitting + * into multiple (force_writemask_all) scratch messages. + */ + unsigned + spill_max_size(const backend_shader *s) + { + /* LSC is limited to SIMD16 sends */ + if (s->devinfo->has_lsc) + return 2; + + /* FINISHME - On Gfx7+ it should be possible to avoid this limit + * altogether by spilling directly from the temporary GRF + * allocated to hold the result of the instruction (and the + * scratch write header). + */ + /* FINISHME - The shader's dispatch width probably belongs in + * backend_shader (or some nonexistent fs_shader class?) + * rather than in the visitor class. 
+ */ + return static_cast(s)->dispatch_width / 8; + } + + /** + * First MRF register available for spilling. + */ + unsigned + spill_base_mrf(const backend_shader *s) + { + /* We don't use the MRF hack on Gfx9+ */ + assert(s->devinfo->ver < 9); + return BRW_MAX_MRF(s->devinfo->ver) - spill_max_size(s) - 1; + } +} + +void +fs_reg_alloc::setup_live_interference(unsigned node, + int node_start_ip, int node_end_ip) +{ + /* Mark any virtual grf that is live between the start of the program and + * the last use of a payload node interfering with that payload node. + */ + for (int i = 0; i < payload_node_count; i++) { + if (payload_last_use_ip[i] == -1) + continue; + + /* Note that we use a <= comparison, unlike vgrfs_interfere(), + * in order to not have to worry about the uniform issue described in + * calculate_live_intervals(). + */ + if (node_start_ip <= payload_last_use_ip[i]) + ra_add_node_interference(g, node, first_payload_node + i); + } + + /* If we have the MRF hack enabled, mark this node as interfering with all + * MRF registers. + */ + if (first_mrf_hack_node >= 0) { + for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->ver); i++) + ra_add_node_interference(g, node, first_mrf_hack_node + i); + } + + /* Everything interferes with the scratch header */ + if (scratch_header_node >= 0) + ra_add_node_interference(g, node, scratch_header_node); + + /* Add interference with every vgrf whose live range intersects this + * node's. We only need to look at nodes below this one as the reflexivity + * of interference will take care of the rest. 
+ */ + for (unsigned n2 = first_vgrf_node; + n2 <= (unsigned)last_vgrf_node && n2 < node; n2++) { + unsigned vgrf = n2 - first_vgrf_node; + if (!(node_end_ip <= live.vgrf_start[vgrf] || + live.vgrf_end[vgrf] <= node_start_ip)) + ra_add_node_interference(g, node, n2); + } +} + +void +fs_reg_alloc::setup_inst_interference(const fs_inst *inst) +{ + /* Certain instructions can't safely use the same register for their + * sources and destination. Add interference. + */ + if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) { + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + ra_add_node_interference(g, first_vgrf_node + inst->dst.nr, + first_vgrf_node + inst->src[i].nr); + } + } + } + + /* A compressed instruction is actually two instructions executed + * simultaneously. On most platforms, it ok to have the source and + * destination registers be the same. In this case, each instruction + * over-writes its own source and there's no problem. The real problem + * here is if the source and destination registers are off by one. Then + * you can end up in a scenario where the first instruction over-writes the + * source of the second instruction. Since the compiler doesn't know about + * this level of granularity, we simply make the source and destination + * interfere. + */ + if (inst->dst.component_size(inst->exec_size) > REG_SIZE && + inst->dst.file == VGRF) { + for (int i = 0; i < inst->sources; ++i) { + if (inst->src[i].file == VGRF) { + ra_add_node_interference(g, first_vgrf_node + inst->dst.nr, + first_vgrf_node + inst->src[i].nr); + } + } + } + + if (grf127_send_hack_node >= 0) { + /* At Intel Broadwell PRM, vol 07, section "Instruction Set Reference", + * subsection "EUISA Instructions", Send Message (page 990): + * + * "r127 must not be used for return address when there is a src and + * dest overlap in send instruction." 
+ * + * We are avoiding using grf127 as part of the destination of send + * messages adding a node interference to the grf127_send_hack_node. + * This node has a fixed assignment to grf127. + * + * We don't apply it to SIMD16 instructions because previous code avoids + * any register overlap between sources and destination. + */ + if (inst->exec_size < 16 && inst->is_send_from_grf() && + inst->dst.file == VGRF) + ra_add_node_interference(g, first_vgrf_node + inst->dst.nr, + grf127_send_hack_node); + + /* Spilling instruction are generated as SEND messages from MRF but as + * Gfx7+ supports sending from GRF the driver will maps assingn these + * MRF registers to a GRF. Implementations reuses the dest of the send + * message as source. So as we will have an overlap for sure, we create + * an interference between destination and grf127. + */ + if ((inst->opcode == SHADER_OPCODE_GFX7_SCRATCH_READ || + inst->opcode == SHADER_OPCODE_GFX4_SCRATCH_READ) && + inst->dst.file == VGRF) + ra_add_node_interference(g, first_vgrf_node + inst->dst.nr, + grf127_send_hack_node); + } + + /* From the Skylake PRM Vol. 2a docs for sends: + * + * "It is required that the second block of GRFs does not overlap with + * the first block." + * + * Normally, this is taken care of by fixup_sends_duplicate_payload() but + * in the case where one of the registers is an undefined value, the + * register allocator may decide that they don't interfere even though + * they're used as sources in the same instruction. We also need to add + * interference here. + */ + if (devinfo->ver >= 9) { + if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 && + inst->src[2].file == VGRF && inst->src[3].file == VGRF && + inst->src[2].nr != inst->src[3].nr) + ra_add_node_interference(g, first_vgrf_node + inst->src[2].nr, + first_vgrf_node + inst->src[3].nr); + } + + /* When we do send-from-GRF for FB writes, we need to ensure that the last + * write instruction sends from a high register. 
This is because the + * vertex fetcher wants to start filling the low payload registers while + * the pixel data port is still working on writing out the memory. If we + * don't do this, we get rendering artifacts. + * + * We could just do "something high". Instead, we just pick the highest + * register that works. + */ + if (inst->eot) { + const int vgrf = inst->opcode == SHADER_OPCODE_SEND ? + inst->src[2].nr : inst->src[0].nr; + const int size = DIV_ROUND_UP(fs->alloc.sizes[vgrf], reg_unit(devinfo)); + int reg = BRW_MAX_GRF - size; + + if (first_mrf_hack_node >= 0) { + /* If something happened to spill, we want to push the EOT send + * register early enough in the register file that we don't + * conflict with any used MRF hack registers. + */ + reg -= BRW_MAX_MRF(devinfo->ver) - spill_base_mrf(fs); + } else if (grf127_send_hack_node >= 0) { + /* Avoid r127 which might be unusable if the node was previously + * written by a SIMD8 SEND message with source/destination overlap. + */ + reg--; + } + + ra_set_node_reg(g, first_vgrf_node + vgrf, reg); + + if (inst->ex_mlen > 0) { + const int vgrf = inst->src[3].nr; + reg -= DIV_ROUND_UP(fs->alloc.sizes[vgrf], reg_unit(devinfo)); + ra_set_node_reg(g, first_vgrf_node + vgrf, reg); + } + } +} + +void +fs_reg_alloc::build_interference_graph(bool allow_spilling) +{ + /* Compute the RA node layout */ + node_count = 0; + first_payload_node = node_count; + node_count += payload_node_count; + if (devinfo->ver >= 7 && devinfo->ver < 9 && allow_spilling) { + first_mrf_hack_node = node_count; + node_count += BRW_MAX_GRF - GFX7_MRF_HACK_START; + } else { + first_mrf_hack_node = -1; + } + if (devinfo->ver >= 8) { + grf127_send_hack_node = node_count; + node_count ++; + } else { + grf127_send_hack_node = -1; + } + first_vgrf_node = node_count; + node_count += fs->alloc.count; + last_vgrf_node = node_count - 1; + if ((devinfo->ver >= 9 && devinfo->verx10 < 125) && allow_spilling) { + scratch_header_node = node_count++; + } else { + 
scratch_header_node = -1; + } + first_spill_node = node_count; + + fs->calculate_payload_ranges(payload_node_count, + payload_last_use_ip); + + assert(g == NULL); + g = ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count); + ralloc_steal(mem_ctx, g); + + /* Set up the payload nodes */ + for (int i = 0; i < payload_node_count; i++) + ra_set_node_reg(g, first_payload_node + i, i); + + if (first_mrf_hack_node >= 0) { + /* Mark each MRF reg node as being allocated to its physical + * register. + * + * The alternative would be to have per-physical-register classes, + * which would just be silly. + */ + for (int i = 0; i < BRW_MAX_MRF(devinfo->ver); i++) { + ra_set_node_reg(g, first_mrf_hack_node + i, + GFX7_MRF_HACK_START + i); + } + } + + if (grf127_send_hack_node >= 0) + ra_set_node_reg(g, grf127_send_hack_node, 127); + + /* Specify the classes of each virtual register. */ + for (unsigned i = 0; i < fs->alloc.count; i++) { + unsigned size = DIV_ROUND_UP(fs->alloc.sizes[i], reg_unit(devinfo)); + + assert(size <= ARRAY_SIZE(compiler->fs_reg_sets[rsi].classes) && + "Register allocation relies on split_virtual_grfs()"); + + ra_set_node_class(g, first_vgrf_node + i, + compiler->fs_reg_sets[rsi].classes[size - 1]); + } + + /* Special case: on pre-Gfx7 hardware that supports PLN, the second operand + * of a PLN instruction needs to be an even-numbered register, so we have a + * special register class aligned_bary_class to handle this case. 
+ */ + if (compiler->fs_reg_sets[rsi].aligned_bary_class) { + foreach_block_and_inst(block, fs_inst, inst, fs->cfg) { + if (inst->opcode == FS_OPCODE_LINTERP && inst->src[0].file == VGRF && + fs->alloc.sizes[inst->src[0].nr] == + aligned_bary_size(fs->dispatch_width)) { + ra_set_node_class(g, first_vgrf_node + inst->src[0].nr, + compiler->fs_reg_sets[rsi].aligned_bary_class); + } + } + } + + /* Add interference based on the live range of the register */ + for (unsigned i = 0; i < fs->alloc.count; i++) { + setup_live_interference(first_vgrf_node + i, + live.vgrf_start[i], + live.vgrf_end[i]); + } + + /* Add interference based on the instructions in which a register is used. + */ + foreach_block_and_inst(block, fs_inst, inst, fs->cfg) + setup_inst_interference(inst); +} + +void +fs_reg_alloc::discard_interference_graph() +{ + ralloc_free(g); + g = NULL; + have_spill_costs = false; +} + +fs_reg +fs_reg_alloc::build_single_offset(const fs_builder &bld, uint32_t spill_offset, int ip) +{ + fs_reg offset = retype(alloc_spill_reg(1, ip), BRW_REGISTER_TYPE_UD); + fs_inst *inst = bld.MOV(offset, brw_imm_ud(spill_offset)); + _mesa_set_add(spill_insts, inst); + return offset; +} + +fs_reg +fs_reg_alloc::build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, int ip) +{ + /* LSC messages are limited to SIMD16 */ + assert(bld.dispatch_width() <= 16); + + const fs_builder ubld = bld.exec_all(); + const unsigned reg_count = ubld.dispatch_width() / 8; + + fs_reg offset = retype(alloc_spill_reg(reg_count, ip), BRW_REGISTER_TYPE_UD); + fs_inst *inst; + + /* Build an offset per lane in SIMD8 */ + inst = ubld.group(8, 0).MOV(retype(offset, BRW_REGISTER_TYPE_UW), + brw_imm_uv(0x76543210)); + _mesa_set_add(spill_insts, inst); + inst = ubld.group(8, 0).MOV(offset, retype(offset, BRW_REGISTER_TYPE_UW)); + _mesa_set_add(spill_insts, inst); + + /* Build offsets in the upper 8 lanes of SIMD16 */ + if (ubld.dispatch_width() > 8) { + inst = ubld.group(8, 0).ADD( + byte_offset(offset, 
                     REG_SIZE),
         byte_offset(offset, 0),
         brw_imm_ud(8));
      _mesa_set_add(spill_insts, inst);
   }

   /* Make the offset a dword */
   inst = ubld.SHL(offset, offset, brw_imm_ud(2));
   _mesa_set_add(spill_insts, inst);

   /* Add the base offset */
   inst = ubld.ADD(offset, offset, brw_imm_ud(spill_offset));
   _mesa_set_add(spill_insts, inst);

   return offset;
}

/**
 * Emit scratch-read (fill) messages that reload `count` registers starting
 * at `spill_offset` into `dst`, choosing the message type per platform
 * generation (LSC on verx10 >= 125, stateless dataport on Gfx9+, scratch
 * reads on older parts).  Every emitted instruction is recorded in
 * spill_insts so later passes can recognize spill traffic.
 */
void
fs_reg_alloc::emit_unspill(const fs_builder &bld,
                           struct shader_stats *stats,
                           fs_reg dst,
                           uint32_t spill_offset, unsigned count, int ip)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const unsigned reg_size = dst.component_size(bld.dispatch_width()) /
                             REG_SIZE;
   assert(count % reg_size == 0);

   for (unsigned i = 0; i < count / reg_size; i++) {
      ++stats->fill_count;

      fs_inst *unspill_inst;
      if (devinfo->verx10 >= 125) {
         /* LSC is limited to SIMD16 load/store but we can load more using
          * transpose messages.
          */
         const bool use_transpose = bld.dispatch_width() > 16;
         const fs_builder ubld = use_transpose ? bld.exec_all().group(1, 0) : bld;
         fs_reg offset;
         if (use_transpose) {
            offset = build_single_offset(ubld, spill_offset, ip);
         } else {
            offset = build_lane_offsets(ubld, spill_offset, ip);
         }
         /* We leave the extended descriptor empty and flag the instruction to
          * ask the generator to insert the extended descriptor in the address
          * register.  That way we don't need to burn an additional register
          * for register allocation spill/fill.
          */
         fs_reg srcs[] = {
            brw_imm_ud(0), /* desc */
            brw_imm_ud(0), /* ex_desc */
            offset,        /* payload */
            fs_reg(),      /* payload2 */
         };

         unspill_inst = ubld.emit(SHADER_OPCODE_SEND, dst,
                                  srcs, ARRAY_SIZE(srcs));
         unspill_inst->sfid = GFX12_SFID_UGM;
         unspill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
                                           unspill_inst->exec_size,
                                           LSC_ADDR_SURFTYPE_SS,
                                           LSC_ADDR_SIZE_A32,
                                           1 /* num_coordinates */,
                                           LSC_DATA_SIZE_D32,
                                           use_transpose ? reg_size * 8 : 1 /* num_channels */,
                                           use_transpose,
                                           LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                           true /* has_dest */);
         unspill_inst->header_size = 0;
         unspill_inst->mlen =
            lsc_msg_desc_src0_len(devinfo, unspill_inst->desc);
         unspill_inst->ex_mlen = 0;
         unspill_inst->size_written =
            lsc_msg_desc_dest_len(devinfo, unspill_inst->desc) * REG_SIZE;
         unspill_inst->send_has_side_effects = false;
         unspill_inst->send_is_volatile = true;
         unspill_inst->send_ex_desc_scratch = true;
      } else if (devinfo->ver >= 9) {
         /* Gfx9+: stateless dataport oword block read through the shared
          * scratch header; only the offset dword (component 2) is rewritten
          * per message.
          */
         fs_reg header = this->scratch_header;
         fs_builder ubld = bld.exec_all().group(1, 0);
         assert(spill_offset % 16 == 0);
         unspill_inst = ubld.MOV(component(header, 2),
                                 brw_imm_ud(spill_offset / 16));
         _mesa_set_add(spill_insts, unspill_inst);

         const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
         const fs_reg ex_desc = brw_imm_ud(0);

         fs_reg srcs[] = { brw_imm_ud(0), ex_desc, header };
         unspill_inst = bld.emit(SHADER_OPCODE_SEND, dst,
                                 srcs, ARRAY_SIZE(srcs));
         unspill_inst->mlen = 1;
         unspill_inst->header_size = 1;
         unspill_inst->size_written = reg_size * REG_SIZE;
         unspill_inst->send_has_side_effects = false;
         unspill_inst->send_is_volatile = true;
         unspill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
         unspill_inst->desc =
            brw_dp_desc(devinfo, bti,
                        BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                        BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8));
      } else if (devinfo->ver >= 7 && spill_offset < (1 << 12) * REG_SIZE) {
         /* The Gfx7 descriptor-based offset is 12 bits of HWORD units.
          * Because the Gfx7-style scratch block read is hardwired to BTI 255,
          * on Gfx9+ it would cause the DC to do an IA-coherent read, which
          * largely outweighs the slight advantage from not having to provide
          * the address as part of the message header, so we're better off
          * using plain old oword block reads.
          */
         unspill_inst = bld.emit(SHADER_OPCODE_GFX7_SCRATCH_READ, dst);
         unspill_inst->offset = spill_offset;
      } else {
         unspill_inst = bld.emit(SHADER_OPCODE_GFX4_SCRATCH_READ, dst);
         unspill_inst->offset = spill_offset;
         unspill_inst->base_mrf = spill_base_mrf(bld.shader);
         unspill_inst->mlen = 1; /* header contains offset */
      }
      _mesa_set_add(spill_insts, unspill_inst);

      /* Advance to the next register-sized chunk of dst / scratch. */
      dst.offset += reg_size * REG_SIZE;
      spill_offset += reg_size * REG_SIZE;
   }
}

/**
 * Emit scratch-write (spill) messages that store `count` registers of `src`
 * starting at `spill_offset`, mirroring emit_unspill's per-generation message
 * selection.  All emitted instructions are recorded in spill_insts.
 */
void
fs_reg_alloc::emit_spill(const fs_builder &bld,
                         struct shader_stats *stats,
                         fs_reg src,
                         uint32_t spill_offset, unsigned count, int ip)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const unsigned reg_size = src.component_size(bld.dispatch_width()) /
                             REG_SIZE;
   assert(count % reg_size == 0);

   for (unsigned i = 0; i < count / reg_size; i++) {
      ++stats->spill_count;

      fs_inst *spill_inst;
      if (devinfo->verx10 >= 125) {
         fs_reg offset = build_lane_offsets(bld, spill_offset, ip);
         /* We leave the extended descriptor empty and flag the instruction
          * to relocate the extended descriptor.  That way the surface offset
          * is directly put into the instruction and we don't need to use a
          * register to hold it.
          */
         fs_reg srcs[] = {
            brw_imm_ud(0),        /* desc */
            brw_imm_ud(0),        /* ex_desc */
            offset,               /* payload */
            src,                  /* payload2 */
         };
         spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
                               srcs, ARRAY_SIZE(srcs));
         spill_inst->sfid = GFX12_SFID_UGM;
         spill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
                                         bld.dispatch_width(),
                                         LSC_ADDR_SURFTYPE_SS,
                                         LSC_ADDR_SIZE_A32,
                                         1 /* num_coordinates */,
                                         LSC_DATA_SIZE_D32,
                                         1 /* num_channels */,
                                         false /* transpose */,
                                         /* NOTE(review): cache policy is built
                                          * with the LOAD op even though this is
                                          * a store message -- confirm intended.
                                          */
                                         LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                         false /* has_dest */);
         spill_inst->header_size = 0;
         spill_inst->mlen = lsc_msg_desc_src0_len(devinfo, spill_inst->desc);
         spill_inst->ex_mlen = reg_size;
         spill_inst->size_written = 0;
         spill_inst->send_has_side_effects = true;
         spill_inst->send_is_volatile = false;
         spill_inst->send_ex_desc_scratch = true;
      } else if (devinfo->ver >= 9) {
         /* Gfx9+: stateless dataport oword block write via the shared
          * scratch header (see emit_unspill for the matching read path).
          */
         fs_reg header = this->scratch_header;
         fs_builder ubld = bld.exec_all().group(1, 0);
         assert(spill_offset % 16 == 0);
         spill_inst = ubld.MOV(component(header, 2),
                               brw_imm_ud(spill_offset / 16));
         _mesa_set_add(spill_insts, spill_inst);

         const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
         const fs_reg ex_desc = brw_imm_ud(0);

         fs_reg srcs[] = { brw_imm_ud(0), ex_desc, header, src };
         spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
                               srcs, ARRAY_SIZE(srcs));
         spill_inst->mlen = 1;
         spill_inst->ex_mlen = reg_size;
         spill_inst->size_written = 0;
         spill_inst->header_size = 1;
         spill_inst->send_has_side_effects = true;
         spill_inst->send_is_volatile = false;
         spill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
         spill_inst->desc =
            brw_dp_desc(devinfo, bti,
                        GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE,
                        BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8));
      } else {
         spill_inst = bld.emit(SHADER_OPCODE_GFX4_SCRATCH_WRITE,
                               bld.null_reg_f(), src);
         spill_inst->offset = spill_offset;
         spill_inst->mlen = 1 + reg_size; /* header, value */
         spill_inst->base_mrf = spill_base_mrf(bld.shader);
      }
      _mesa_set_add(spill_insts, spill_inst);

      /* Advance to the next register-sized chunk of src / scratch. */
      src.offset += reg_size * REG_SIZE;
      spill_offset += reg_size * REG_SIZE;
   }
}

/**
 * Compute a spill cost for every VGRF and hand it to the RA graph, weighting
 * uses inside loops more heavily and excluding registers generated by
 * earlier spill traffic.
 */
void
fs_reg_alloc::set_spill_costs()
{
   float block_scale = 1.0;
   float spill_costs[fs->alloc.count];
   bool no_spill[fs->alloc.count];

   for (unsigned i = 0; i < fs->alloc.count; i++) {
      spill_costs[i] = 0.0;
      no_spill[i] = false;
   }

   /* Calculate costs for spilling nodes.  Call it a cost of 1 per
    * spill/unspill we'll have to do, and guess that the insides of
    * loops run 10 times.
    */
   foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            spill_costs[inst->src[i].nr] += regs_read(inst, i) * block_scale;
      }

      if (inst->dst.file == VGRF)
         spill_costs[inst->dst.nr] += regs_written(inst) * block_scale;

      /* Don't spill anything we generated while spilling */
      if (_mesa_set_search(spill_insts, inst)) {
         for (unsigned int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file == VGRF)
               no_spill[inst->src[i].nr] = true;
         }
         if (inst->dst.file == VGRF)
            no_spill[inst->dst.nr] = true;
      }

      /* Track loop/branch nesting to scale the cost of uses inside them. */
      switch (inst->opcode) {

      case BRW_OPCODE_DO:
         block_scale *= 10;
         break;

      case BRW_OPCODE_WHILE:
         block_scale /= 10;
         break;

      case BRW_OPCODE_IF:
      case BRW_OPCODE_IFF:
         block_scale *= 0.5;
         break;

      case BRW_OPCODE_ENDIF:
         block_scale /= 0.5;
         break;

      default:
         break;
      }
   }

   for (unsigned i = 0; i < fs->alloc.count; i++) {
      /* Do the no_spill check first.  Registers that are used as spill
       * temporaries may have been allocated after we calculated liveness so
       * we shouldn't look their liveness up.  Fortunately, they're always
       * used in SCRATCH_READ/WRITE instructions so they'll always be flagged
       * no_spill.
       */
      if (no_spill[i])
         continue;

      int live_length = live.vgrf_end[i] - live.vgrf_start[i];
      if (live_length <= 0)
         continue;

      /* Divide the cost (in number of spills/fills) by the log of the length
       * of the live range of the register.  This will encourage spill logic
       * to spill long-living things before spilling short-lived things where
       * spilling is less likely to actually do us any good.  We use the log
       * of the length because it will fall off very quickly and not cause us
       * to spill medium length registers with more uses.
       */
      float adjusted_cost = spill_costs[i] / logf(live_length);
      ra_set_node_spill_cost(g, first_vgrf_node + i, adjusted_cost);
   }

   have_spill_costs = true;
}

/**
 * Ask the RA graph for the cheapest spill candidate; returns the VGRF index
 * or -1 when nothing is spillable.
 */
int
fs_reg_alloc::choose_spill_reg()
{
   if (!have_spill_costs)
      set_spill_costs();

   int node = ra_get_best_spill_node(g);
   if (node < 0)
      return -1;

   assert(node >= first_vgrf_node);
   return node - first_vgrf_node;
}

/**
 * Allocate the single-register VGRF backing the Gfx9+ scratch message
 * header; it interferes with everything (live over the whole program).
 */
fs_reg
fs_reg_alloc::alloc_scratch_header()
{
   int vgrf = fs->alloc.allocate(1);
   assert(first_vgrf_node + vgrf == scratch_header_node);
   ra_set_node_class(g, scratch_header_node,
                     compiler->fs_reg_sets[rsi].classes[0]);

   setup_live_interference(scratch_header_node, 0, INT_MAX);

   return fs_reg(VGRF, vgrf, BRW_REGISTER_TYPE_UD);
}

/**
 * Allocate a temporary VGRF for spill/fill traffic around instruction `ip`
 * and wire up its interference in the existing RA graph without re-running
 * liveness.
 */
fs_reg
fs_reg_alloc::alloc_spill_reg(unsigned size, int ip)
{
   int vgrf = fs->alloc.allocate(ALIGN(size, reg_unit(devinfo)));
   int class_idx = DIV_ROUND_UP(size, reg_unit(devinfo)) - 1;
   int n = ra_add_node(g, compiler->fs_reg_sets[rsi].classes[class_idx]);
   assert(n == first_vgrf_node + vgrf);
   assert(n == first_spill_node + spill_node_count);

   /* Live only immediately around the instruction being spilled. */
   setup_live_interference(n, ip - 1, ip + 1);

   /* Add interference between this spill node and any other spill nodes for
    * the same instruction.
    */
   for (int s = 0; s < spill_node_count; s++) {
      if (spill_vgrf_ip[s] == ip)
         ra_add_node_interference(g, n, first_spill_node + s);
   }

   /* Add this spill node to the list for next time */
   if (spill_node_count >= spill_vgrf_ip_alloc) {
      if (spill_vgrf_ip_alloc == 0)
         spill_vgrf_ip_alloc = 16;
      else
         spill_vgrf_ip_alloc *= 2;
      spill_vgrf_ip = reralloc(mem_ctx, spill_vgrf_ip, int,
                               spill_vgrf_ip_alloc);
   }
   spill_vgrf_ip[spill_node_count++] = ip;

   return fs_reg(VGRF, vgrf);
}

/**
 * Rewrite every use of `spill_reg` to go through scratch memory: allocate
 * scratch space, replace reads with fills and writes with spills, and update
 * the interference graph in place.
 */
void
fs_reg_alloc::spill_reg(unsigned spill_reg)
{
   int size = fs->alloc.sizes[spill_reg];
   unsigned int spill_offset = fs->last_scratch;
   assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */

   /* Spills may use MRFs 13-15 in the SIMD16 case.  Our texturing is done
    * using up to 11 MRFs starting from either m1 or m2, and fb writes can use
    * up to m13 (gfx6+ simd16: 2 header + 8 color + 2 src0alpha + 2 omask) or
    * m15 (gfx4-5 simd16: 2 header + 8 color + 1 aads + 2 src depth + 2 dst
    * depth), starting from m1.  In summary: We may not be able to spill in
    * SIMD16 mode, because we'd stomp the FB writes.
    */
   if (!fs->spilled_any_registers) {
      if (devinfo->verx10 >= 125) {
         /* We will allocate a register on the fly */
      } else if (devinfo->ver >= 9) {
         this->scratch_header = alloc_scratch_header();
         fs_builder ubld = fs_builder(fs, 8).exec_all().at(
            fs->cfg->first_block(), fs->cfg->first_block()->start());

         fs_inst *inst = ubld.emit(SHADER_OPCODE_SCRATCH_HEADER,
                                   this->scratch_header);
         _mesa_set_add(spill_insts, inst);
      } else {
         bool mrf_used[BRW_MAX_MRF(devinfo->ver)];
         get_used_mrfs(fs, mrf_used);

         for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->ver); i++) {
            if (mrf_used[i]) {
               fs->fail("Register spilling not supported with m%d used", i);
               return;
            }
         }
      }

      fs->spilled_any_registers = true;
   }

   fs->last_scratch += size * REG_SIZE;

   /* We're about to replace all uses of this register.  It no longer
    * conflicts with anything so we can get rid of its interference.
    */
   ra_set_node_spill_cost(g, first_vgrf_node + spill_reg, 0);
   ra_reset_node_interference(g, first_vgrf_node + spill_reg);

   /* Generate spill/unspill instructions for the objects being
    * spilled.  Right now, we spill or unspill the whole thing to a
    * virtual grf of the same size.  For most instructions, though, we
    * could just spill/unspill the GRF being accessed.
    */
   int ip = 0;
   foreach_block_and_inst (block, fs_inst, inst, fs->cfg) {
      const fs_builder ibld = fs_builder(fs, block, inst);
      exec_node *before = inst->prev;
      exec_node *after = inst->next;

      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF &&
             inst->src[i].nr == spill_reg) {
            int count = regs_read(inst, i);
            int subset_spill_offset = spill_offset +
               ROUND_DOWN_TO(inst->src[i].offset, REG_SIZE);
            fs_reg unspill_dst = alloc_spill_reg(count, ip);

            inst->src[i].nr = unspill_dst.nr;
            inst->src[i].offset %= REG_SIZE;

            /* We read the largest power-of-two divisor of the register count
             * (because only POT scratch read blocks are allowed by the
             * hardware) up to the maximum supported block size.
             */
            const unsigned width =
               MIN2(32, 1u << (ffs(MAX2(1, count) * 8) - 1));

            /* Set exec_all() on unspill messages under the (rather
             * pessimistic) assumption that there is no one-to-one
             * correspondence between channels of the spilled variable in
             * scratch space and the scratch read message, which operates on
             * 32 bit channels.  It shouldn't hurt in any case because the
             * unspill destination is a block-local temporary.
             */
            emit_unspill(ibld.exec_all().group(width, 0), &fs->shader_stats,
                         unspill_dst, subset_spill_offset, count, ip);
         }
      }

      if (inst->dst.file == VGRF &&
          inst->dst.nr == spill_reg &&
          inst->opcode != SHADER_OPCODE_UNDEF) {
         int subset_spill_offset = spill_offset +
            ROUND_DOWN_TO(inst->dst.offset, REG_SIZE);
         fs_reg spill_src = alloc_spill_reg(regs_written(inst), ip);

         inst->dst.nr = spill_src.nr;
         inst->dst.offset %= REG_SIZE;

         /* If we're immediately spilling the register, we should not use
          * destination dependency hints.  Doing so will cause the GPU to
          * try to read and write the register at the same time and may
          * hang the GPU.
          */
         inst->no_dd_clear = false;
         inst->no_dd_check = false;

         /* Calculate the execution width of the scratch messages (which work
          * in terms of 32 bit components so we have a fixed number of eight
          * channels per spilled register).  We attempt to write one
          * exec_size-wide component of the variable at a time without
          * exceeding the maximum number of (fake) MRF registers reserved for
          * spills.
          */
         const unsigned width = 8 * reg_unit(devinfo) *
            DIV_ROUND_UP(MIN2(inst->dst.component_size(inst->exec_size),
                              spill_max_size(fs) * REG_SIZE),
                         reg_unit(devinfo) * REG_SIZE);

         /* Spills should only write data initialized by the instruction for
          * whichever channels are enabled in the execution mask.  If that's
          * not possible we'll have to emit a matching unspill before the
          * instruction and set force_writemask_all on the spill.
          */
         const bool per_channel =
            inst->dst.is_contiguous() && type_sz(inst->dst.type) == 4 &&
            inst->exec_size == width;

         /* Builder used to emit the scratch messages. */
         const fs_builder ubld = ibld.exec_all(!per_channel).group(width, 0);

         /* If our write is going to affect just part of the
          * regs_written(inst), then we need to unspill the destination since
          * we write back out all of the regs_written().  If the original
          * instruction had force_writemask_all set and is not a partial
          * write, there should be no need for the unspill since the
          * instruction will be overwriting the whole destination in any case.
          */
         if (inst->is_partial_write() ||
             (!inst->force_writemask_all && !per_channel))
            emit_unspill(ubld, &fs->shader_stats, spill_src,
                         subset_spill_offset, regs_written(inst), ip);

         emit_spill(ubld.at(block, inst->next), &fs->shader_stats, spill_src,
                    subset_spill_offset, regs_written(inst), ip);
      }

      /* Register the freshly inserted fill/spill instructions with the
       * interference graph.
       */
      for (fs_inst *inst = (fs_inst *)before->next;
           inst != after; inst = (fs_inst *)inst->next)
         setup_inst_interference(inst);

      /* We don't advance the ip for scratch read/write instructions
       * because we consider them to have the same ip as the instruction
       * we're spilling around for the purposes of interference.  Also, we're
       * inserting spill instructions without re-running liveness analysis
       * and we don't want to mess up our IPs.
       */
      if (!_mesa_set_search(spill_insts, inst))
         ip++;
   }

   assert(ip == live_instr_count);
}

/**
 * Run graph-coloring allocation, spilling registers until it succeeds (or
 * until nothing is left to spill).  Returns false on failure.
 */
bool
fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all)
{
   build_interference_graph(fs->spilled_any_registers || spill_all);

   unsigned spilled = 0;
   while (1) {
      /* Debug of register spilling: Go spill everything. */
      if (unlikely(spill_all)) {
         int reg = choose_spill_reg();
         if (reg != -1) {
            spill_reg(reg);
            continue;
         }
      }

      if (ra_allocate(g))
         break;

      if (!allow_spilling)
         return false;

      /* Failed to allocate registers.  Spill some regs, and the caller will
       * loop back into here to try again.
       */
      unsigned nr_spills = 1;
      if (compiler->spilling_rate)
         nr_spills = MAX2(1, spilled / compiler->spilling_rate);

      for (unsigned j = 0; j < nr_spills; j++) {
         int reg = choose_spill_reg();
         if (reg == -1) {
            if (j == 0)
               return false; /* Nothing to spill */
            break;
         }

         /* If we're going to spill but we've never spilled before, we need
          * to re-build the interference graph with MRFs enabled to allow
          * spilling.
          */
         if (!fs->spilled_any_registers) {
            discard_interference_graph();
            build_interference_graph(true);
         }

         spill_reg(reg);
         spilled++;
      }
   }

   if (spilled)
      fs->invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   /* Get the chosen virtual registers for each node, and map virtual
    * regs in the register classes back down to real hardware reg
    * numbers.
    */
   unsigned hw_reg_mapping[fs->alloc.count];
   fs->grf_used = fs->first_non_payload_grf;
   for (unsigned i = 0; i < fs->alloc.count; i++) {
      int reg = ra_get_node_reg(g, first_vgrf_node + i);

      hw_reg_mapping[i] = reg;
      fs->grf_used = MAX2(fs->grf_used,
                          hw_reg_mapping[i] + DIV_ROUND_UP(fs->alloc.sizes[i],
                                                           reg_unit(devinfo)));
   }

   foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
      assign_reg(devinfo, hw_reg_mapping, &inst->dst);
      for (int i = 0; i < inst->sources; i++) {
         assign_reg(devinfo, hw_reg_mapping, &inst->src[i]);
      }
   }

   fs->alloc.count = fs->grf_used;

   return true;
}

/**
 * Public entry point: allocate registers for this shader, dumping the IR on
 * an unrecoverable spill failure.
 */
bool
fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
{
   fs_reg_alloc alloc(this);
   bool success = alloc.assign_regs(allow_spilling, spill_all);
   if (!success && allow_spilling) {
      fail("no register to spill:\n");
      dump_instructions(NULL);
   }
   return success;
}
diff --git a/src/intel/compiler/elk/brw_fs_register_coalesce.cpp b/src/intel/compiler/elk/brw_fs_register_coalesce.cpp
new file mode 100644
index 00000000000..4c9bb3edba8
--- /dev/null
+++ b/src/intel/compiler/elk/brw_fs_register_coalesce.cpp
@@ -0,0 +1,349 @@
/*
 *
Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_register_coalesce.cpp + * + * Implements register coalescing: Checks if the two registers involved in a + * raw move don't interfere, in which case they can both be stored in the same + * place and the MOV removed. + * + * To do this, all uses of the source of the MOV in the shader are replaced + * with the destination of the MOV. 
For example: + * + * add vgrf3:F, vgrf1:F, vgrf2:F + * mov vgrf4:F, vgrf3:F + * mul vgrf5:F, vgrf5:F, vgrf4:F + * + * becomes + * + * add vgrf4:F, vgrf1:F, vgrf2:F + * mul vgrf5:F, vgrf5:F, vgrf4:F + */ + +#include "brw_fs.h" +#include "brw_cfg.h" +#include "brw_fs_live_variables.h" + +using namespace brw; + +static bool +is_nop_mov(const fs_inst *inst) +{ + if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { + fs_reg dst = inst->dst; + for (int i = 0; i < inst->sources; i++) { + if (!dst.equals(inst->src[i])) { + return false; + } + dst.offset += (i < inst->header_size ? REG_SIZE : + inst->exec_size * dst.stride * + type_sz(inst->src[i].type)); + } + return true; + } else if (inst->opcode == BRW_OPCODE_MOV) { + return inst->dst.equals(inst->src[0]); + } + + return false; +} + +static bool +is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst) +{ + if ((inst->opcode != BRW_OPCODE_MOV && + inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) || + inst->is_partial_write() || + inst->saturate || + inst->src[0].file != VGRF || + inst->src[0].negate || + inst->src[0].abs || + !inst->src[0].is_contiguous() || + inst->dst.file != VGRF || + inst->dst.type != inst->src[0].type) { + return false; + } + + if (v->alloc.sizes[inst->src[0].nr] > + v->alloc.sizes[inst->dst.nr]) + return false; + + if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { + if (!is_coalescing_payload(v->alloc, inst)) { + return false; + } + } + + return true; +} + +static bool +can_coalesce_vars(const fs_live_variables &live, const cfg_t *cfg, + const bblock_t *block, const fs_inst *inst, + int dst_var, int src_var) +{ + if (!live.vars_interfere(src_var, dst_var)) + return true; + + int dst_start = live.start[dst_var]; + int dst_end = live.end[dst_var]; + int src_start = live.start[src_var]; + int src_end = live.end[src_var]; + + /* Variables interfere and one live range isn't a subset of the other. 
*/ + if ((dst_end > src_end && src_start < dst_start) || + (src_end > dst_end && dst_start < src_start)) + return false; + + /* Check for a write to either register in the intersection of their live + * ranges. + */ + int start_ip = MAX2(dst_start, src_start); + int end_ip = MIN2(dst_end, src_end); + + foreach_block(scan_block, cfg) { + if (scan_block->end_ip < start_ip) + continue; + + int scan_ip = scan_block->start_ip - 1; + + bool seen_src_write = false; + bool seen_copy = false; + foreach_inst_in_block(fs_inst, scan_inst, scan_block) { + scan_ip++; + + /* Ignore anything before the intersection of the live ranges */ + if (scan_ip < start_ip) + continue; + + /* Ignore the copying instruction itself */ + if (scan_inst == inst) { + seen_copy = true; + continue; + } + + if (scan_ip > end_ip) + return true; /* registers do not interfere */ + + if (seen_src_write && !seen_copy) { + /* In order to satisfy the guarantee of register coalescing, we + * must ensure that the two registers always have the same value + * during the intersection of their live ranges. One way to do + * this is to simply ensure that neither is ever written apart + * from the one copy which syncs up the two registers. However, + * this can be overly conservative and only works in the case + * where the destination live range is entirely contained in the + * source live range. + * + * To handle the other case where the source is contained in the + * destination, we allow writes to the source register as long as + * they happen before the copy, in the same block as the copy, and + * the destination is never read between first such write and the + * copy. This effectively moves the write from the copy up. 
+ */ + for (int j = 0; j < scan_inst->sources; j++) { + if (regions_overlap(scan_inst->src[j], scan_inst->size_read(j), + inst->dst, inst->size_written)) + return false; /* registers interfere */ + } + } + + /* The MOV being coalesced had better be the only instruction which + * writes to the coalesce destination in the intersection. + */ + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->dst, inst->size_written)) + return false; /* registers interfere */ + + /* See the big comment above */ + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) { + if (seen_copy || scan_block != block || + (scan_inst->force_writemask_all && !inst->force_writemask_all)) + return false; + seen_src_write = true; + } + } + } + + return true; +} + +bool +fs_visitor::register_coalesce() +{ + bool progress = false; + fs_live_variables &live = live_analysis.require(); + int src_size = 0; + int channels_remaining = 0; + unsigned src_reg = ~0u, dst_reg = ~0u; + int *dst_reg_offset = new int[MAX_VGRF_SIZE(devinfo)]; + fs_inst **mov = new fs_inst *[MAX_VGRF_SIZE(devinfo)]; + int *dst_var = new int[MAX_VGRF_SIZE(devinfo)]; + int *src_var = new int[MAX_VGRF_SIZE(devinfo)]; + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (!is_coalesce_candidate(this, inst)) + continue; + + if (is_nop_mov(inst)) { + inst->opcode = BRW_OPCODE_NOP; + progress = true; + continue; + } + + if (src_reg != inst->src[0].nr) { + src_reg = inst->src[0].nr; + + src_size = alloc.sizes[inst->src[0].nr]; + assert(src_size <= MAX_VGRF_SIZE(devinfo)); + + channels_remaining = src_size; + memset(mov, 0, sizeof(*mov) * MAX_VGRF_SIZE(devinfo)); + + dst_reg = inst->dst.nr; + } + + if (dst_reg != inst->dst.nr) + continue; + + if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { + for (int i = 0; i < src_size; i++) { + dst_reg_offset[i] = i; + } + mov[0] = inst; + channels_remaining -= regs_written(inst); + } else { + const int offset = inst->src[0].offset / 
REG_SIZE; + if (mov[offset]) { + /* This is the second time that this offset in the register has + * been set. This means, in particular, that inst->dst was + * live before this instruction and that the live ranges of + * inst->dst and inst->src[0] overlap and we can't coalesce the + * two variables. Let's ensure that doesn't happen. + */ + channels_remaining = -1; + continue; + } + for (unsigned i = 0; i < MAX2(inst->size_written / REG_SIZE, 1); i++) + dst_reg_offset[offset + i] = inst->dst.offset / REG_SIZE + i; + mov[offset] = inst; + channels_remaining -= regs_written(inst); + } + + if (channels_remaining) + continue; + + bool can_coalesce = true; + for (int i = 0; i < src_size; i++) { + if (dst_reg_offset[i] != dst_reg_offset[0] + i) { + /* Registers are out-of-order. */ + can_coalesce = false; + src_reg = ~0u; + break; + } + + dst_var[i] = live.var_from_vgrf[dst_reg] + dst_reg_offset[i]; + src_var[i] = live.var_from_vgrf[src_reg] + i; + + if (!can_coalesce_vars(live, cfg, block, inst, dst_var[i], src_var[i])) { + can_coalesce = false; + src_reg = ~0u; + break; + } + } + + if (!can_coalesce) + continue; + + progress = true; + + for (int i = 0; i < src_size; i++) { + if (!mov[i]) + continue; + + if (mov[i]->conditional_mod == BRW_CONDITIONAL_NONE) { + mov[i]->opcode = BRW_OPCODE_NOP; + mov[i]->dst = reg_undef; + for (int j = 0; j < mov[i]->sources; j++) { + mov[i]->src[j] = reg_undef; + } + } else { + /* If we have a conditional modifier, rewrite the MOV to be a + * MOV.cmod from the coalesced register. Hopefully, cmod + * propagation will clean this up and move it to the instruction + * that writes the register. If not, this keeps things correct + * while still letting us coalesce. 
+ */ + assert(mov[i]->opcode == BRW_OPCODE_MOV); + assert(mov[i]->sources == 1); + mov[i]->src[0] = mov[i]->dst; + mov[i]->dst = retype(brw_null_reg(), mov[i]->dst.type); + } + } + + foreach_block_and_inst(block, fs_inst, scan_inst, cfg) { + if (scan_inst->dst.file == VGRF && + scan_inst->dst.nr == src_reg) { + scan_inst->dst.nr = dst_reg; + scan_inst->dst.offset = scan_inst->dst.offset % REG_SIZE + + dst_reg_offset[scan_inst->dst.offset / REG_SIZE] * REG_SIZE; + } + + for (int j = 0; j < scan_inst->sources; j++) { + if (scan_inst->src[j].file == VGRF && + scan_inst->src[j].nr == src_reg) { + scan_inst->src[j].nr = dst_reg; + scan_inst->src[j].offset = scan_inst->src[j].offset % REG_SIZE + + dst_reg_offset[scan_inst->src[j].offset / REG_SIZE] * REG_SIZE; + } + } + } + + for (int i = 0; i < src_size; i++) { + live.start[dst_var[i]] = MIN2(live.start[dst_var[i]], + live.start[src_var[i]]); + live.end[dst_var[i]] = MAX2(live.end[dst_var[i]], + live.end[src_var[i]]); + } + src_reg = ~0u; + } + + if (progress) { + foreach_block_and_inst_safe (block, backend_instruction, inst, cfg) { + if (inst->opcode == BRW_OPCODE_NOP) { + inst->remove(block, true); + } + } + + cfg->adjust_block_ips(); + + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + } + + delete[] src_var; + delete[] dst_var; + delete[] mov; + delete[] dst_reg_offset; + + return progress; +} diff --git a/src/intel/compiler/elk/brw_fs_saturate_propagation.cpp b/src/intel/compiler/elk/brw_fs_saturate_propagation.cpp new file mode 100644 index 00000000000..50b05dd92b8 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_saturate_propagation.cpp @@ -0,0 +1,165 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * 
and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_fs.h" +#include "brw_fs_live_variables.h" +#include "brw_cfg.h" + +using namespace brw; + +/** @file brw_fs_saturate_propagation.cpp + * + * Implements a pass that propagates the SAT modifier from a MOV.SAT into the + * instruction that produced the source of the MOV.SAT, thereby allowing the + * MOV's src and dst to be coalesced and the MOV removed. 
+ * + * For instance, + * + * ADD tmp, src0, src1 + * MOV.SAT dst, tmp + * + * would be transformed into + * + * ADD.SAT tmp, src0, src1 + * MOV dst, tmp + */ + +static bool +opt_saturate_propagation_local(const fs_live_variables &live, bblock_t *block) +{ + bool progress = false; + int ip = block->end_ip + 1; + + foreach_inst_in_block_reverse(fs_inst, inst, block) { + ip--; + + if (inst->opcode != BRW_OPCODE_MOV || + !inst->saturate || + inst->dst.file != VGRF || + inst->dst.type != inst->src[0].type || + inst->src[0].file != VGRF || + inst->src[0].abs) + continue; + + int src_var = live.var_from_reg(inst->src[0]); + int src_end_ip = live.end[src_var]; + + bool interfered = false; + foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { + if (scan_inst->exec_size == inst->exec_size && + regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) { + if (scan_inst->is_partial_write() || + (scan_inst->dst.type != inst->dst.type && + !scan_inst->can_change_types())) + break; + + if (scan_inst->saturate) { + inst->saturate = false; + progress = true; + } else if (src_end_ip == ip || inst->dst.equals(inst->src[0])) { + if (scan_inst->can_do_saturate()) { + if (scan_inst->dst.type != inst->dst.type) { + scan_inst->dst.type = inst->dst.type; + for (int i = 0; i < scan_inst->sources; i++) { + scan_inst->src[i].type = inst->dst.type; + } + } + + if (inst->src[0].negate) { + if (scan_inst->opcode == BRW_OPCODE_MUL) { + scan_inst->src[0].negate = !scan_inst->src[0].negate; + inst->src[0].negate = false; + } else if (scan_inst->opcode == BRW_OPCODE_MAD) { + for (int i = 0; i < 2; i++) { + if (scan_inst->src[i].file == IMM) { + brw_negate_immediate(scan_inst->src[i].type, + &scan_inst->src[i].as_brw_reg()); + } else { + scan_inst->src[i].negate = !scan_inst->src[i].negate; + } + } + inst->src[0].negate = false; + } else if (scan_inst->opcode == BRW_OPCODE_ADD) { + if (scan_inst->src[1].file == IMM) { + if 
(!brw_negate_immediate(scan_inst->src[1].type, + &scan_inst->src[1].as_brw_reg())) { + break; + } + } else { + scan_inst->src[1].negate = !scan_inst->src[1].negate; + } + scan_inst->src[0].negate = !scan_inst->src[0].negate; + inst->src[0].negate = false; + } else { + break; + } + } + + scan_inst->saturate = true; + inst->saturate = false; + progress = true; + } + } + break; + } + for (int i = 0; i < scan_inst->sources; i++) { + if (scan_inst->src[i].file == VGRF && + scan_inst->src[i].nr == inst->src[0].nr && + regions_overlap( + scan_inst->src[i], scan_inst->size_read(i), + inst->src[0], inst->size_read(0))) { + if (scan_inst->opcode != BRW_OPCODE_MOV || + !scan_inst->saturate || + scan_inst->src[0].abs || + scan_inst->src[0].negate || + scan_inst->src[0].abs != inst->src[0].abs || + scan_inst->src[0].negate != inst->src[0].negate) { + interfered = true; + break; + } + } + } + + if (interfered) + break; + } + } + + return progress; +} + +bool +fs_visitor::opt_saturate_propagation() +{ + const fs_live_variables &live = live_analysis.require(); + bool progress = false; + + foreach_block (block, cfg) { + progress = opt_saturate_propagation_local(live, block) || progress; + } + + /* Live intervals are still valid. 
*/ + + return progress; +} diff --git a/src/intel/compiler/elk/brw_fs_scoreboard.cpp b/src/intel/compiler/elk/brw_fs_scoreboard.cpp new file mode 100644 index 00000000000..144179941c2 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_scoreboard.cpp @@ -0,0 +1,1365 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_scoreboard.cpp + * + * Gfx12+ hardware lacks the register scoreboard logic that used to guarantee + * data coherency between register reads and writes in previous generations. + * This lowering pass runs after register allocation in order to make up for + * it. 
+ * + * It works by performing global dataflow analysis in order to determine the + * set of potential dependencies of every instruction in the shader, and then + * inserts any required SWSB annotations and additional SYNC instructions in + * order to guarantee data coherency. + * + * WARNING - Access of the following (rarely used) ARF registers is not + * tracked here, and require the RegDist SWSB annotation to be set + * to 1 by the generator in order to avoid data races: + * + * - sp stack pointer + * - sr0 state register + * - cr0 control register + * - ip instruction pointer + * - tm0 timestamp register + * - dbg0 debug register + * - acc2-9 special accumulator registers on TGL + * - mme0-7 math macro extended accumulator registers + * + * The following ARF registers don't need to be tracked here because data + * coherency is still provided transparently by the hardware: + * + * - f0-1 flag registers + * - n0 notification register + * - tdr0 thread dependency register + */ + +#include "brw_fs.h" +#include "brw_fs_builder.h" +#include "brw_cfg.h" + +using namespace brw; + +namespace { + /** + * In-order instruction accounting. + * @{ + */ + + /** + * Return the RegDist pipeline the hardware will synchronize with if no + * pipeline information is provided in the SWSB annotation of an + * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb). 
+ */ + tgl_pipe + inferred_sync_pipe(const struct intel_device_info *devinfo, const fs_inst *inst) + { + if (devinfo->verx10 >= 125) { + bool has_int_src = false, has_long_src = false; + const bool has_long_pipe = !devinfo->has_64bit_float_via_math_pipe; + + if (is_send(inst)) + return TGL_PIPE_NONE; + + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file != BAD_FILE && + !inst->is_control_source(i)) { + const brw_reg_type t = inst->src[i].type; + has_int_src |= !brw_reg_type_is_floating_point(t); + has_long_src |= type_sz(t) >= 8; + } + } + + /* Avoid the emitting (RegDist, SWSB) annotations for long + * instructions on platforms where they are unordered. It's not clear + * what the inferred sync pipe is for them or if we are even allowed + * to use these annotations in this case. Return NONE, which should + * prevent baked_{un,}ordered_dependency_mode functions from even + * trying to emit these annotations. + */ + if (!has_long_pipe && has_long_src) + return TGL_PIPE_NONE; + + return has_long_src ? TGL_PIPE_LONG : + has_int_src ? TGL_PIPE_INT : + TGL_PIPE_FLOAT; + + } else { + return TGL_PIPE_FLOAT; + } + } + + /** + * Return the RegDist pipeline that will execute an instruction, or + * TGL_PIPE_NONE if the instruction is out-of-order and doesn't use the + * RegDist synchronization mechanism. 
+ */ + tgl_pipe + inferred_exec_pipe(const struct intel_device_info *devinfo, const fs_inst *inst) + { + const brw_reg_type t = get_exec_type(inst); + const bool is_dword_multiply = !brw_reg_type_is_floating_point(t) && + ((inst->opcode == BRW_OPCODE_MUL && + MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) || + (inst->opcode == BRW_OPCODE_MAD && + MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4)); + + if (is_unordered(devinfo, inst)) + return TGL_PIPE_NONE; + else if (devinfo->verx10 < 125) + return TGL_PIPE_FLOAT; + else if (inst->is_math() && devinfo->ver >= 20) + return TGL_PIPE_MATH; + else if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT || + inst->opcode == SHADER_OPCODE_BROADCAST || + inst->opcode == SHADER_OPCODE_SHUFFLE) + return TGL_PIPE_INT; + else if (inst->opcode == FS_OPCODE_PACK_HALF_2x16_SPLIT) + return TGL_PIPE_FLOAT; + else if (devinfo->ver >= 20 && type_sz(inst->dst.type) >= 8 && + brw_reg_type_is_floating_point(inst->dst.type)) { + assert(devinfo->has_64bit_float); + return TGL_PIPE_LONG; + } else if (devinfo->ver < 20 && + (type_sz(inst->dst.type) >= 8 || type_sz(t) >= 8 || + is_dword_multiply)) { + assert(devinfo->has_64bit_float || devinfo->has_64bit_int || + devinfo->has_integer_dword_mul); + return TGL_PIPE_LONG; + } else if (brw_reg_type_is_floating_point(inst->dst.type)) + return TGL_PIPE_FLOAT; + else + return TGL_PIPE_INT; + } + + /** + * Index of the \p p pipeline counter in the ordered_address vector defined + * below. + */ +#define IDX(p) (p >= TGL_PIPE_FLOAT ? unsigned(p - TGL_PIPE_FLOAT) : \ + (abort(), ~0u)) + + /** + * Number of in-order hardware instructions for pipeline index \p contained + * in this IR instruction. This determines the increment applied to the + * RegDist counter calculated for any ordered dependency that crosses this + * instruction. 
+ */ + unsigned + ordered_unit(const struct intel_device_info *devinfo, const fs_inst *inst, + unsigned p) + { + switch (inst->opcode) { + case BRW_OPCODE_SYNC: + case BRW_OPCODE_DO: + case SHADER_OPCODE_UNDEF: + case SHADER_OPCODE_HALT_TARGET: + case FS_OPCODE_SCHEDULING_FENCE: + return 0; + default: + /* Note that the following is inaccurate for virtual instructions + * that expand to more in-order instructions than assumed here, but + * that can only lead to suboptimal execution ordering, data + * coherency won't be impacted. Providing exact RegDist counts for + * each virtual instruction would allow better ALU performance, but + * it would require keeping this switch statement in perfect sync + * with the generator in order to avoid data corruption. Lesson is + * (again) don't use virtual instructions if you want optimal + * scheduling. + */ + if (!is_unordered(devinfo, inst) && + (p == IDX(inferred_exec_pipe(devinfo, inst)) || + p == IDX(TGL_PIPE_ALL))) + return 1; + else + return 0; + } + } + + /** + * Type for an instruction counter that increments for in-order + * instructions only, arbitrarily denoted 'jp' throughout this lowering + * pass in order to distinguish it from the regular instruction counter. + * This is represented as a vector with an independent counter for each + * asynchronous ALU pipeline in the EU. + */ + struct ordered_address { + /** + * Construct the ordered address of a dependency known to execute on a + * single specified pipeline \p p (unless TGL_PIPE_NONE or TGL_PIPE_ALL + * is provided), in which case the vector counter will be initialized + * with all components equal to INT_MIN (always satisfied) except for + * component IDX(p). + */ + ordered_address(tgl_pipe p = TGL_PIPE_NONE, int jp0 = INT_MIN) { + for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) + jp[q] = (p == TGL_PIPE_NONE || (IDX(p) != q && p != TGL_PIPE_ALL) ? 
+ INT_MIN : jp0); + } + + int jp[IDX(TGL_PIPE_ALL)]; + + friend bool + operator==(const ordered_address &jp0, const ordered_address &jp1) + { + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) { + if (jp0.jp[p] != jp1.jp[p]) + return false; + } + + return true; + } + }; + + /** + * Return true if the specified ordered address is trivially satisfied for + * all pipelines except potentially for the specified pipeline \p p. + */ + bool + is_single_pipe(const ordered_address &jp, tgl_pipe p) + { + for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) { + if ((p == TGL_PIPE_NONE || IDX(p) != q) && jp.jp[q] > INT_MIN) + return false; + } + + return true; + } + + /** + * Return the number of instructions in the program. + */ + unsigned + num_instructions(const backend_shader *shader) + { + return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1; + } + + /** + * Calculate the local ordered_address instruction counter at every + * instruction of the shader for subsequent constant-time look-up. + */ + ordered_address * + ordered_inst_addresses(const fs_visitor *shader) + { + ordered_address *jps = new ordered_address[num_instructions(shader)]; + ordered_address jp(TGL_PIPE_ALL, 0); + unsigned ip = 0; + + foreach_block_and_inst(block, fs_inst, inst, shader->cfg) { + jps[ip] = jp; + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) + jp.jp[p] += ordered_unit(shader->devinfo, inst, p); + ip++; + } + + return jps; + } + + /** + * Synchronization mode required for data manipulated by in-order + * instructions. + * + * Similar to tgl_sbid_mode, but without SET mode. Defined as a separate + * enum for additional type safety. The hardware doesn't provide control + * over the synchronization mode for RegDist annotations, this is only used + * internally in this pass in order to optimize out redundant read + * dependencies where possible. 
+ */ + enum tgl_regdist_mode { + TGL_REGDIST_NULL = 0, + TGL_REGDIST_SRC = 1, + TGL_REGDIST_DST = 2 + }; + + /** + * Allow bitwise arithmetic of tgl_regdist_mode enums. + */ + tgl_regdist_mode + operator|(tgl_regdist_mode x, tgl_regdist_mode y) + { + return tgl_regdist_mode(unsigned(x) | unsigned(y)); + } + + tgl_regdist_mode + operator&(tgl_regdist_mode x, tgl_regdist_mode y) + { + return tgl_regdist_mode(unsigned(x) & unsigned(y)); + } + + tgl_regdist_mode & + operator|=(tgl_regdist_mode &x, tgl_regdist_mode y) + { + return x = x | y; + } + + tgl_regdist_mode & + operator&=(tgl_regdist_mode &x, tgl_regdist_mode y) + { + return x = x & y; + } + + /** @} */ + + /** + * Representation of an equivalence relation among the set of unsigned + * integers. + * + * Its initial state is the identity relation '~' such that i ~ j if and + * only if i == j for every pair of unsigned integers i and j. + */ + struct equivalence_relation { + equivalence_relation(unsigned n) : is(new unsigned[n]), n(n) + { + for (unsigned i = 0; i < n; i++) + is[i] = i; + } + + ~equivalence_relation() + { + delete[] is; + } + + /** + * Return equivalence class index of the specified element. Effectively + * this is the numeric value of an arbitrary representative from the + * equivalence class. + * + * Allows the evaluation of the equivalence relation according to the + * rule that i ~ j if and only if lookup(i) == lookup(j). + */ + unsigned + lookup(unsigned i) const + { + if (i < n && is[i] != i) + return lookup(is[i]); + else + return i; + } + + /** + * Create an array with the results of the lookup() method for + * constant-time evaluation. + */ + unsigned * + flatten() const + { + unsigned *ids = new unsigned[n]; + + for (unsigned i = 0; i < n; i++) + ids[i] = lookup(i); + + return ids; + } + + /** + * Mutate the existing equivalence relation minimally by imposing the + * additional requirement that i ~ j. 
+ * + * The algorithm updates the internal representation recursively in + * order to guarantee transitivity while preserving the previously + * specified equivalence requirements. + */ + unsigned + link(unsigned i, unsigned j) + { + const unsigned k = lookup(i); + assign(i, k); + assign(j, k); + return k; + } + + private: + equivalence_relation(const equivalence_relation &); + + equivalence_relation & + operator=(const equivalence_relation &); + + /** + * Assign the representative of \p from to be equivalent to \p to. + * + * At the same time the data structure is partially flattened as much as + * it's possible without increasing the number of recursive calls. + */ + void + assign(unsigned from, unsigned to) + { + if (from != to) { + assert(from < n); + + if (is[from] != from) + assign(is[from], to); + + is[from] = to; + } + } + + unsigned *is; + unsigned n; + }; + + /** + * Representation of a data dependency between two instructions in the + * program. + * @{ + */ + struct dependency { + /** + * No dependency information. + */ + dependency() : ordered(TGL_REGDIST_NULL), jp(), + unordered(TGL_SBID_NULL), id(0), + exec_all(false) {} + + /** + * Construct a dependency on the in-order instruction with the provided + * ordered_address instruction counter. + */ + dependency(tgl_regdist_mode mode, const ordered_address &jp, + bool exec_all) : + ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0), + exec_all(exec_all) {} + + /** + * Construct a dependency on the out-of-order instruction with the + * specified synchronization token. + */ + dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) : + ordered(TGL_REGDIST_NULL), jp(), unordered(mode), id(id), + exec_all(exec_all) {} + + /** + * Synchronization mode of in-order dependency, or zero if no in-order + * dependency is present. + */ + tgl_regdist_mode ordered; + + /** + * Instruction counter of in-order dependency. 
+ * + * For a dependency part of a different block in the program, this is + * relative to the specific control flow path taken between the + * dependency and the current block: It is the ordered_address such that + * the difference between it and the ordered_address of the first + * instruction of the current block is exactly the number of in-order + * instructions across that control flow path. It is not guaranteed to + * be equal to the local ordered_address of the generating instruction + * [as returned by ordered_inst_addresses()], except for block-local + * dependencies. + */ + ordered_address jp; + + /** + * Synchronization mode of unordered dependency, or zero if no unordered + * dependency is present. + */ + tgl_sbid_mode unordered; + + /** Synchronization token of out-of-order dependency. */ + unsigned id; + + /** + * Whether the dependency could be run with execution masking disabled, + * which might lead to the unwanted execution of the generating + * instruction in cases where a BB is executed with all channels + * disabled due to hardware bug Wa_1407528679. + */ + bool exec_all; + + /** + * Trivial in-order dependency that's always satisfied. + * + * Note that unlike a default-constructed dependency() which is also + * trivially satisfied, this is considered to provide dependency + * information and can be used to clear a previously pending dependency + * via shadow(). + */ + static const dependency done; + + friend bool + operator==(const dependency &dep0, const dependency &dep1) + { + return dep0.ordered == dep1.ordered && + dep0.jp == dep1.jp && + dep0.unordered == dep1.unordered && + dep0.id == dep1.id && + dep0.exec_all == dep1.exec_all; + } + + friend bool + operator!=(const dependency &dep0, const dependency &dep1) + { + return !(dep0 == dep1); + } + }; + + const dependency dependency::done = + dependency(TGL_REGDIST_DST, ordered_address(), false); + + /** + * Return whether \p dep contains any dependency information. 
+ */ + bool + is_valid(const dependency &dep) + { + return dep.ordered || dep.unordered; + } + + /** + * Combine \p dep0 and \p dep1 into a single dependency object that is only + * satisfied when both original dependencies are satisfied. This might + * involve updating the equivalence relation \p eq in order to make sure + * that both out-of-order dependencies are assigned the same hardware SBID + * as synchronization token. + */ + dependency + merge(equivalence_relation &eq, + const dependency &dep0, const dependency &dep1) + { + dependency dep; + + if (dep0.ordered || dep1.ordered) { + dep.ordered = dep0.ordered | dep1.ordered; + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) + dep.jp.jp[p] = MAX2(dep0.jp.jp[p], dep1.jp.jp[p]); + } + + if (dep0.unordered || dep1.unordered) { + dep.unordered = dep0.unordered | dep1.unordered; + dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id, + dep1.unordered ? dep1.id : dep0.id); + } + + dep.exec_all = dep0.exec_all || dep1.exec_all; + + return dep; + } + + /** + * Override dependency information of \p dep0 with that of \p dep1. + */ + dependency + shadow(const dependency &dep0, const dependency &dep1) + { + if (dep0.ordered == TGL_REGDIST_SRC && + is_valid(dep1) && !(dep1.unordered & TGL_SBID_DST) && + !(dep1.ordered & TGL_REGDIST_DST)) { + /* As an optimization (see dependency_for_read()), + * instructions with a RaR dependency don't synchronize + * against a previous in-order read, so we need to pass + * through both ordered dependencies instead of simply + * dropping the first one. Otherwise we could encounter a + * WaR data hazard between OP0 and OP2 in cases like: + * + * OP0 r1:f r0:d + * OP1 r2:d r0:d + * OP2 r0:d r3:d + * + * since only the integer-pipeline r0 dependency from OP1 + * would be visible to OP2, even though OP0 could technically + * execute after OP1 due to the floating-point and integer + * pipelines being asynchronous on Gfx12.5+ platforms, so + * synchronizing OP2 against OP1 would be insufficient. 
+ */ + dependency dep = dep1; + + dep.ordered |= dep0.ordered; + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) + dep.jp.jp[p] = MAX2(dep.jp.jp[p], dep0.jp.jp[p]); + + return dep; + } else { + return is_valid(dep1) ? dep1 : dep0; + } + } + + /** + * Translate dependency information across the program. + * + * This returns a dependency on the same instruction translated to the + * ordered_address space of a different block. The correct shift for + * transporting a dependency across an edge of the CFG is the difference + * between the local ordered_address of the first instruction of the target + * block and the local ordered_address of the instruction immediately after + * the end of the origin block. + */ + dependency + transport(dependency dep, int delta[IDX(TGL_PIPE_ALL)]) + { + if (dep.ordered) { + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) { + if (dep.jp.jp[p] > INT_MIN) + dep.jp.jp[p] += delta[p]; + } + } + + return dep; + } + + /** + * Return simplified dependency removing any synchronization modes not + * applicable to an instruction reading the same register location. + */ + dependency + dependency_for_read(dependency dep) + { + dep.ordered &= TGL_REGDIST_DST; + return dep; + } + + /** + * Return simplified dependency removing any synchronization modes not + * applicable to an instruction \p inst writing the same register location. + * + * This clears any WaR dependency for writes performed from the same + * pipeline as the read, since there is no possibility for a data hazard. + */ + dependency + dependency_for_write(const struct intel_device_info *devinfo, + const fs_inst *inst, dependency dep) + { + if (!is_unordered(devinfo, inst) && + is_single_pipe(dep.jp, inferred_exec_pipe(devinfo, inst))) + dep.ordered &= TGL_REGDIST_DST; + return dep; + } + + /** @} */ + + /** + * Scoreboard representation. This keeps track of the data dependencies of + * registers with GRF granularity. 
+ */ + class scoreboard { + public: + /** + * Look up the most current data dependency for register \p r. + */ + dependency + get(const fs_reg &r) const + { + if (const dependency *p = const_cast(this)->dep(r)) + return *p; + else + return dependency(); + } + + /** + * Specify the most current data dependency for register \p r. + */ + void + set(const fs_reg &r, const dependency &d) + { + if (dependency *p = dep(r)) + *p = d; + } + + /** + * Component-wise merge() of corresponding dependencies from two + * scoreboard objects. \sa merge(). + */ + friend scoreboard + merge(equivalence_relation &eq, + const scoreboard &sb0, const scoreboard &sb1) + { + scoreboard sb; + + for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++) + sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]); + + sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep); + sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep); + + return sb; + } + + /** + * Component-wise shadow() of corresponding dependencies from two + * scoreboard objects. \sa shadow(). + */ + friend scoreboard + shadow(const scoreboard &sb0, const scoreboard &sb1) + { + scoreboard sb; + + for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++) + sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]); + + sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep); + sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep); + + return sb; + } + + /** + * Component-wise transport() of dependencies from a scoreboard + * object. \sa transport(). 
+ */ + friend scoreboard + transport(const scoreboard &sb0, int delta[IDX(TGL_PIPE_ALL)]) + { + scoreboard sb; + + for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++) + sb.grf_deps[i] = transport(sb0.grf_deps[i], delta); + + sb.addr_dep = transport(sb0.addr_dep, delta); + sb.accum_dep = transport(sb0.accum_dep, delta); + + return sb; + } + + friend bool + operator==(const scoreboard &sb0, const scoreboard &sb1) + { + for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) { + if (sb0.grf_deps[i] != sb1.grf_deps[i]) + return false; + } + + if (sb0.addr_dep != sb1.addr_dep) + return false; + + if (sb0.accum_dep != sb1.accum_dep) + return false; + + return true; + } + + friend bool + operator!=(const scoreboard &sb0, const scoreboard &sb1) + { + return !(sb0 == sb1); + } + + private: + dependency grf_deps[XE2_MAX_GRF]; + dependency addr_dep; + dependency accum_dep; + + dependency * + dep(const fs_reg &r) + { + const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE : + reg_offset(r) / REG_SIZE); + + return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] : + r.file == MRF ? &grf_deps[GFX7_MRF_HACK_START + reg] : + r.file == ARF && reg >= BRW_ARF_ADDRESS && + reg < BRW_ARF_ACCUMULATOR ? &addr_dep : + r.file == ARF && reg >= BRW_ARF_ACCUMULATOR && + reg < BRW_ARF_FLAG ? &accum_dep : + NULL); + } + }; + + /** + * Dependency list handling. 
+ * @{ + */ + struct dependency_list { + dependency_list() : deps(NULL), n(0) {} + + ~dependency_list() + { + free(deps); + } + + void + push_back(const dependency &dep) + { + deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps)); + deps[n++] = dep; + } + + unsigned + size() const + { + return n; + } + + const dependency & + operator[](unsigned i) const + { + assert(i < n); + return deps[i]; + } + + dependency & + operator[](unsigned i) + { + assert(i < n); + return deps[i]; + } + + private: + dependency_list(const dependency_list &); + dependency_list & + operator=(const dependency_list &); + + dependency *deps; + unsigned n; + }; + + /** + * Add dependency \p dep to the list of dependencies of an instruction + * \p deps. + */ + void + add_dependency(const unsigned *ids, dependency_list &deps, dependency dep) + { + if (is_valid(dep)) { + /* Translate the unordered dependency token first in order to keep + * the list minimally redundant. + */ + if (dep.unordered) + dep.id = ids[dep.id]; + + /* Try to combine the specified dependency with any existing ones. */ + for (unsigned i = 0; i < deps.size(); i++) { + /* Don't combine otherwise matching dependencies if there is an + * exec_all mismatch which would cause a SET dependency to gain an + * exec_all flag, since that would prevent it from being baked + * into the instruction we want to allocate an SBID for. 
+ */ + if (deps[i].exec_all != dep.exec_all && + (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) && + (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET))) + continue; + + if (dep.ordered && deps[i].ordered) { + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) + deps[i].jp.jp[p] = MAX2(deps[i].jp.jp[p], dep.jp.jp[p]); + + deps[i].ordered |= dep.ordered; + deps[i].exec_all |= dep.exec_all; + dep.ordered = TGL_REGDIST_NULL; + } + + if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) { + deps[i].unordered |= dep.unordered; + deps[i].exec_all |= dep.exec_all; + dep.unordered = TGL_SBID_NULL; + } + } + + /* Add it to the end of the list if necessary. */ + if (is_valid(dep)) + deps.push_back(dep); + } + } + + /** + * Construct a tgl_swsb annotation encoding any ordered dependencies from + * the dependency list \p deps of an instruction with ordered_address \p + * jp. If \p exec_all is false only dependencies known to be executed with + * channel masking applied will be considered in the calculation. + */ + tgl_swsb + ordered_dependency_swsb(const dependency_list &deps, + const ordered_address &jp, + bool exec_all) + { + tgl_pipe p = TGL_PIPE_NONE; + unsigned min_dist = ~0u; + + for (unsigned i = 0; i < deps.size(); i++) { + if (deps[i].ordered && exec_all >= deps[i].exec_all) { + for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) { + const unsigned dist = jp.jp[q] - int64_t(deps[i].jp.jp[q]); + const unsigned max_dist = (q == IDX(TGL_PIPE_LONG) ? 14 : 10); + assert(jp.jp[q] > deps[i].jp.jp[q]); + if (dist <= max_dist) { + p = (p && IDX(p) != q ? TGL_PIPE_ALL : + tgl_pipe(TGL_PIPE_FLOAT + q)); + min_dist = MIN3(min_dist, dist, 7); + } + } + } + } + + return { p ? min_dist : 0, p }; + } + + /** + * Return whether the dependency list \p deps of an instruction with + * ordered_address \p jp has any non-trivial ordered dependencies. 
If \p + * exec_all is false only dependencies known to be executed with channel + * masking applied will be considered in the calculation. + */ + bool + find_ordered_dependency(const dependency_list &deps, + const ordered_address &jp, + bool exec_all) + { + return ordered_dependency_swsb(deps, jp, exec_all).regdist; + } + + /** + * Return the full tgl_sbid_mode bitset for the first unordered dependency + * on the list \p deps that matches the specified tgl_sbid_mode, or zero if + * no such dependency is present. If \p exec_all is false only + * dependencies known to be executed with channel masking applied will be + * considered in the calculation. + */ + tgl_sbid_mode + find_unordered_dependency(const dependency_list &deps, + tgl_sbid_mode unordered, + bool exec_all) + { + if (unordered) { + for (unsigned i = 0; i < deps.size(); i++) { + if ((unordered & deps[i].unordered) && + exec_all >= deps[i].exec_all) + return deps[i].unordered; + } + } + + return TGL_SBID_NULL; + } + + /** + * Return the tgl_sbid_mode bitset of an unordered dependency from the list + * \p deps that can be represented directly in the SWSB annotation of the + * instruction without additional SYNC instructions, or zero if no such + * dependency is present. 
+ */ + tgl_sbid_mode + baked_unordered_dependency_mode(const struct intel_device_info *devinfo, + const fs_inst *inst, + const dependency_list &deps, + const ordered_address &jp) + { + const bool exec_all = inst->force_writemask_all; + const bool has_ordered = find_ordered_dependency(deps, jp, exec_all); + const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp, + exec_all).pipe; + + if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all)) + return find_unordered_dependency(deps, TGL_SBID_SET, exec_all); + else if (has_ordered && is_unordered(devinfo, inst)) + return TGL_SBID_NULL; + else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) && + (!has_ordered || ordered_pipe == inferred_sync_pipe(devinfo, inst))) + return find_unordered_dependency(deps, TGL_SBID_DST, exec_all); + else if (!has_ordered) + return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all); + else + return TGL_SBID_NULL; + } + + /** + * Return whether an ordered dependency from the list \p deps can be + * represented directly in the SWSB annotation of the instruction without + * additional SYNC instructions. + */ + bool + baked_ordered_dependency_mode(const struct intel_device_info *devinfo, + const fs_inst *inst, + const dependency_list &deps, + const ordered_address &jp) + { + const bool exec_all = inst->force_writemask_all; + const bool has_ordered = find_ordered_dependency(deps, jp, exec_all); + const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp, + exec_all).pipe; + const tgl_sbid_mode unordered_mode = + baked_unordered_dependency_mode(devinfo, inst, deps, jp); + + if (!has_ordered) + return false; + else if (!unordered_mode) + return true; + else + return ordered_pipe == inferred_sync_pipe(devinfo, inst) && + unordered_mode == (is_unordered(devinfo, inst) ? TGL_SBID_SET : + TGL_SBID_DST); + } + + /** @} */ + + /** + * Shader instruction dependency calculation. 
    * @{
    */

   /**
    * Update scoreboard object \p sb to account for the execution of
    * instruction \p inst.
    */
   void
   update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps,
                          const fs_inst *inst, unsigned ip, scoreboard &sb)
   {
      const bool exec_all = inst->force_writemask_all;
      const struct intel_device_info *devinfo = shader->devinfo;
      /* In-order execution pipe of this instruction (if any), and its
       * ordered address within that pipe.
       */
      const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
      const ordered_address jp = p ? ordered_address(p, jps[ip].jp[IDX(p)]) :
                                 ordered_address();
      const bool is_ordered = ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL));
      /* Math instructions on pre-Gfx20 hardware (and DF operations routed
       * through the math pipe) read their sources asynchronously even
       * though they are not send-like instructions.
       */
      const bool is_unordered_math =
         (inst->is_math() && devinfo->ver < 20) ||
         (devinfo->has_64bit_float_via_math_pipe &&
          (get_exec_type(inst) == BRW_REGISTER_TYPE_DF ||
           inst->dst.type == BRW_REGISTER_TYPE_DF));

      /* Track any source registers that may be fetched asynchronously by this
       * instruction, otherwise clear the dependency in order to avoid
       * subsequent redundant synchronization.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         const dependency rd_dep =
            (inst->is_payload(i) ||
             inst->opcode == BRW_OPCODE_DPAS ||
             is_unordered_math) ? dependency(TGL_SBID_SRC, ip, exec_all) :
            is_ordered ? dependency(TGL_REGDIST_SRC, jp, exec_all) :
            dependency::done;

         for (unsigned j = 0; j < regs_read(inst, i); j++) {
            const fs_reg r = byte_offset(inst->src[i], REG_SIZE * j);
            sb.set(r, shadow(sb.get(r), rd_dep));
         }
      }

      if (inst->reads_accumulator_implicitly())
         sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));

      /* Sends that source their payload from MRF space read those registers
       * asynchronously as well.
       */
      if (is_send(inst) && inst->base_mrf != -1) {
         const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);

         for (unsigned j = 0; j < inst->mlen; j++)
            sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
      }

      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(devinfo, inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
         is_ordered ? dependency(TGL_REGDIST_DST, jp, exec_all) :
         dependency();

      if (inst->writes_accumulator_implicitly(devinfo))
         sb.set(brw_acc_reg(8), wr_dep);

      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
      }
   }

   /**
    * Calculate scoreboard objects locally that represent any pending (and
    * unconditionally resolved) dependencies at the end of each block of the
    * program.
    */
   scoreboard *
   gather_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
         update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);

      return sbs;
   }

   /**
    * Propagate data dependencies globally through the control flow graph
    * until a fixed point is reached.
    *
    * Calculates the set of dependencies potentially pending at the beginning
    * of each block, and returns it as an array of scoreboard objects.
    */
   scoreboard *
   propagate_block_scoreboards(const fs_visitor *shader,
                               const ordered_address *jps,
                               equivalence_relation &eq)
   {
      /* Per-block "deltas": the dependencies produced by executing each
       * block in isolation, computed by the local pass above.
       */
      const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
      /* in_sbs[b] accumulates the dependencies pending at the start of
       * block b (the result of this function).  out_sbs[b] caches the last
       * scoreboard propagated out of block b, and is used to detect when
       * the iteration has reached a fixed point.
       */
      scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
      scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

      /* Standard forward dataflow iteration: keep propagating until no
       * block's outgoing scoreboard changes.
       */
      for (bool progress = true; progress;) {
         progress = false;

         foreach_block(block, shader->cfg) {
            /* State at the end of the block: whatever came in, overridden
             * by anything the block itself produced.
             */
            const scoreboard sb = shadow(in_sbs[block->num],
                                         delta_sbs[block->num]);

            if (sb != out_sbs[block->num]) {
               foreach_list_typed(bblock_link, child_link, link,
                                  &block->children) {
                  scoreboard &in_sb = in_sbs[child_link->block->num];
                  int delta[IDX(TGL_PIPE_ALL)];

                  /* Shift ordered addresses into the successor block's
                   * local ordered_address space (see transport()).
                   */
                  for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                     delta[p] = jps[child_link->block->start_ip].jp[p]
                        - jps[block->end_ip].jp[p]
                        - ordered_unit(shader->devinfo,
                                       static_cast<fs_inst *>(block->end()), p);

                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               progress = true;
            }
         }
      }

      delete[] delta_sbs;
      delete[] out_sbs;

      return in_sbs;
   }

   /**
    * Return the list of potential dependencies of each instruction in the
    * shader based on the result of global dependency analysis.
+ */ + dependency_list * + gather_inst_dependencies(const fs_visitor *shader, + const ordered_address *jps) + { + const struct intel_device_info *devinfo = shader->devinfo; + equivalence_relation eq(num_instructions(shader)); + scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq); + const unsigned *ids = eq.flatten(); + dependency_list *deps = new dependency_list[num_instructions(shader)]; + unsigned ip = 0; + + foreach_block_and_inst(block, fs_inst, inst, shader->cfg) { + const bool exec_all = inst->force_writemask_all; + const tgl_pipe p = inferred_exec_pipe(devinfo, inst); + scoreboard &sb = sbs[block->num]; + + for (unsigned i = 0; i < inst->sources; i++) { + for (unsigned j = 0; j < regs_read(inst, i); j++) + add_dependency(ids, deps[ip], dependency_for_read( + sb.get(byte_offset(inst->src[i], REG_SIZE * j)))); + } + + if (inst->reads_accumulator_implicitly()) { + /* Wa_22012725308: + * + * "When the accumulator registers are used as source and/or + * destination, hardware does not ensure prevention of write + * after read hazard across execution pipes." 
+ */ + const dependency dep = sb.get(brw_acc_reg(8)); + if (dep.ordered && !is_single_pipe(dep.jp, p)) + add_dependency(ids, deps[ip], dep); + } + + if (is_send(inst) && inst->base_mrf != -1) { + for (unsigned j = 0; j < inst->mlen; j++) + add_dependency(ids, deps[ip], dependency_for_read( + sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0)))); + } + + if (is_unordered(devinfo, inst) && !inst->eot) + add_dependency(ids, deps[ip], + dependency(TGL_SBID_SET, ip, exec_all)); + + if (!inst->no_dd_check) { + if (inst->dst.file != BAD_FILE && !inst->dst.is_null() && + !inst->dst.is_accumulator()) { + for (unsigned j = 0; j < regs_written(inst); j++) { + add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst, + sb.get(byte_offset(inst->dst, REG_SIZE * j)))); + } + } + + if (inst->writes_accumulator_implicitly(devinfo) || + inst->dst.is_accumulator()) { + /* Wa_22012725308: + * + * "When the accumulator registers are used as source and/or + * destination, hardware does not ensure prevention of write + * after read hazard across execution pipes." + */ + const dependency dep = sb.get(brw_acc_reg(8)); + if (dep.ordered && !is_single_pipe(dep.jp, p)) + add_dependency(ids, deps[ip], dep); + } + + if (is_send(inst) && inst->base_mrf != -1) { + for (unsigned j = 0; j < inst->implied_mrf_writes(); j++) + add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst, + sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0)))); + } + } + + update_inst_scoreboard(shader, jps, inst, ip, sb); + ip++; + } + + delete[] sbs; + delete[] ids; + + return deps; + } + + /** @} */ + + /** + * Allocate SBID tokens to track the execution of every out-of-order + * instruction of the shader. + */ + dependency_list * + allocate_inst_dependencies(const fs_visitor *shader, + const dependency_list *deps0) + { + /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in + * shaders with a large number of SEND messages. + * + * XXX - Use 32 SBIDs on Xe2+ while in large GRF mode. 
+ */ + const unsigned num_sbids = 16; + + /* Allocate an unordered dependency ID to hardware SBID translation + * table with as many entries as instructions there are in the shader, + * which is the maximum number of unordered IDs we can find in the + * program. + */ + unsigned *ids = new unsigned[num_instructions(shader)]; + for (unsigned ip = 0; ip < num_instructions(shader); ip++) + ids[ip] = ~0u; + + dependency_list *deps1 = new dependency_list[num_instructions(shader)]; + unsigned next_id = 0; + + for (unsigned ip = 0; ip < num_instructions(shader); ip++) { + for (unsigned i = 0; i < deps0[ip].size(); i++) { + const dependency &dep = deps0[ip][i]; + + if (dep.unordered && ids[dep.id] == ~0u) + ids[dep.id] = (next_id++) & (num_sbids - 1); + + add_dependency(ids, deps1[ip], dep); + } + } + + delete[] ids; + + return deps1; + } + + /** + * Emit dependency information provided by \p deps into the shader, + * inserting additional SYNC instructions for dependencies that can't be + * represented directly by annotating existing instructions. + */ + void + emit_inst_dependencies(fs_visitor *shader, + const ordered_address *jps, + const dependency_list *deps) + { + const struct intel_device_info *devinfo = shader->devinfo; + unsigned ip = 0; + + foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) { + const bool exec_all = inst->force_writemask_all; + const bool ordered_mode = + baked_ordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]); + const tgl_sbid_mode unordered_mode = + baked_unordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]); + tgl_swsb swsb = !ordered_mode ? 
tgl_swsb() : + ordered_dependency_swsb(deps[ip], jps[ip], exec_all); + + for (unsigned i = 0; i < deps[ip].size(); i++) { + const dependency &dep = deps[ip][i]; + + if (dep.unordered) { + if (unordered_mode == dep.unordered && + exec_all >= dep.exec_all && !swsb.mode) { + /* Bake unordered dependency into the instruction's SWSB if + * possible, except in cases where the current instruction + * isn't marked NoMask but the dependency is, since that + * might lead to data coherency issues due to + * Wa_1407528679. + */ + swsb.sbid = dep.id; + swsb.mode = dep.unordered; + } else { + /* Emit dependency into the SWSB of an extra SYNC + * instruction. + */ + const fs_builder ibld = fs_builder(shader, block, inst) + .exec_all().group(1, 0); + fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(), + brw_imm_ud(TGL_SYNC_NOP)); + sync->sched.sbid = dep.id; + sync->sched.mode = dep.unordered; + assert(!(sync->sched.mode & TGL_SBID_SET)); + } + } + } + + for (unsigned i = 0; i < deps[ip].size(); i++) { + const dependency &dep = deps[ip][i]; + + if (dep.ordered && + find_ordered_dependency(deps[ip], jps[ip], true) && + (!ordered_mode || dep.exec_all > exec_all)) { + /* If the current instruction is not marked NoMask but an + * ordered dependency is, perform the synchronization as a + * separate NoMask SYNC instruction in order to avoid data + * coherency issues due to Wa_1407528679. The similar + * scenario with unordered dependencies should have been + * handled above. + */ + const fs_builder ibld = fs_builder(shader, block, inst) + .exec_all().group(1, 0); + fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(), + brw_imm_ud(TGL_SYNC_NOP)); + sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true); + break; + } + } + + /* Update the IR. 
*/ + inst->sched = swsb; + inst->no_dd_check = inst->no_dd_clear = false; + ip++; + } + } +} + +bool +fs_visitor::lower_scoreboard() +{ + if (devinfo->ver >= 12) { + const ordered_address *jps = ordered_inst_addresses(this); + const dependency_list *deps0 = gather_inst_dependencies(this, jps); + const dependency_list *deps1 = allocate_inst_dependencies(this, deps0); + emit_inst_dependencies(this, jps, deps1); + delete[] deps1; + delete[] deps0; + delete[] jps; + } + + return true; +} diff --git a/src/intel/compiler/elk/brw_fs_sel_peephole.cpp b/src/intel/compiler/elk/brw_fs_sel_peephole.cpp new file mode 100644 index 00000000000..1b7fd14e59e --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_sel_peephole.cpp @@ -0,0 +1,229 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_fs.h" +#include "brw_fs_builder.h" +#include "brw_cfg.h" + +/** @file brw_fs_sel_peephole.cpp + * + * This file contains the opt_peephole_sel() optimization pass that replaces + * MOV instructions to the same destination in the "then" and "else" bodies of + * an if statement with SEL instructions. + */ + +/* Four MOVs seems to be pretty typical, so I picked the next power of two in + * the hopes that it would handle almost anything possible in a single + * pass. + */ +#define MAX_MOVS 8 /**< The maximum number of MOVs to attempt to match. */ + +using namespace brw; + +/** + * Scans forwards from an IF counting consecutive MOV instructions in the + * "then" and "else" blocks of the if statement. + * + * A pointer to the bblock_t following the IF is passed as the + * argument. The function stores pointers to the MOV instructions in the + * and arrays. + * + * \return the minimum number of MOVs found in the two branches or zero if + * an error occurred. + * + * E.g.: + * IF ... + * then_mov[0] = MOV g4, ... + * then_mov[1] = MOV g5, ... + * then_mov[2] = MOV g6, ... + * ELSE ... + * else_mov[0] = MOV g4, ... + * else_mov[1] = MOV g5, ... + * else_mov[2] = MOV g7, ... + * ENDIF + * returns 3. + */ +static int +count_movs_from_if(const intel_device_info *devinfo, + fs_inst *then_mov[MAX_MOVS], fs_inst *else_mov[MAX_MOVS], + bblock_t *then_block, bblock_t *else_block) +{ + int then_movs = 0; + foreach_inst_in_block(fs_inst, inst, then_block) { + if (then_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV || + inst->flags_written(devinfo)) + break; + + then_mov[then_movs] = inst; + then_movs++; + } + + int else_movs = 0; + foreach_inst_in_block(fs_inst, inst, else_block) { + if (else_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV || + inst->flags_written(devinfo)) + break; + + else_mov[else_movs] = inst; + else_movs++; + } + + return MIN2(then_movs, else_movs); +} + +/** + * Try to replace IF/MOV+/ELSE/MOV+/ENDIF with SEL. 
+ * + * Many GLSL shaders contain the following pattern: + * + * x = condition ? foo : bar + * + * or + * + * if (...) a.xyzw = foo.xyzw; + * else a.xyzw = bar.xyzw; + * + * The compiler emits an ir_if tree for this, since each subexpression might be + * a complex tree that could have side-effects or short-circuit logic. + * + * However, the common case is to simply select one of two constants or + * variable values---which is exactly what SEL is for. In this case, the + * assembly looks like: + * + * (+f0) IF + * MOV dst src0 + * ... + * ELSE + * MOV dst src1 + * ... + * ENDIF + * + * where each pair of MOVs to a common destination and can be easily translated + * into + * + * (+f0) SEL dst src0 src1 + * + * If src0 is an immediate value, we promote it to a temporary GRF. + */ +bool +fs_visitor::opt_peephole_sel() +{ + bool progress = false; + + foreach_block (block, cfg) { + /* IF instructions, by definition, can only be found at the ends of + * basic blocks. + */ + fs_inst *if_inst = (fs_inst *)block->end(); + if (if_inst->opcode != BRW_OPCODE_IF) + continue; + + fs_inst *else_mov[MAX_MOVS] = { NULL }; + fs_inst *then_mov[MAX_MOVS] = { NULL }; + + bblock_t *then_block = block->next(); + bblock_t *else_block = NULL; + foreach_list_typed(bblock_link, child, link, &block->children) { + if (child->block != then_block) { + if (child->block->prev()->end()->opcode == BRW_OPCODE_ELSE) { + else_block = child->block; + } + break; + } + } + if (else_block == NULL) + continue; + + int movs = count_movs_from_if(devinfo, then_mov, else_mov, then_block, else_block); + + if (movs == 0) + continue; + + /* Generate SEL instructions for pairs of MOVs to a common destination. */ + for (int i = 0; i < movs; i++) { + if (!then_mov[i] || !else_mov[i]) + break; + + /* Check that the MOVs are the right form. 
*/ + if (!then_mov[i]->dst.equals(else_mov[i]->dst) || + then_mov[i]->exec_size != else_mov[i]->exec_size || + then_mov[i]->group != else_mov[i]->group || + then_mov[i]->force_writemask_all != else_mov[i]->force_writemask_all || + then_mov[i]->is_partial_write() || + else_mov[i]->is_partial_write() || + then_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE || + else_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE) { + movs = i; + break; + } + + /* Check that source types for mov operations match. */ + if (then_mov[i]->src[0].type != else_mov[i]->src[0].type) { + movs = i; + break; + } + } + + if (movs == 0) + continue; + + for (int i = 0; i < movs; i++) { + const fs_builder ibld = fs_builder(this, then_block, then_mov[i]) + .at(block, if_inst); + + if (then_mov[i]->src[0].equals(else_mov[i]->src[0])) { + ibld.MOV(then_mov[i]->dst, then_mov[i]->src[0]); + } else { + /* Only the last source register can be a constant, so if the MOV + * in the "then" clause uses a constant, we need to put it in a + * temporary. + */ + fs_reg src0(then_mov[i]->src[0]); + if (src0.file == IMM) { + src0 = ibld.vgrf(then_mov[i]->src[0].type); + ibld.MOV(src0, then_mov[i]->src[0]); + } + + /* 64-bit immediates can't be placed in src1. 
*/ + fs_reg src1(else_mov[i]->src[0]); + if (src1.file == IMM && type_sz(src1.type) == 8) { + src1 = ibld.vgrf(else_mov[i]->src[0].type); + ibld.MOV(src1, else_mov[i]->src[0]); + } + + set_predicate_inv(if_inst->predicate, if_inst->predicate_inverse, + ibld.SEL(then_mov[i]->dst, src0, src1)); + } + + then_mov[i]->remove(then_block); + else_mov[i]->remove(else_block); + } + + progress = true; + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} diff --git a/src/intel/compiler/elk/brw_fs_thread_payload.cpp b/src/intel/compiler/elk/brw_fs_thread_payload.cpp new file mode 100644 index 00000000000..b78567fa2d1 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_thread_payload.cpp @@ -0,0 +1,605 @@ +/* + * Copyright © 2006-2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

/* Vertex shader payload: R0 thread header, R1 URB handles. */
vs_thread_payload::vs_thread_payload(const fs_visitor &v)
{
   unsigned r = 0;

   /* R0: Thread header. */
   r += reg_unit(v.devinfo);

   /* R1: URB handles. */
   urb_handles = brw_ud8_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}

/* Tessellation control shader payload.  Layout differs between the
 * SINGLE_PATCH and MULTI_PATCH dispatch modes.
 */
tcs_thread_payload::tcs_thread_payload(const fs_visitor &v)
{
   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(v.prog_data);
   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) v.key;

   if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH) {
      patch_urb_output = brw_ud1_grf(0, 0);
      primitive_id = brw_vec1_grf(0, 1);

      /* r1-r4 contain the ICP handles. */
      icp_handle_start = brw_ud8_grf(1, 0);

      num_regs = 5;
   } else {
      assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
      assert(tcs_key->input_vertices <= BRW_MAX_TCS_INPUT_VERTICES);

      unsigned r = 0;

      r += reg_unit(v.devinfo);

      patch_urb_output = brw_ud8_grf(r, 0);
      r += reg_unit(v.devinfo);

      if (tcs_prog_data->include_primitive_id) {
         primitive_id = brw_vec8_grf(r, 0);
         r += reg_unit(v.devinfo);
      }

      /* ICP handles occupy the next 1-32 registers. */
      icp_handle_start = brw_ud8_grf(r, 0);
      r += brw_tcs_prog_key_input_vertices(tcs_key) * reg_unit(v.devinfo);

      num_regs = r;
   }
}

/* Tessellation evaluation shader payload. */
tes_thread_payload::tes_thread_payload(const fs_visitor &v)
{
   unsigned r = 0;

   /* R0: Thread Header. */
   patch_urb_input = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
   primitive_id = brw_vec1_grf(0, 1);
   r += reg_unit(v.devinfo);

   /* R1-3: gl_TessCoord.xyz. */
   for (unsigned i = 0; i < 3; i++) {
      coords[i] = brw_vec8_grf(r, 0);
      r += reg_unit(v.devinfo);
   }

   /* R4: URB output handles. */
   urb_output = brw_ud8_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}

/* Geometry shader payload.  Non-const visitor because it emits the AND/SHR
 * instructions that unpack the URB handle and instance ID fields.
 */
gs_thread_payload::gs_thread_payload(fs_visitor &v)
{
   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(v.prog_data);
   const fs_builder bld = fs_builder(&v).at_end();

   /* R0: thread header. */
   unsigned r = reg_unit(v.devinfo);

   /* R1: output URB handles. */
   urb_handles = bld.vgrf(BRW_REGISTER_TYPE_UD);
   bld.AND(urb_handles, brw_ud8_grf(r, 0),
           v.devinfo->ver >= 20 ? brw_imm_ud(0xFFFFFF) : brw_imm_ud(0xFFFF));

   /* R1: Instance ID stored in bits 31:27 */
   instance_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
   bld.SHR(instance_id, brw_ud8_grf(r, 0), brw_imm_ud(27u));

   r += reg_unit(v.devinfo);

   if (gs_prog_data->include_primitive_id) {
      primitive_id = brw_ud8_grf(r, 0);
      r += reg_unit(v.devinfo);
   }

   /* Always enable VUE handles so we can safely use pull model if needed.
    *
    * The push model for a GS uses a ton of register space even for trivial
    * scenarios with just a few inputs, so just make things easier and a bit
    * safer by always having pull model available.
    */
   gs_prog_data->base.include_vue_handles = true;

   /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
   icp_handle_start = brw_ud8_grf(r, 0);
   r += v.nir->info.gs.vertices_in * reg_unit(v.devinfo);

   num_regs = r;

   /* Use a maximum of 24 registers for push-model inputs. */
   const unsigned max_push_components = 24;

   /* If pushing our inputs would take too many registers, reduce the URB read
    * length (which is in HWords, or 8 registers), and resort to pulling.
    *
    * Note that the GS reads HWords for every vertex - so we
    * have to multiply by VerticesIn to obtain the total storage requirement.
    */
   if (8 * vue_prog_data->urb_read_length * v.nir->info.gs.vertices_in >
       max_push_components) {
      vue_prog_data->urb_read_length =
         ROUND_DOWN_TO(max_push_components / v.nir->info.gs.vertices_in, 8) / 8;
   }
}

/* Fragment shader payload layout for Xe2 (gfx20+). */
static inline void
setup_fs_payload_gfx20(fs_thread_payload &payload,
                       const fs_visitor &v,
                       bool &source_depth_to_render_target)
{
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
   const unsigned payload_width = 16;
   assert(v.dispatch_width % payload_width == 0);
   assert(v.devinfo->ver >= 20);

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R0-1: PS thread payload header, masks and pixel X/Y coordinates. */
      payload.num_regs++;
      payload.subspan_coord_reg[j] = payload.num_regs++;
   }

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R2-13: Barycentric interpolation coordinates. These appear
       * in the same order that they appear in the brw_barycentric_mode
       * enum. Each set of coordinates occupies 2 64B registers per
       * SIMD16 half. Coordinates only appear if they were enabled
       * using the "Barycentric Interpolation Mode" bits in WM_STATE.
       */
      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
         if (prog_data->barycentric_interp_modes & (1 << i)) {
            payload.barycentric_coord_reg[i][j] = payload.num_regs;
            payload.num_regs += payload_width / 4;
         }
      }

      /* R14: Interpolated depth if "Pixel Shader Uses Source Depth" is set. */
      if (prog_data->uses_src_depth) {
         payload.source_depth_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R15: Interpolated W if "Pixel Shader Uses Source W" is set. */
      if (prog_data->uses_src_w) {
         payload.source_w_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R16: MSAA input coverage mask if "Pixel Shader Uses Input
       * Coverage Mask" is set.
       */
      if (prog_data->uses_sample_mask) {
         payload.sample_mask_in_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R19: MSAA position XY offsets if "Position XY Offset Select"
       * is either POSOFFSET_CENTROID or POSOFFSET_SAMPLE. Note that
       * this is delivered as a single SIMD32 vector, inconsistently
       * with most other PS payload fields.
       */
      if (prog_data->uses_pos_offset && j == 0) {
         for (unsigned k = 0; k < 2; k++) {
            payload.sample_pos_reg[k] = payload.num_regs;
            payload.num_regs++;
         }
      }
   }

   if (prog_data->uses_depth_w_coefficients) {
      assert(v.max_polygons == 1);
      payload.depth_w_coef_reg = payload.num_regs;
      payload.num_regs += 2;
   }

   if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}

/* Fragment shader payload layout for gfx6 through gfx12.x. */
static inline void
setup_fs_payload_gfx6(fs_thread_payload &payload,
                      const fs_visitor &v,
                      bool &source_depth_to_render_target)
{
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);

   const unsigned payload_width = MIN2(16, v.dispatch_width);
   assert(v.dispatch_width % payload_width == 0);
   assert(v.devinfo->ver >= 6 && v.devinfo->ver < 20);

   payload.num_regs = 0;

   /* R0: PS thread payload header. */
   payload.num_regs++;

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R1: masks, pixel X/Y coordinates. */
      payload.subspan_coord_reg[j] = payload.num_regs++;
   }

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R3-26: barycentric interpolation coordinates. These appear in the
       * same order that they appear in the brw_barycentric_mode enum. Each
       * set of coordinates occupies 2 registers if dispatch width == 8 and 4
       * registers if dispatch width == 16. Coordinates only appear if they
       * were enabled using the "Barycentric Interpolation Mode" bits in
       * WM_STATE.
       */
      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
         if (prog_data->barycentric_interp_modes & (1 << i)) {
            payload.barycentric_coord_reg[i][j] = payload.num_regs;
            payload.num_regs += payload_width / 4;
         }
      }

      /* R27-28: interpolated depth if uses source depth */
      if (prog_data->uses_src_depth) {
         payload.source_depth_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R29-30: interpolated W set if GFX6_WM_USES_SOURCE_W. */
      if (prog_data->uses_src_w) {
         payload.source_w_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R31: MSAA position offsets. */
      if (prog_data->uses_pos_offset) {
         payload.sample_pos_reg[j] = payload.num_regs;
         payload.num_regs++;
      }

      /* R32-33: MSAA input coverage mask */
      if (prog_data->uses_sample_mask) {
         assert(v.devinfo->ver >= 7);
         payload.sample_mask_in_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }
   }

   /* R66: Source Depth and/or W Attribute Vertex Deltas */
   if (prog_data->uses_depth_w_coefficients) {
      assert(v.max_polygons == 1);
      payload.depth_w_coef_reg = payload.num_regs;
      payload.num_regs++;
   }

   if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}

#undef P /* prompted depth */
#undef C /* computed */
#undef N /* non-promoted? */

#define P 0
#define C 1
#define N 2

/* Per-lookup description of the pre-gfx6 windower interpolation ("IZ")
 * behavior, indexed by the BRW_WM_IZ_* bits of the program key.
 */
static const struct {
   GLuint mode:2;
   GLuint sd_present:1;
   GLuint sd_to_rt:1;
   GLuint dd_present:1;
   GLuint ds_present:1;
} wm_iz_table[BRW_WM_IZ_BIT_MAX] =
{
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 0 },
   { N, 0, 1, 0, 0 },
   { N, 0, 1, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { C, 0, 1, 1, 0 },
   { C, 0, 1, 1, 0 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 0 },
   { C, 0, 1, 1, 0 },
   { C, 0, 1, 1, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 0 },
   { N, 0, 1, 0, 0 },
   { N, 0, 1, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { C, 0, 1, 1, 0 },
   { C, 0, 1, 1, 0 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 0 },
   { C, 0, 1, 1, 0 },
   { C, 0, 1, 1, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 1 },
   { N, 0, 1, 0, 1 },
   { N, 0, 1, 0, 1 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { C, 0, 1, 1, 1 },
   { C, 0, 1, 1, 1 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 1 },
   { C, 0, 1, 1, 1 },
   { C, 0, 1, 1, 1 },
   { P, 0, 0, 0, 0 },
   { C, 0, 0, 0, 1 },
   { P, 0, 0, 0, 0 },
   { C, 0, 1, 0, 1 },
   { P, 0, 0, 0, 0 },
   { C, 1, 1, 0, 1 },
   { C, 0, 1, 0, 1 },
   { C, 0, 1, 0, 1 },
   { P, 0, 0, 0, 0 },
   { C, 1, 1, 1, 1 },
   { C, 0, 1, 1, 1 },
   { C, 0, 1, 1, 1 },
   { P, 0, 0, 0, 0 },
   { C, 1, 1, 1, 1 },
   { C, 0, 1, 1, 1 },
   { C, 0, 1, 1, 1 }
};

/**
 * \param line_aa BRW_NEVER, BRW_ALWAYS or BRW_SOMETIMES
 * \param lookup bitmask of BRW_WM_IZ_* flags
 */
static inline void
setup_fs_payload_gfx4(fs_thread_payload &payload,
                      const fs_visitor &v,
                      bool &source_depth_to_render_target,
                      bool &runtime_check_aads_emit)
{
   assert(v.dispatch_width <= 16);

   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
   brw_wm_prog_key *key = (brw_wm_prog_key *) v.key;

   GLuint reg = 1;
   bool kill_stats_promoted_workaround = false;
   int lookup = key->iz_lookup;

   assert(lookup < BRW_WM_IZ_BIT_MAX);

   /* Crazy workaround in the windowizer, which we need to track in
    * our register allocation and render target writes. See the "If
    * statistics are enabled..." paragraph of 11.5.3.2: Early Depth
    * Test Cases [Pre-DevGT] of the 3D Pipeline - Windower B-Spec.
    */
   if (key->stats_wm &&
       (lookup & BRW_WM_IZ_PS_KILL_ALPHATEST_BIT) &&
       wm_iz_table[lookup].mode == P) {
      kill_stats_promoted_workaround = true;
   }

   payload.subspan_coord_reg[0] = reg++;

   if (wm_iz_table[lookup].sd_present || prog_data->uses_src_depth ||
       kill_stats_promoted_workaround) {
      payload.source_depth_reg[0] = reg;
      reg += 2;
   }

   if (wm_iz_table[lookup].sd_to_rt || kill_stats_promoted_workaround)
      source_depth_to_render_target = true;

   if (wm_iz_table[lookup].ds_present || key->line_aa != BRW_NEVER) {
      payload.aa_dest_stencil_reg[0] = reg;
      runtime_check_aads_emit =
         !wm_iz_table[lookup].ds_present && key->line_aa == BRW_SOMETIMES;
      reg++;
   }

   if (wm_iz_table[lookup].dd_present) {
      payload.dest_depth_reg[0] = reg;
      reg+=2;
   }

   payload.num_regs = reg;
}

#undef P /* prompted depth */
#undef C /* computed */
#undef N /* non-promoted? */

/* Dispatch to the generation-specific FS payload setup above. */
fs_thread_payload::fs_thread_payload(const fs_visitor &v,
                                     bool &source_depth_to_render_target,
                                     bool &runtime_check_aads_emit)
  : subspan_coord_reg(),
    source_depth_reg(),
    source_w_reg(),
    aa_dest_stencil_reg(),
    dest_depth_reg(),
    sample_pos_reg(),
    sample_mask_in_reg(),
    depth_w_coef_reg(),
    barycentric_coord_reg()
{
   if (v.devinfo->ver >= 20)
      setup_fs_payload_gfx20(*this, v, source_depth_to_render_target);
   else if (v.devinfo->ver >= 6)
      setup_fs_payload_gfx6(*this, v, source_depth_to_render_target);
   else
      setup_fs_payload_gfx4(*this, v, source_depth_to_render_target,
                            runtime_check_aads_emit);
}

/* Compute shader payload. */
cs_thread_payload::cs_thread_payload(const fs_visitor &v)
{
   struct brw_cs_prog_data *prog_data = brw_cs_prog_data(v.prog_data);

   unsigned r = reg_unit(v.devinfo);

   /* See nir_setup_uniforms for subgroup_id in earlier versions. */
   if (v.devinfo->verx10 >= 125) {
      subgroup_id_ = brw_ud1_grf(0, 2);

      for (int i = 0; i < 3; i++) {
         if (prog_data->generate_local_id & (1 << i)) {
            local_invocation_id[i] = brw_uw8_grf(r, 0);
            r += reg_unit(v.devinfo);
            if (v.devinfo->ver < 20 && v.dispatch_width == 32)
               r += reg_unit(v.devinfo);
         } else {
            local_invocation_id[i] = brw_imm_uw(0);
         }
      }

      /* TODO: Fill out uses_btd_stack_ids automatically */
      if (prog_data->uses_btd_stack_ids)
         r += reg_unit(v.devinfo);
   }

   num_regs = r;
}

/* Load the 8-bit subgroup ID into dest, either from the payload (gfx12.5+)
 * or from a pushed uniform on older platforms.
 */
void
cs_thread_payload::load_subgroup_id(const fs_builder &bld,
                                    fs_reg &dest) const
{
   auto devinfo = bld.shader->devinfo;
   dest = retype(dest, BRW_REGISTER_TYPE_UD);

   if (subgroup_id_.file != BAD_FILE) {
      assert(devinfo->verx10 >= 125);
      bld.AND(dest, subgroup_id_, brw_imm_ud(INTEL_MASK(7, 0)));
   } else {
      assert(devinfo->verx10 < 125);
      assert(gl_shader_stage_is_compute(bld.shader->stage));
      int index = brw_get_subgroup_id_param_index(devinfo,
                                                  bld.shader->stage_prog_data);
      bld.MOV(dest, fs_reg(UNIFORM, index, BRW_REGISTER_TYPE_UD));
   }
}

/* Task/Mesh shader payload, built on top of the CS payload. */
task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
   : cs_thread_payload(v)
{
   /* Task and Mesh Shader Payloads (SIMD8 and SIMD16)
    *
    *  R0: Header
    *  R1: Local_ID.X[0-7 or 0-15]
    *  R2: Inline Parameter
    *
    * Task and Mesh Shader Payloads (SIMD32)
    *
    *  R0: Header
    *  R1: Local_ID.X[0-15]
    *  R2: Local_ID.X[16-31]
    *  R3: Inline Parameter
    *
    * Local_ID.X values are 16 bits.
    *
    * Inline parameter is optional but always present since we use it to pass
    * the address to descriptors.
    */

   const fs_builder bld = fs_builder(&v).at_end();

   unsigned r = 0;
   assert(subgroup_id_.file != BAD_FILE);
   extended_parameter_0 = retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);

   if (v.devinfo->ver >= 20) {
      urb_output = brw_ud1_grf(1, 0);
   } else {
      urb_output = bld.vgrf(BRW_REGISTER_TYPE_UD);
      /* In both mesh and task shader payload, lower 16 bits of g0.6 is
       * an offset within Slice's Local URB, which says where shader is
       * supposed to output its data.
       */
      bld.AND(urb_output, brw_ud1_grf(0, 6), brw_imm_ud(0xFFFF));
   }

   if (v.stage == MESA_SHADER_MESH) {
      /* g0.7 is Task Shader URB Entry Offset, which contains both an offset
       * within Slice's Local URB (bits 0:15) and a slice selector
       * (bits 16:24). Slice selector can be non zero when mesh shader
       * is spawned on slice other than the one where task shader was run.
       * Bit 24 says that Slice ID is present and bits 16:23 is the Slice ID.
       */
      task_urb_input = brw_ud1_grf(0, 7);
   }
   r += reg_unit(v.devinfo);

   local_index = brw_uw8_grf(r, 0);
   r += reg_unit(v.devinfo);
   if (v.devinfo->ver < 20 && v.dispatch_width == 32)
      r += reg_unit(v.devinfo);

   inline_parameter = brw_ud1_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}

/* Bindless (ray tracing) shader payload. */
bs_thread_payload::bs_thread_payload(const fs_visitor &v)
{
   unsigned r = 0;

   /* R0: Thread header. */
   r += reg_unit(v.devinfo);

   /* R1: Stack IDs. */
   r += reg_unit(v.devinfo);

   /* R2: Inline Parameter.  Used for argument addresses. */
   global_arg_ptr = brw_ud1_grf(r, 0);
   local_arg_ptr = brw_ud1_grf(r, 2);
   r += reg_unit(v.devinfo);

   num_regs = r;
}

/* Extract the 4-bit BTD shader type from the thread header into dest. */
void
bs_thread_payload::load_shader_type(const fs_builder &bld, fs_reg &dest) const
{
   fs_reg ud_dest = retype(dest, BRW_REGISTER_TYPE_UD);
   bld.MOV(ud_dest, retype(brw_vec1_grf(0, 3), ud_dest.type));
   bld.AND(ud_dest, ud_dest, brw_imm_ud(0xf));
}
diff --git a/src/intel/compiler/elk/brw_fs_validate.cpp b/src/intel/compiler/elk/brw_fs_validate.cpp
new file mode 100644
index 00000000000..499bc8181c3
--- /dev/null
+++ b/src/intel/compiler/elk/brw_fs_validate.cpp
@@ -0,0 +1,199 @@
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_validate.cpp
 *
 * Implements a pass that validates various invariants of the IR. The current
 * pass only validates that GRF's uses are sane.
More can be added later.
 */

#include "brw_fs.h"
#include "brw_cfg.h"

/* Abort with a diagnostic dump of the offending instruction when the
 * condition does not hold.  Expects `stage` and `inst` in scope.
 */
#define fsv_assert(assertion)                                           \
   {                                                                    \
      if (!(assertion)) {                                               \
         fprintf(stderr, "ASSERT: Scalar %s validation failed!\n",      \
                 _mesa_shader_stage_to_abbrev(stage));                  \
         dump_instruction(inst, stderr);                                \
         fprintf(stderr, "%s:%d: '%s' failed\n", __FILE__, __LINE__, #assertion); \
         abort();                                                       \
      }                                                                 \
   }

/* Equality variant; prints both operand values on failure. */
#define fsv_assert_eq(first, second)                                    \
   {                                                                    \
      unsigned f = (first);                                             \
      unsigned s = (second);                                            \
      if (f != s) {                                                     \
         fprintf(stderr, "ASSERT: Scalar %s validation failed!\n",      \
                 _mesa_shader_stage_to_abbrev(stage));                  \
         dump_instruction(inst, stderr);                                \
         fprintf(stderr, "%s:%d: A == B failed\n", __FILE__, __LINE__); \
         fprintf(stderr, "  A = %s = %u\n", #first, f);                 \
         fprintf(stderr, "  B = %s = %u\n", #second, s);                \
         abort();                                                       \
      }                                                                 \
   }

/* Inequality variant. */
#define fsv_assert_ne(first, second)                                    \
   {                                                                    \
      unsigned f = (first);                                             \
      unsigned s = (second);                                            \
      if (f == s) {                                                     \
         fprintf(stderr, "ASSERT: Scalar %s validation failed!\n",      \
                 _mesa_shader_stage_to_abbrev(stage));                  \
         dump_instruction(inst, stderr);                                \
         fprintf(stderr, "%s:%d: A != B failed\n", __FILE__, __LINE__); \
         fprintf(stderr, "  A = %s = %u\n", #first, f);                 \
         fprintf(stderr, "  B = %s = %u\n", #second, s);                \
         abort();                                                       \
      }                                                                 \
   }

/* Less-or-equal variant; note operands compare as unsigned. */
#define fsv_assert_lte(first, second)                                   \
   {                                                                    \
      unsigned f = (first);                                             \
      unsigned s = (second);                                            \
      if (f > s) {                                                      \
         fprintf(stderr, "ASSERT: Scalar %s validation failed!\n",      \
                 _mesa_shader_stage_to_abbrev(stage));                  \
         dump_instruction(inst, stderr);                                \
         fprintf(stderr, "%s:%d: A <= B failed\n", __FILE__, __LINE__); \
         fprintf(stderr, "  A = %s = %u\n", #first, f);                 \
         fprintf(stderr, "  B = %s = %u\n", #second, s);                \
         abort();                                                       \
      }                                                                 \
   }

#ifndef NDEBUG
void
fs_visitor::validate()
{
   cfg->validate(_mesa_shader_stage_to_abbrev(stage));

   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      switch (inst->opcode) {
      case SHADER_OPCODE_SEND:
         fsv_assert(is_uniform(inst->src[0]) && is_uniform(inst->src[1]));
         break;

      case BRW_OPCODE_MOV:
         fsv_assert(inst->sources == 1);
         break;

      default:
         break;
      }

      if (inst->is_3src(compiler)) {
         const unsigned integer_sources =
            brw_reg_type_is_integer(inst->src[0].type) +
            brw_reg_type_is_integer(inst->src[1].type) +
            brw_reg_type_is_integer(inst->src[2].type);
         const unsigned float_sources =
            brw_reg_type_is_floating_point(inst->src[0].type) +
            brw_reg_type_is_floating_point(inst->src[1].type) +
            brw_reg_type_is_floating_point(inst->src[2].type);

         fsv_assert((integer_sources == 3 && float_sources == 0) ||
                    (integer_sources == 0 && float_sources == 3));

         if (devinfo->ver >= 10) {
            for (unsigned i = 0; i < 3; i++) {
               if (inst->src[i].file == BRW_IMMEDIATE_VALUE)
                  continue;

               switch (inst->src[i].vstride) {
               case BRW_VERTICAL_STRIDE_0:
               case BRW_VERTICAL_STRIDE_4:
               case BRW_VERTICAL_STRIDE_8:
               case BRW_VERTICAL_STRIDE_16:
                  break;

               case BRW_VERTICAL_STRIDE_1:
                  fsv_assert_lte(12, devinfo->ver);
                  break;

               case BRW_VERTICAL_STRIDE_2:
                  fsv_assert_lte(devinfo->ver, 11);
                  break;

               default:
                  fsv_assert(!"invalid vstride");
                  break;
               }
            }
         } else if (grf_used != 0) {
            /* Only perform the pre-Gfx10 checks after register allocation has
             * occurred.
             *
             * Many passes (e.g., constant copy propagation) will generate
             * invalid 3-source instructions with the expectation that later
             * passes (e.g., combine constants) will fix them.
             */
            for (unsigned i = 0; i < 3; i++) {
               fsv_assert_ne(inst->src[i].file, BRW_IMMEDIATE_VALUE);

               /* A stride of 1 (the usual case) or 0, with a special
                * "repctrl" bit, is allowed. The repctrl bit doesn't work for
                * 64-bit datatypes, so if the source type is 64-bit then only
                * a stride of 1 is allowed. From the Broadwell PRM, Volume 7
                * "3D Media GPGPU", page 944:
                *
                *    This is applicable to 32b datatypes and 16b datatype. 64b
                *    datatypes cannot use the replicate control.
                */
               fsv_assert_lte(inst->src[i].vstride, 1);

               if (type_sz(inst->src[i].type) > 4)
                  fsv_assert_eq(inst->src[i].vstride, 1);
            }
         }
      }

      if (inst->dst.file == VGRF) {
         fsv_assert_lte(inst->dst.offset / REG_SIZE + regs_written(inst),
                        alloc.sizes[inst->dst.nr]);
      }

      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            fsv_assert_lte(inst->src[i].offset / REG_SIZE + regs_read(inst, i),
                           alloc.sizes[inst->src[i].nr]);
         }
      }

      /* Accumulator Registers, bspec 47251:
       *
       *    "When destination is accumulator with offset 0, destination
       *    horizontal stride must be 1."
       */
      if (intel_needs_workaround(devinfo, 14014617373) &&
          inst->dst.is_accumulator() &&
          inst->dst.offset == 0) {
         fsv_assert_eq(inst->dst.stride, 1);
      }
   }
}
#endif
diff --git a/src/intel/compiler/elk/brw_fs_visitor.cpp b/src/intel/compiler/elk/brw_fs_visitor.cpp
new file mode 100644
index 00000000000..9f7f1befd83
--- /dev/null
+++ b/src/intel/compiler/elk/brw_fs_visitor.cpp
@@ -0,0 +1,1266 @@
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_visitor.cpp
 *
 * This file supports generating the FS LIR from the GLSL IR. The LIR
 * makes it easier to do backend-specific optimizations than doing so
 * in the GLSL IR or in the native code.
 */
#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_nir.h"
#include "compiler/glsl_types.h"

using namespace brw;

/* Input data is organized with first the per-primitive values, followed
 * by per-vertex values.  The per-vertex will have interpolation information
 * associated, so use 4 components for each value.
 */

/* The register location here is relative to the start of the URB
 * data.  It will get adjusted to be a real location before
 * generate_code() time.
 */
fs_reg
fs_visitor::interp_reg(const fs_builder &bld, unsigned location,
                       unsigned channel, unsigned comp)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   /* Must not be a per-primitive input; those go through per_primitive_reg(). */
   assert(BITFIELD64_BIT(location) & ~nir->info.per_primitive_inputs);

   const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   assert(prog_data->urb_setup[location] >= 0);
   unsigned nr = prog_data->urb_setup[location];
   channel += prog_data->urb_setup_channel[location];

   /* Adjust so we start counting from the first per_vertex input. */
   assert(nr >= prog_data->num_per_primitive_inputs);
   nr -= prog_data->num_per_primitive_inputs;

   const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
   const unsigned regnr = per_vertex_start + (nr * 4) + channel;

   if (max_polygons > 1) {
      /* In multipolygon dispatch each plane parameter is a
       * dispatch_width-wide SIMD vector (see comment in
       * assign_urb_setup()), so we need to use offset() instead of
       * component() to select the specified parameter.
       */
      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.MOV(tmp, offset(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_UD),
                          dispatch_width, comp));
      return retype(tmp, BRW_REGISTER_TYPE_F);
   } else {
      return component(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F), comp);
   }
}

/* The register location here is relative to the start of the URB
 * data.  It will get adjusted to be a real location before
 * generate_code() time.
 */
fs_reg
fs_visitor::per_primitive_reg(const fs_builder &bld, int location, unsigned comp)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   /* Must be a per-primitive input. */
   assert(BITFIELD64_BIT(location) & nir->info.per_primitive_inputs);

   const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   comp += prog_data->urb_setup_channel[location];

   assert(prog_data->urb_setup[location] >= 0);

   const unsigned regnr = prog_data->urb_setup[location] + comp / 4;

   assert(regnr < prog_data->num_per_primitive_inputs);

   if (max_polygons > 1) {
      /* In multipolygon dispatch each primitive constant is a
       * dispatch_width-wide SIMD vector (see comment in
       * assign_urb_setup()), so we need to use offset() instead of
       * component() to select the specified parameter.
       */
      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.MOV(tmp, offset(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_UD),
                          dispatch_width, comp % 4));
      return retype(tmp, BRW_REGISTER_TYPE_F);
   } else {
      return component(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F), comp % 4);
   }
}

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gfx4()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   fs_builder abld = fs_builder(this).at_end().annotate("compute pixel centers");
   this->pixel_x = vgrf(glsl_uint_type());
   this->pixel_y = vgrf(glsl_uint_type());
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
   abld.ADD(this->pixel_x,
            fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
            fs_reg(brw_imm_v(0x10101010)));
   abld.ADD(this->pixel_y,
            fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
            fs_reg(brw_imm_v(0x11001100)));

   const fs_builder bld = fs_builder(this).at_end();
   abld = bld.annotate("compute pixel deltas from v0");

   this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] =
      vgrf(glsl_vec2_type());
   const fs_reg &delta_xy = this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL];
   const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
   const fs_reg ystart(negate(brw_vec1_grf(1, 1)));

   if (devinfo->has_pln) {
      for (unsigned i = 0; i < dispatch_width / 8; i++) {
         abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 0), i),
                             quarter(this->pixel_x, i), xstart);
         abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 1), i),
                             quarter(this->pixel_y, i), ystart);
      }
   } else {
      abld.ADD(offset(delta_xy, abld, 0), this->pixel_x, xstart);
      abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart);
   }

   this->pixel_z = fetch_payload_reg(bld, fs_payload().source_depth_reg);

   /* The SF program automatically handles doing the perspective correction or
    * not based on wm_prog_data::interp_mode[] so we can use the same pixel
    * offsets for both perspective and non-perspective.
    */
   this->delta_xy[BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL] =
      this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL];

   abld = bld.annotate("compute pos.w and 1/pos.w");
   /* Compute wpos.w.  It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = vgrf(glsl_float_type());
   abld.emit(FS_OPCODE_LINTERP, wpos_w, delta_xy,
             interp_reg(abld, VARYING_SLOT_POS, 3, 0));
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = vgrf(glsl_float_type());
   abld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
}

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gfx6()
{
   const fs_builder bld = fs_builder(this).at_end();
   fs_builder abld = bld.annotate("compute pixel centers");

   this->pixel_x = vgrf(glsl_float_type());
   this->pixel_y = vgrf(glsl_float_type());

   const struct brw_wm_prog_key *wm_key = (brw_wm_prog_key*) this->key;
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data);

   fs_reg int_sample_offset_x, int_sample_offset_y; /* Used on Gen12HP+ */
   fs_reg int_sample_offset_xy; /* Used on Gen8+ */
   fs_reg half_int_sample_offset_x, half_int_sample_offset_y;
   if (wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS) {
      /* The thread payload only delivers subspan locations (ss0, ss1,
       * ss2, ...). Since subspans covers 2x2 pixels blocks, we need to
       * generate 4 pixel coordinates out of each subspan location. We do this
       * by replicating a subspan coordinate 4 times and adding an offset of 1
       * in each direction from the initial top left (tl) location to generate
       * top right (tr = +1 in x), bottom left (bl = +1 in y) and bottom right
       * (br = +1 in x, +1 in y).
       *
       * The locations we build look like this in SIMD8 :
       *
       *    ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
       *
       * The value 0x11001010 is a vector of 8 half byte vector.
It adds + * following to generate the 4 pixels coordinates out of the subspan0: + * + * 0x + * 1 : ss0.y + 1 -> ss0.br.y + * 1 : ss0.y + 1 -> ss0.bl.y + * 0 : ss0.y + 0 -> ss0.tr.y + * 0 : ss0.y + 0 -> ss0.tl.y + * 1 : ss0.x + 1 -> ss0.br.x + * 0 : ss0.x + 0 -> ss0.bl.x + * 1 : ss0.x + 1 -> ss0.tr.x + * 0 : ss0.x + 0 -> ss0.tl.x + * + * By doing a SIMD16 add in a SIMD8 shader, we can generate the 8 pixels + * coordinates out of 2 subspans coordinates in a single ADD instruction + * (twice the operation above). + */ + int_sample_offset_xy = fs_reg(brw_imm_v(0x11001010)); + half_int_sample_offset_x = fs_reg(brw_imm_uw(0)); + half_int_sample_offset_y = fs_reg(brw_imm_uw(0)); + /* On Gfx12.5, because of regioning restrictions, the interpolation code + * is slightly different and works off X & Y only inputs. The ordering + * of the half bytes here is a bit odd, with each subspan replicated + * twice and every other element is discarded : + * + * ss0.tl ss0.tl ss0.tr ss0.tr ss0.bl ss0.bl ss0.br ss0.br + * X offset: 0 0 1 0 0 0 1 0 + * Y offset: 0 0 0 0 1 0 1 0 + */ + int_sample_offset_x = fs_reg(brw_imm_v(0x01000100)); + int_sample_offset_y = fs_reg(brw_imm_v(0x01010000)); + } + + fs_reg int_coarse_offset_x, int_coarse_offset_y; /* Used on Gen12HP+ */ + fs_reg int_coarse_offset_xy; /* Used on Gen8+ */ + fs_reg half_int_coarse_offset_x, half_int_coarse_offset_y; + if (wm_prog_data->coarse_pixel_dispatch != BRW_NEVER) { + /* In coarse pixel dispatch we have to do the same ADD instruction that + * we do in normal per pixel dispatch, except this time we're not adding + * 1 in each direction, but instead the coarse pixel size. 
+ * + * The coarse pixel size is delivered as 2 u8 in r1.0 + */ + struct brw_reg r1_0 = retype(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), BRW_REGISTER_TYPE_UB); + + const fs_builder dbld = + abld.exec_all().group(MIN2(16, dispatch_width) * 2, 0); + + if (devinfo->verx10 >= 125) { + /* To build the array of half bytes we do and AND operation with the + * right mask in X. + */ + int_coarse_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW); + dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0f000f00)); + + /* And the right mask in Y. */ + int_coarse_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW); + dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0x0f0f0000)); + } else { + /* To build the array of half bytes we do and AND operation with the + * right mask in X. + */ + int_coarse_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW); + dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0000f0f0)); + + /* And the right mask in Y. */ + int_coarse_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW); + dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0xff000000)); + + /* Finally OR the 2 registers. */ + int_coarse_offset_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW); + dbld.OR(int_coarse_offset_xy, int_coarse_offset_x, int_coarse_offset_y); + } + + /* Also compute the half coarse size used to center coarses. 
*/ + half_int_coarse_offset_x = bld.vgrf(BRW_REGISTER_TYPE_UW); + half_int_coarse_offset_y = bld.vgrf(BRW_REGISTER_TYPE_UW); + + bld.SHR(half_int_coarse_offset_x, suboffset(r1_0, 0), brw_imm_ud(1)); + bld.SHR(half_int_coarse_offset_y, suboffset(r1_0, 1), brw_imm_ud(1)); + } + + fs_reg int_pixel_offset_x, int_pixel_offset_y; /* Used on Gen12HP+ */ + fs_reg int_pixel_offset_xy; /* Used on Gen8+ */ + fs_reg half_int_pixel_offset_x, half_int_pixel_offset_y; + switch (wm_prog_data->coarse_pixel_dispatch) { + case BRW_NEVER: + int_pixel_offset_x = int_sample_offset_x; + int_pixel_offset_y = int_sample_offset_y; + int_pixel_offset_xy = int_sample_offset_xy; + half_int_pixel_offset_x = half_int_sample_offset_x; + half_int_pixel_offset_y = half_int_sample_offset_y; + break; + + case BRW_SOMETIMES: { + const fs_builder dbld = + abld.exec_all().group(MIN2(16, dispatch_width) * 2, 0); + + check_dynamic_msaa_flag(dbld, wm_prog_data, + INTEL_MSAA_FLAG_COARSE_RT_WRITES); + + int_pixel_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW); + set_predicate(BRW_PREDICATE_NORMAL, + dbld.SEL(int_pixel_offset_x, + int_coarse_offset_x, + int_sample_offset_x)); + + int_pixel_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW); + set_predicate(BRW_PREDICATE_NORMAL, + dbld.SEL(int_pixel_offset_y, + int_coarse_offset_y, + int_sample_offset_y)); + + int_pixel_offset_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW); + set_predicate(BRW_PREDICATE_NORMAL, + dbld.SEL(int_pixel_offset_xy, + int_coarse_offset_xy, + int_sample_offset_xy)); + + half_int_pixel_offset_x = bld.vgrf(BRW_REGISTER_TYPE_UW); + set_predicate(BRW_PREDICATE_NORMAL, + bld.SEL(half_int_pixel_offset_x, + half_int_coarse_offset_x, + half_int_sample_offset_x)); + + half_int_pixel_offset_y = bld.vgrf(BRW_REGISTER_TYPE_UW); + set_predicate(BRW_PREDICATE_NORMAL, + bld.SEL(half_int_pixel_offset_y, + half_int_coarse_offset_y, + half_int_sample_offset_y)); + break; + } + + case BRW_ALWAYS: + int_pixel_offset_x = int_coarse_offset_x; + int_pixel_offset_y = 
int_coarse_offset_y; + int_pixel_offset_xy = int_coarse_offset_xy; + half_int_pixel_offset_x = half_int_coarse_offset_x; + half_int_pixel_offset_y = half_int_coarse_offset_y; + break; + } + + for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) { + const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i); + /* According to the "PS Thread Payload for Normal Dispatch" + * pages on the BSpec, subspan X/Y coordinates are stored in + * R1.2-R1.5/R2.2-R2.5 on gfx6+, and on R0.10-R0.13/R1.10-R1.13 + * on gfx20+. gi_reg is the 32B section of the GRF that + * contains the subspan coordinates. + */ + const struct brw_reg gi_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) : + brw_vec1_grf(i + 1, 0); + const struct brw_reg gi_uw = retype(gi_reg, BRW_REGISTER_TYPE_UW); + + if (devinfo->verx10 >= 125) { + const fs_builder dbld = + abld.exec_all().group(hbld.dispatch_width() * 2, 0); + const fs_reg int_pixel_x = dbld.vgrf(BRW_REGISTER_TYPE_UW); + const fs_reg int_pixel_y = dbld.vgrf(BRW_REGISTER_TYPE_UW); + + dbld.ADD(int_pixel_x, + fs_reg(stride(suboffset(gi_uw, 4), 2, 8, 0)), + int_pixel_offset_x); + dbld.ADD(int_pixel_y, + fs_reg(stride(suboffset(gi_uw, 5), 2, 8, 0)), + int_pixel_offset_y); + + if (wm_prog_data->coarse_pixel_dispatch != BRW_NEVER) { + fs_inst *addx = dbld.ADD(int_pixel_x, int_pixel_x, + horiz_stride(half_int_pixel_offset_x, 0)); + fs_inst *addy = dbld.ADD(int_pixel_y, int_pixel_y, + horiz_stride(half_int_pixel_offset_y, 0)); + if (wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS) { + addx->predicate = BRW_PREDICATE_NORMAL; + addy->predicate = BRW_PREDICATE_NORMAL; + } + } + + hbld.MOV(offset(pixel_x, hbld, i), horiz_stride(int_pixel_x, 2)); + hbld.MOV(offset(pixel_y, hbld, i), horiz_stride(int_pixel_y, 2)); + + } else if (devinfo->ver >= 8 || dispatch_width == 8) { + /* The "Register Region Restrictions" page says for BDW (and newer, + * presumably): + * + * "When destination spans two registers, the source may be one or + * two registers. 
The destination elements must be evenly split + * between the two registers." + * + * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 + * to compute our pixel centers. + */ + const fs_builder dbld = + abld.exec_all().group(hbld.dispatch_width() * 2, 0); + fs_reg int_pixel_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW); + + dbld.ADD(int_pixel_xy, + fs_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)), + int_pixel_offset_xy); + + hbld.emit(FS_OPCODE_PIXEL_X, offset(pixel_x, hbld, i), int_pixel_xy, + horiz_stride(half_int_pixel_offset_x, 0)); + hbld.emit(FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy, + horiz_stride(half_int_pixel_offset_y, 0)); + } else { + /* The "Register Region Restrictions" page says for SNB, IVB, HSW: + * + * "When destination spans two registers, the source MUST span + * two registers." + * + * Since the GRF source of the ADD will only read a single register, + * we must do two separate ADDs in SIMD16. + */ + const fs_reg int_pixel_x = hbld.vgrf(BRW_REGISTER_TYPE_UW); + const fs_reg int_pixel_y = hbld.vgrf(BRW_REGISTER_TYPE_UW); + + hbld.ADD(int_pixel_x, + fs_reg(stride(suboffset(gi_uw, 4), 2, 4, 0)), + fs_reg(brw_imm_v(0x10101010))); + hbld.ADD(int_pixel_y, + fs_reg(stride(suboffset(gi_uw, 5), 2, 4, 0)), + fs_reg(brw_imm_v(0x11001100))); + + /* As of gfx6, we can no longer mix float and int sources. We have + * to turn the integer pixel centers into floats for their actual + * use. + */ + hbld.MOV(offset(pixel_x, hbld, i), int_pixel_x); + hbld.MOV(offset(pixel_y, hbld, i), int_pixel_y); + } + } + + abld = bld.annotate("compute pos.z"); + fs_reg coarse_z; + if (wm_prog_data->uses_depth_w_coefficients) { + /* In coarse pixel mode, the HW doesn't interpolate Z coordinate + * properly. In the same way we have to add the coarse pixel size to + * pixels locations, here we recompute the Z value with 2 coefficients + * in X & Y axis. 
+ */ + fs_reg coef_payload = brw_vec8_grf(fs_payload().depth_w_coef_reg, 0); + const fs_reg x_start = brw_vec1_grf(coef_payload.nr, 2); + const fs_reg y_start = brw_vec1_grf(coef_payload.nr, 6); + const fs_reg z_cx = brw_vec1_grf(coef_payload.nr, 1); + const fs_reg z_cy = brw_vec1_grf(coef_payload.nr, 0); + const fs_reg z_c0 = brw_vec1_grf(coef_payload.nr, 3); + + const fs_reg float_pixel_x = abld.vgrf(BRW_REGISTER_TYPE_F); + const fs_reg float_pixel_y = abld.vgrf(BRW_REGISTER_TYPE_F); + + abld.ADD(float_pixel_x, this->pixel_x, negate(x_start)); + abld.ADD(float_pixel_y, this->pixel_y, negate(y_start)); + + /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */ + const fs_reg u8_cps_width = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB)); + /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */ + const fs_reg u8_cps_height = byte_offset(u8_cps_width, 1); + const fs_reg u32_cps_width = abld.vgrf(BRW_REGISTER_TYPE_UD); + const fs_reg u32_cps_height = abld.vgrf(BRW_REGISTER_TYPE_UD); + abld.MOV(u32_cps_width, u8_cps_width); + abld.MOV(u32_cps_height, u8_cps_height); + + const fs_reg f_cps_width = abld.vgrf(BRW_REGISTER_TYPE_F); + const fs_reg f_cps_height = abld.vgrf(BRW_REGISTER_TYPE_F); + abld.MOV(f_cps_width, u32_cps_width); + abld.MOV(f_cps_height, u32_cps_height); + + /* Center in the middle of the coarse pixel. 
*/ + abld.MAD(float_pixel_x, float_pixel_x, brw_imm_f(0.5f), f_cps_width); + abld.MAD(float_pixel_y, float_pixel_y, brw_imm_f(0.5f), f_cps_height); + + coarse_z = abld.vgrf(BRW_REGISTER_TYPE_F); + abld.MAD(coarse_z, z_c0, z_cx, float_pixel_x); + abld.MAD(coarse_z, coarse_z, z_cy, float_pixel_y); + } + + if (wm_prog_data->uses_src_depth) + this->pixel_z = fetch_payload_reg(bld, fs_payload().source_depth_reg); + + if (wm_prog_data->uses_depth_w_coefficients || + wm_prog_data->uses_src_depth) { + fs_reg sample_z = this->pixel_z; + + switch (wm_prog_data->coarse_pixel_dispatch) { + case BRW_NEVER: + assert(wm_prog_data->uses_src_depth); + assert(!wm_prog_data->uses_depth_w_coefficients); + this->pixel_z = sample_z; + break; + + case BRW_SOMETIMES: + assert(wm_prog_data->uses_src_depth); + assert(wm_prog_data->uses_depth_w_coefficients); + this->pixel_z = abld.vgrf(BRW_REGISTER_TYPE_F); + + /* We re-use the check_dynamic_msaa_flag() call from above */ + set_predicate(BRW_PREDICATE_NORMAL, + abld.SEL(this->pixel_z, coarse_z, sample_z)); + break; + + case BRW_ALWAYS: + assert(!wm_prog_data->uses_src_depth); + assert(wm_prog_data->uses_depth_w_coefficients); + this->pixel_z = coarse_z; + break; + } + } + + if (wm_prog_data->uses_src_w) { + abld = bld.annotate("compute pos.w"); + this->pixel_w = fetch_payload_reg(abld, fs_payload().source_w_reg); + this->wpos_w = vgrf(glsl_float_type()); + abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w); + } + + if (wm_key->persample_interp == BRW_SOMETIMES) { + assert(!devinfo->needs_unlit_centroid_workaround); + + const fs_builder ubld = bld.exec_all().group(16, 0); + bool loaded_flag = false; + + for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) { + if (!(wm_prog_data->barycentric_interp_modes & BITFIELD_BIT(i))) + continue; + + /* The sample mode will always be the top bit set in the perspective + * or non-perspective section. 
In the case where no SAMPLE mode was + * requested, wm_prog_data_barycentric_modes() will swap out the top + * mode for SAMPLE so this works regardless of whether SAMPLE was + * requested or not. + */ + int sample_mode; + if (BITFIELD_BIT(i) & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) { + sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes & + BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1; + } else { + sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes & + BRW_BARYCENTRIC_PERSPECTIVE_BITS) - 1; + } + assert(wm_prog_data->barycentric_interp_modes & + BITFIELD_BIT(sample_mode)); + + if (i == sample_mode) + continue; + + uint8_t *barys = fs_payload().barycentric_coord_reg[i]; + + uint8_t *sample_barys = fs_payload().barycentric_coord_reg[sample_mode]; + assert(barys[0] && sample_barys[0]); + + if (!loaded_flag) { + check_dynamic_msaa_flag(ubld, wm_prog_data, + INTEL_MSAA_FLAG_PERSAMPLE_INTERP); + } + + for (unsigned j = 0; j < dispatch_width / 8; j++) { + set_predicate( + BRW_PREDICATE_NORMAL, + ubld.MOV(brw_vec8_grf(barys[j / 2] + (j % 2) * 2, 0), + brw_vec8_grf(sample_barys[j / 2] + (j % 2) * 2, 0))); + } + } + } + + for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) { + this->delta_xy[i] = fetch_barycentric_reg( + bld, fs_payload().barycentric_coord_reg[i]); + } + + uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes & + (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID | + 1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID); + + if (devinfo->needs_unlit_centroid_workaround && centroid_modes) { + /* Get the pixel/sample mask into f0 so that we know which + * pixels are lit. Then, for each channel that is unlit, + * replace the centroid data with non-centroid data. 
+ */ + for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) { + bld.exec_all().group(1, 0) + .MOV(retype(brw_flag_reg(0, i), BRW_REGISTER_TYPE_UW), + retype(brw_vec1_grf(1 + i, 7), BRW_REGISTER_TYPE_UW)); + } + + for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) { + if (!(centroid_modes & (1 << i))) + continue; + + const fs_reg centroid_delta_xy = delta_xy[i]; + const fs_reg &pixel_delta_xy = delta_xy[i - 1]; + + delta_xy[i] = bld.vgrf(BRW_REGISTER_TYPE_F, 2); + + for (unsigned c = 0; c < 2; c++) { + for (unsigned q = 0; q < dispatch_width / 8; q++) { + set_predicate(BRW_PREDICATE_NORMAL, + bld.quarter(q).SEL( + quarter(offset(delta_xy[i], bld, c), q), + quarter(offset(centroid_delta_xy, bld, c), q), + quarter(offset(pixel_delta_xy, bld, c), q))); + } + } + } + } +} + +static enum brw_conditional_mod +cond_for_alpha_func(enum compare_func func) +{ + switch(func) { + case COMPARE_FUNC_GREATER: + return BRW_CONDITIONAL_G; + case COMPARE_FUNC_GEQUAL: + return BRW_CONDITIONAL_GE; + case COMPARE_FUNC_LESS: + return BRW_CONDITIONAL_L; + case COMPARE_FUNC_LEQUAL: + return BRW_CONDITIONAL_LE; + case COMPARE_FUNC_EQUAL: + return BRW_CONDITIONAL_EQ; + case COMPARE_FUNC_NOTEQUAL: + return BRW_CONDITIONAL_NEQ; + default: + unreachable("Not reached"); + } +} + +/** + * Alpha test support for when we compile it into the shader instead + * of using the normal fixed-function alpha test. 
+ */ +void +fs_visitor::emit_alpha_test() +{ + assert(stage == MESA_SHADER_FRAGMENT); + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + const fs_builder bld = fs_builder(this).at_end(); + const fs_builder abld = bld.annotate("Alpha test"); + + fs_inst *cmp; + if (key->alpha_test_func == COMPARE_FUNC_ALWAYS) + return; + + if (key->alpha_test_func == COMPARE_FUNC_NEVER) { + /* f0.1 = 0 */ + fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), + BRW_REGISTER_TYPE_UW)); + cmp = abld.CMP(bld.null_reg_f(), some_reg, some_reg, + BRW_CONDITIONAL_NEQ); + } else { + /* RT0 alpha */ + fs_reg color = offset(outputs[0], bld, 3); + + /* f0.1 &= func(color, ref) */ + cmp = abld.CMP(bld.null_reg_f(), color, brw_imm_f(key->alpha_test_ref), + cond_for_alpha_func(key->alpha_test_func)); + } + cmp->predicate = BRW_PREDICATE_NORMAL; + cmp->flag_subreg = 1; +} + +fs_inst * +fs_visitor::emit_single_fb_write(const fs_builder &bld, + fs_reg color0, fs_reg color1, + fs_reg src0_alpha, unsigned components) +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + + /* Hand over gl_FragDepth or the payload depth. */ + const fs_reg dst_depth = fetch_payload_reg(bld, fs_payload().dest_depth_reg); + fs_reg src_depth, src_stencil; + + if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { + src_depth = frag_depth; + } else if (source_depth_to_render_target) { + /* If we got here, we're in one of those strange Gen4-5 cases where + * we're forced to pass the source depth, unmodified, to the FB write. + * In this case, we don't want to use pixel_z because we may not have + * set up interpolation. It's also perfectly safe because it only + * happens on old hardware (no coarse interpolation) and this is + * explicitly the pass-through case. 
+ */ + assert(devinfo->ver <= 5); + src_depth = fetch_payload_reg(bld, fs_payload().source_depth_reg); + } + + if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) + src_stencil = frag_stencil; + + const fs_reg sources[] = { + color0, color1, src0_alpha, src_depth, dst_depth, src_stencil, + (prog_data->uses_omask ? sample_mask : fs_reg()), + brw_imm_ud(components) + }; + assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS); + fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(), + sources, ARRAY_SIZE(sources)); + + if (prog_data->uses_kill) { + write->predicate = BRW_PREDICATE_NORMAL; + write->flag_subreg = sample_mask_flag_subreg(*this); + } + + return write; +} + +void +fs_visitor::do_emit_fb_writes(int nr_color_regions, bool replicate_alpha) +{ + const fs_builder bld = fs_builder(this).at_end(); + fs_inst *inst = NULL; + + for (int target = 0; target < nr_color_regions; target++) { + /* Skip over outputs that weren't written. */ + if (this->outputs[target].file == BAD_FILE) + continue; + + const fs_builder abld = bld.annotate( + ralloc_asprintf(this->mem_ctx, "FB write target %d", target)); + + fs_reg src0_alpha; + if (devinfo->ver >= 6 && replicate_alpha && target != 0) + src0_alpha = offset(outputs[0], bld, 3); + + inst = emit_single_fb_write(abld, this->outputs[target], + this->dual_src_output, src0_alpha, 4); + inst->target = target; + } + + if (inst == NULL) { + /* Even if there's no color buffers enabled, we still need to send + * alpha out the pipeline to our null renderbuffer to support + * alpha-testing, alpha-to-coverage, and so on. + */ + /* FINISHME: Factor out this frequently recurring pattern into a + * helper function. 
+ */ + const fs_reg srcs[] = { reg_undef, reg_undef, + reg_undef, offset(this->outputs[0], bld, 3) }; + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); + bld.LOAD_PAYLOAD(tmp, srcs, 4, 0); + + inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4); + inst->target = 0; + } + + inst->last_rt = true; + inst->eot = true; +} + +void +fs_visitor::emit_fb_writes() +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + + if (source_depth_to_render_target && devinfo->ver == 6) { + /* For outputting oDepth on gfx6, SIMD8 writes have to be used. This + * would require SIMD8 moves of each half to message regs, e.g. by using + * the SIMD lowering pass. Unfortunately this is more difficult than it + * sounds because the SIMD8 single-source message lacks channel selects + * for the second and third subspans. + */ + limit_dispatch_width(8, "Depth writes unsupported in SIMD16+ mode.\n"); + } + + if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) { + /* From the 'Render Target Write message' section of the docs: + * "Output Stencil is not supported with SIMD16 Render Target Write + * Messages." + */ + limit_dispatch_width(8, "gl_FragStencilRefARB unsupported " + "in SIMD16+ mode.\n"); + } + + /* ANV doesn't know about sample mask output during the wm key creation + * so we compute if we need replicate alpha and emit alpha to coverage + * workaround here. 
+ */ + const bool replicate_alpha = key->alpha_test_replicate_alpha || + (key->nr_color_regions > 1 && key->alpha_to_coverage && + (sample_mask.file == BAD_FILE || devinfo->ver == 6)); + + prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE && + this->outputs[0].file != BAD_FILE); + assert(!prog_data->dual_src_blend || key->nr_color_regions == 1); + + /* Following condition implements Wa_14017468336: + * + * "If dual source blend is enabled do not enable SIMD32 dispatch" and + * "For a thread dispatched as SIMD32, must not issue SIMD8 message with Last + * Render Target Select set." + */ + if (devinfo->ver >= 11 && devinfo->ver <= 12 && + prog_data->dual_src_blend) { + /* The dual-source RT write messages fail to release the thread + * dependency on ICL and TGL with SIMD32 dispatch, leading to hangs. + * + * XXX - Emit an extra single-source NULL RT-write marked LastRT in + * order to release the thread dependency without disabling + * SIMD32. + * + * The dual-source RT write messages may lead to hangs with SIMD16 + * dispatch on ICL due some unknown reasons, see + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/2183 + */ + limit_dispatch_width(8, "Dual source blending unsupported " + "in SIMD16 and SIMD32 modes.\n"); + } + + do_emit_fb_writes(key->nr_color_regions, replicate_alpha); +} + +void +fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count) +{ + int slot, urb_offset, length; + int starting_urb_offset = 0; + const struct brw_vue_prog_data *vue_prog_data = + brw_vue_prog_data(this->prog_data); + const struct brw_vs_prog_key *vs_key = + (const struct brw_vs_prog_key *) this->key; + const GLbitfield64 psiz_mask = + VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ | VARYING_BIT_PRIMITIVE_SHADING_RATE; + const struct intel_vue_map *vue_map = &vue_prog_data->vue_map; + bool flush; + fs_reg sources[8]; + fs_reg urb_handle; + + switch (stage) { + case MESA_SHADER_VERTEX: + urb_handle = vs_payload().urb_handles; + break; + case 
MESA_SHADER_TESS_EVAL: + urb_handle = tes_payload().urb_output; + break; + case MESA_SHADER_GEOMETRY: + urb_handle = gs_payload().urb_handles; + break; + default: + unreachable("invalid stage"); + } + + const fs_builder bld = fs_builder(this).at_end(); + + fs_reg per_slot_offsets; + + if (stage == MESA_SHADER_GEOMETRY) { + const struct brw_gs_prog_data *gs_prog_data = + brw_gs_prog_data(this->prog_data); + + /* We need to increment the Global Offset to skip over the control data + * header and the extra "Vertex Count" field (1 HWord) at the beginning + * of the VUE. We're counting in OWords, so the units are doubled. + */ + starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords; + if (gs_prog_data->static_vertex_count == -1) + starting_urb_offset += 2; + + /* The URB offset is in 128-bit units, so we need to multiply by 2 */ + const int output_vertex_size_owords = + gs_prog_data->output_vertex_size_hwords * 2; + + if (gs_vertex_count.file == IMM) { + per_slot_offsets = brw_imm_ud(output_vertex_size_owords * + gs_vertex_count.ud); + } else { + per_slot_offsets = vgrf(glsl_uint_type()); + bld.MUL(per_slot_offsets, gs_vertex_count, + brw_imm_ud(output_vertex_size_owords)); + } + } + + length = 0; + urb_offset = starting_urb_offset; + flush = false; + + /* SSO shaders can have VUE slots allocated which are never actually + * written to, so ignore them when looking for the last (written) slot. + */ + int last_slot = vue_map->num_slots - 1; + while (last_slot > 0 && + (vue_map->slot_to_varying[last_slot] == BRW_VARYING_SLOT_PAD || + outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) { + last_slot--; + } + + bool urb_written = false; + for (slot = 0; slot < vue_map->num_slots; slot++) { + int varying = vue_map->slot_to_varying[slot]; + switch (varying) { + case VARYING_SLOT_PSIZ: { + /* The point size varying slot is the vue header and is always in the + * vue map. 
But often none of the special varyings that live there + * are written and in that case we can skip writing to the vue + * header, provided the corresponding state properly clamps the + * values further down the pipeline. */ + if ((vue_map->slots_valid & psiz_mask) == 0) { + assert(length == 0); + urb_offset++; + break; + } + + fs_reg zero(VGRF, alloc.allocate(dispatch_width / 8), + BRW_REGISTER_TYPE_UD); + bld.MOV(zero, brw_imm_ud(0u)); + + if (vue_map->slots_valid & VARYING_BIT_PRIMITIVE_SHADING_RATE && + this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE].file != BAD_FILE) { + sources[length++] = this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE]; + } else if (devinfo->has_coarse_pixel_primitive_and_cb) { + uint32_t one_fp16 = 0x3C00; + fs_reg one_by_one_fp16(VGRF, alloc.allocate(dispatch_width / 8), + BRW_REGISTER_TYPE_UD); + bld.MOV(one_by_one_fp16, brw_imm_ud((one_fp16 << 16) | one_fp16)); + sources[length++] = one_by_one_fp16; + } else { + sources[length++] = zero; + } + + if (vue_map->slots_valid & VARYING_BIT_LAYER) + sources[length++] = this->outputs[VARYING_SLOT_LAYER]; + else + sources[length++] = zero; + + if (vue_map->slots_valid & VARYING_BIT_VIEWPORT) + sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT]; + else + sources[length++] = zero; + + if (vue_map->slots_valid & VARYING_BIT_PSIZ) + sources[length++] = this->outputs[VARYING_SLOT_PSIZ]; + else + sources[length++] = zero; + break; + } + case BRW_VARYING_SLOT_NDC: + case VARYING_SLOT_EDGE: + unreachable("unexpected scalar vs output"); + break; + + default: + /* gl_Position is always in the vue map, but isn't always written by + * the shader. Other varyings (clip distances) get added to the vue + * map but don't always get written. In those cases, the + * corresponding this->output[] slot will be invalid we and can skip + * the urb write for the varying. If we've already queued up a vue + * slot for writing we flush a mlen 5 urb write, otherwise we just + * advance the urb_offset. 
+ */ + if (varying == BRW_VARYING_SLOT_PAD || + this->outputs[varying].file == BAD_FILE) { + if (length > 0) + flush = true; + else + urb_offset++; + break; + } + + if (stage == MESA_SHADER_VERTEX && vs_key->clamp_vertex_color && + (varying == VARYING_SLOT_COL0 || + varying == VARYING_SLOT_COL1 || + varying == VARYING_SLOT_BFC0 || + varying == VARYING_SLOT_BFC1)) { + /* We need to clamp these guys, so do a saturating MOV into a + * temp register and use that for the payload. + */ + for (int i = 0; i < 4; i++) { + fs_reg reg = fs_reg(VGRF, alloc.allocate(dispatch_width / 8), + outputs[varying].type); + fs_reg src = offset(this->outputs[varying], bld, i); + set_saturate(true, bld.MOV(reg, src)); + sources[length++] = reg; + } + } else { + int slot_offset = 0; + + /* When using Primitive Replication, there may be multiple slots + * assigned to POS. + */ + if (varying == VARYING_SLOT_POS) + slot_offset = slot - vue_map->varying_to_slot[VARYING_SLOT_POS]; + + for (unsigned i = 0; i < 4; i++) { + sources[length++] = offset(this->outputs[varying], bld, + i + (slot_offset * 4)); + } + } + break; + } + + const fs_builder abld = bld.annotate("URB write"); + + /* If we've queued up 8 registers of payload (2 VUE slots), if this is + * the last slot or if we need to flush (see BAD_FILE varying case + * above), emit a URB write send now to flush out the data. 
+ */ + if (length == 8 || (length > 0 && slot == last_slot)) + flush = true; + if (flush) { + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets; + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, + alloc.allocate((dispatch_width / 8) * length), + BRW_REGISTER_TYPE_F); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length); + abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0); + + fs_inst *inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, + srcs, ARRAY_SIZE(srcs)); + + /* For ICL Wa_1805992985 one needs additional write in the end. */ + if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) + inst->eot = false; + else + inst->eot = slot == last_slot && stage != MESA_SHADER_GEOMETRY; + + inst->offset = urb_offset; + urb_offset = starting_urb_offset + slot + 1; + length = 0; + flush = false; + urb_written = true; + } + } + + /* If we don't have any valid slots to write, just do a minimal urb write + * send to terminate the shader. This includes 1 slot of undefined data, + * because it's invalid to write 0 data: + * + * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions - + * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read > + * Write Data Payload: + * + * "The write data payload can be between 1 and 8 message phases long." + */ + if (!urb_written) { + /* For GS, just turn EmitVertex() into a no-op. We don't want it to + * end the thread, and emit_gs_thread_end() already emits a SEND with + * EOT at the end of the program for us. 
+ */ + if (stage == MESA_SHADER_GEOMETRY) + return; + + fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(dispatch_width / 8), + BRW_REGISTER_TYPE_UD); + fs_reg payload = fs_reg(VGRF, alloc.allocate(dispatch_width / 8), + BRW_REGISTER_TYPE_UD); + + bld.exec_all().MOV(uniform_urb_handle, urb_handle); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle; + srcs[URB_LOGICAL_SRC_DATA] = payload; + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1); + + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, + srcs, ARRAY_SIZE(srcs)); + inst->eot = true; + inst->offset = 1; + return; + } + + /* ICL Wa_1805992985: + * + * ICLLP GPU hangs on one of tessellation vkcts tests with DS not done. The + * send cycle, which is a urb write with an eot must be 4 phases long and + * all 8 lanes must valid. + */ + if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) { + assert(dispatch_width == 8); + fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + fs_reg uniform_mask = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + fs_reg payload = fs_reg(VGRF, alloc.allocate(4), BRW_REGISTER_TYPE_UD); + + /* Workaround requires all 8 channels (lanes) to be valid. This is + * understood to mean they all need to be alive. First trick is to find + * a live channel and copy its urb handle for all the other channels to + * make sure all handles are valid. + */ + bld.exec_all().MOV(uniform_urb_handle, bld.emit_uniformize(urb_handle)); + + /* Second trick is to use masked URB write where one can tell the HW to + * actually write data only for selected channels even though all are + * active. + * Third trick is to take advantage of the must-be-zero (MBZ) area in + * the very beginning of the URB. + * + * One masks data to be written only for the first channel and uses + * offset zero explicitly to land data to the MBZ area avoiding trashing + * any other part of the URB. 
+ * + * Since the WA says that the write needs to be 4 phases long one uses + * 4 slots data. All are explicitly zeros in order to to keep the MBZ + * area written as zeros. + */ + bld.exec_all().MOV(uniform_mask, brw_imm_ud(0x10000u)); + bld.exec_all().MOV(offset(payload, bld, 0), brw_imm_ud(0u)); + bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0u)); + bld.exec_all().MOV(offset(payload, bld, 2), brw_imm_ud(0u)); + bld.exec_all().MOV(offset(payload, bld, 3), brw_imm_ud(0u)); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = uniform_mask; + srcs[URB_LOGICAL_SRC_DATA] = payload; + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(4); + + fs_inst *inst = bld.exec_all().emit(SHADER_OPCODE_URB_WRITE_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + inst->eot = true; + inst->offset = 0; + } +} + +void +fs_visitor::emit_urb_fence() +{ + const fs_builder bld = fs_builder(this).at_end(); + fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD); + fs_inst *fence = bld.emit(SHADER_OPCODE_MEMORY_FENCE, dst, + brw_vec8_grf(0, 0), + brw_imm_ud(true), + brw_imm_ud(0)); + fence->sfid = BRW_SFID_URB; + fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_LOCAL, + LSC_FLUSH_TYPE_NONE, true); + + bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE, + bld.null_reg_ud(), + &dst, + 1); +} + +void +fs_visitor::emit_cs_terminate() +{ + assert(devinfo->ver >= 7); + const fs_builder bld = fs_builder(this).at_end(); + + /* We can't directly send from g0, since sends with EOT have to use + * g112-127. So, copy it to a virtual register, The register allocator will + * make sure it uses the appropriate register range. + */ + struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD); + fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + bld.group(8, 0).exec_all().MOV(payload, g0); + + /* Send a message to the thread spawner to terminate the thread. 
*/ + fs_inst *inst = bld.exec_all() + .emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload); + inst->eot = true; +} + +fs_visitor::fs_visitor(const struct brw_compiler *compiler, + const struct brw_compile_params *params, + const brw_base_prog_key *key, + struct brw_stage_prog_data *prog_data, + const nir_shader *shader, + unsigned dispatch_width, + bool needs_register_pressure, + bool debug_enabled) + : backend_shader(compiler, params, shader, prog_data, debug_enabled), + key(key), gs_compile(NULL), prog_data(prog_data), + live_analysis(this), regpressure_analysis(this), + performance_analysis(this), + needs_register_pressure(needs_register_pressure), + dispatch_width(dispatch_width), + max_polygons(0), + api_subgroup_size(brw_nir_api_subgroup_size(shader, dispatch_width)) +{ + init(); +} + +fs_visitor::fs_visitor(const struct brw_compiler *compiler, + const struct brw_compile_params *params, + const brw_wm_prog_key *key, + struct brw_wm_prog_data *prog_data, + const nir_shader *shader, + unsigned dispatch_width, unsigned max_polygons, + bool needs_register_pressure, + bool debug_enabled) + : backend_shader(compiler, params, shader, &prog_data->base, + debug_enabled), + key(&key->base), gs_compile(NULL), prog_data(&prog_data->base), + live_analysis(this), regpressure_analysis(this), + performance_analysis(this), + needs_register_pressure(needs_register_pressure), + dispatch_width(dispatch_width), + max_polygons(max_polygons), + api_subgroup_size(brw_nir_api_subgroup_size(shader, dispatch_width)) +{ + init(); + assert(api_subgroup_size == 0 || + api_subgroup_size == 8 || + api_subgroup_size == 16 || + api_subgroup_size == 32); +} + +fs_visitor::fs_visitor(const struct brw_compiler *compiler, + const struct brw_compile_params *params, + struct brw_gs_compile *c, + struct brw_gs_prog_data *prog_data, + const nir_shader *shader, + bool needs_register_pressure, + bool debug_enabled) + : backend_shader(compiler, params, shader, &prog_data->base.base, + debug_enabled), + 
key(&c->key.base), gs_compile(c), + prog_data(&prog_data->base.base), + live_analysis(this), regpressure_analysis(this), + performance_analysis(this), + needs_register_pressure(needs_register_pressure), + dispatch_width(compiler->devinfo->ver >= 20 ? 16 : 8), + max_polygons(0), + api_subgroup_size(brw_nir_api_subgroup_size(shader, dispatch_width)) +{ + init(); + assert(api_subgroup_size == 0 || + api_subgroup_size == 8 || + api_subgroup_size == 16 || + api_subgroup_size == 32); +} + +void +fs_visitor::init() +{ + if (key) + this->key_tex = &key->tex; + else + this->key_tex = NULL; + + this->max_dispatch_width = 32; + this->prog_data = this->stage_prog_data; + + this->failed = false; + this->fail_msg = NULL; + + this->payload_ = NULL; + this->source_depth_to_render_target = false; + this->runtime_check_aads_emit = false; + this->first_non_payload_grf = 0; + this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : BRW_MAX_GRF; + + this->uniforms = 0; + this->last_scratch = 0; + this->push_constant_loc = NULL; + + memset(&this->shader_stats, 0, sizeof(this->shader_stats)); + + this->grf_used = 0; + this->spilled_any_registers = false; +} + +fs_visitor::~fs_visitor() +{ + delete this->payload_; +} diff --git a/src/intel/compiler/elk/brw_gram.y b/src/intel/compiler/elk/brw_gram.y new file mode 100644 index 00000000000..a32b2bffb0c --- /dev/null +++ b/src/intel/compiler/elk/brw_gram.y @@ -0,0 +1,2566 @@ +%{ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice 
(including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include "brw_asm.h" + +#undef yyerror +#ifdef YYBYACC +struct YYLTYPE; +void yyerror (struct YYLTYPE *, char *); +#else +void yyerror (char *); +#endif + +#undef ALIGN16 + +#define YYLTYPE YYLTYPE +typedef struct YYLTYPE +{ + int first_line; + int first_column; + int last_line; + int last_column; +} YYLTYPE; + +enum message_level { + WARN, + ERROR, +}; + +int yydebug = 1; + +static void +message(enum message_level level, YYLTYPE *location, + const char *fmt, ...) +{ + static const char *level_str[] = { "warning", "error" }; + va_list args; + + if (location) + fprintf(stderr, "%s:%d:%d: %s: ", input_filename, + location->first_line, + location->first_column, level_str[level]); + else + fprintf(stderr, "%s:%s: ", input_filename, level_str[level]); + + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); +} + +#define warn(flag, l, fmt, ...) \ + do { \ + if (warning_flags & WARN_ ## flag) \ + message(WARN, l, fmt, ## __VA_ARGS__); \ + } while (0) + +#define error(l, fmt, ...) 
\ + do { \ + message(ERROR, l, fmt, ## __VA_ARGS__); \ + } while (0) + +static bool +isPowerofTwo(unsigned int x) +{ + return x && (!(x & (x - 1))); +} + +static struct brw_reg +set_direct_src_operand(struct brw_reg *reg, int type) +{ + return brw_reg(reg->file, + reg->nr, + reg->subnr, + 0, // negate + 0, // abs + type, + 0, // vstride + 0, // width + 0, // hstride + BRW_SWIZZLE_NOOP, + WRITEMASK_XYZW); +} + +static void +i965_asm_unary_instruction(int opcode, struct brw_codegen *p, + struct brw_reg dest, struct brw_reg src0) +{ + switch (opcode) { + case BRW_OPCODE_BFREV: + brw_BFREV(p, dest, src0); + break; + case BRW_OPCODE_CBIT: + brw_CBIT(p, dest, src0); + break; + case BRW_OPCODE_F32TO16: + brw_F32TO16(p, dest, src0); + break; + case BRW_OPCODE_F16TO32: + brw_F16TO32(p, dest, src0); + break; + case BRW_OPCODE_MOV: + brw_MOV(p, dest, src0); + break; + case BRW_OPCODE_FBL: + brw_FBL(p, dest, src0); + break; + case BRW_OPCODE_FRC: + brw_FRC(p, dest, src0); + break; + case BRW_OPCODE_FBH: + brw_FBH(p, dest, src0); + break; + case BRW_OPCODE_NOT: + brw_NOT(p, dest, src0); + break; + case BRW_OPCODE_RNDE: + brw_RNDE(p, dest, src0); + break; + case BRW_OPCODE_RNDZ: + brw_RNDZ(p, dest, src0); + break; + case BRW_OPCODE_RNDD: + brw_RNDD(p, dest, src0); + break; + case BRW_OPCODE_LZD: + brw_LZD(p, dest, src0); + break; + case BRW_OPCODE_DIM: + brw_DIM(p, dest, src0); + break; + case BRW_OPCODE_RNDU: + fprintf(stderr, "Opcode BRW_OPCODE_RNDU unhandled\n"); + break; + default: + fprintf(stderr, "Unsupported unary opcode\n"); + } +} + +static void +i965_asm_binary_instruction(int opcode, + struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg src0, + struct brw_reg src1) +{ + switch (opcode) { + case BRW_OPCODE_ADDC: + brw_ADDC(p, dest, src0, src1); + break; + case BRW_OPCODE_BFI1: + brw_BFI1(p, dest, src0, src1); + break; + case BRW_OPCODE_DP2: + brw_DP2(p, dest, src0, src1); + break; + case BRW_OPCODE_DP3: + brw_DP3(p, dest, src0, src1); + break; + case 
BRW_OPCODE_DP4: + brw_DP4(p, dest, src0, src1); + break; + case BRW_OPCODE_DPH: + brw_DPH(p, dest, src0, src1); + break; + case BRW_OPCODE_LINE: + brw_LINE(p, dest, src0, src1); + break; + case BRW_OPCODE_MAC: + brw_MAC(p, dest, src0, src1); + break; + case BRW_OPCODE_MACH: + brw_MACH(p, dest, src0, src1); + break; + case BRW_OPCODE_PLN: + brw_PLN(p, dest, src0, src1); + break; + case BRW_OPCODE_ROL: + brw_ROL(p, dest, src0, src1); + break; + case BRW_OPCODE_ROR: + brw_ROR(p, dest, src0, src1); + break; + case BRW_OPCODE_SAD2: + fprintf(stderr, "Opcode BRW_OPCODE_SAD2 unhandled\n"); + break; + case BRW_OPCODE_SADA2: + fprintf(stderr, "Opcode BRW_OPCODE_SADA2 unhandled\n"); + break; + case BRW_OPCODE_SUBB: + brw_SUBB(p, dest, src0, src1); + break; + case BRW_OPCODE_ADD: + brw_ADD(p, dest, src0, src1); + break; + case BRW_OPCODE_CMP: + /* Third parameter is conditional modifier + * which gets updated later + */ + brw_CMP(p, dest, 0, src0, src1); + break; + case BRW_OPCODE_AND: + brw_AND(p, dest, src0, src1); + break; + case BRW_OPCODE_ASR: + brw_ASR(p, dest, src0, src1); + break; + case BRW_OPCODE_AVG: + brw_AVG(p, dest, src0, src1); + break; + case BRW_OPCODE_OR: + brw_OR(p, dest, src0, src1); + break; + case BRW_OPCODE_SEL: + brw_SEL(p, dest, src0, src1); + break; + case BRW_OPCODE_SHL: + brw_SHL(p, dest, src0, src1); + break; + case BRW_OPCODE_SHR: + brw_SHR(p, dest, src0, src1); + break; + case BRW_OPCODE_XOR: + brw_XOR(p, dest, src0, src1); + break; + case BRW_OPCODE_MUL: + brw_MUL(p, dest, src0, src1); + break; + default: + fprintf(stderr, "Unsupported binary opcode\n"); + } +} + +static void +i965_asm_ternary_instruction(int opcode, + struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg src0, + struct brw_reg src1, + struct brw_reg src2) +{ + switch (opcode) { + case BRW_OPCODE_MAD: + brw_MAD(p, dest, src0, src1, src2); + break; + case BRW_OPCODE_CSEL: + brw_CSEL(p, dest, src0, src1, src2); + break; + case BRW_OPCODE_LRP: + brw_LRP(p, dest, src0, 
src1, src2); + break; + case BRW_OPCODE_BFE: + brw_BFE(p, dest, src0, src1, src2); + break; + case BRW_OPCODE_BFI2: + brw_BFI2(p, dest, src0, src1, src2); + break; + case BRW_OPCODE_DP4A: + brw_DP4A(p, dest, src0, src1, src2); + break; + case BRW_OPCODE_ADD3: + brw_ADD3(p, dest, src0, src1, src2); + break; + default: + fprintf(stderr, "Unsupported ternary opcode\n"); + } +} + +static void +i965_asm_set_instruction_options(struct brw_codegen *p, + struct options options) +{ + brw_inst_set_access_mode(p->devinfo, brw_last_inst, + options.access_mode); + brw_inst_set_mask_control(p->devinfo, brw_last_inst, + options.mask_control); + if (p->devinfo->ver < 12) { + brw_inst_set_thread_control(p->devinfo, brw_last_inst, + options.thread_control); + brw_inst_set_no_dd_check(p->devinfo, brw_last_inst, + options.no_dd_check); + brw_inst_set_no_dd_clear(p->devinfo, brw_last_inst, + options.no_dd_clear); + } else { + brw_inst_set_swsb(p->devinfo, brw_last_inst, + tgl_swsb_encode(p->devinfo, options.depinfo)); + } + brw_inst_set_debug_control(p->devinfo, brw_last_inst, + options.debug_control); + if (p->devinfo->ver >= 6) + brw_inst_set_acc_wr_control(p->devinfo, brw_last_inst, + options.acc_wr_control); + brw_inst_set_cmpt_control(p->devinfo, brw_last_inst, + options.compaction); +} + +static void +i965_asm_set_dst_nr(struct brw_codegen *p, + struct brw_reg *reg, + struct options options) +{ + if (p->devinfo->ver <= 6) { + if (reg->file == BRW_MESSAGE_REGISTER_FILE && + options.qtr_ctrl == BRW_COMPRESSION_COMPRESSED && + !options.is_compr) + reg->nr |= BRW_MRF_COMPR4; + } +} + +static void +add_label(struct brw_codegen *p, const char* label_name, enum instr_label_type type) +{ + if (!label_name) { + return; + } + + struct instr_label *label = rzalloc(p->mem_ctx, struct instr_label); + + label->name = ralloc_strdup(p->mem_ctx, label_name); + label->offset = p->next_insn_offset; + label->type = type; + + list_addtail(&label->link, &instr_labels); +} + +%} + +%locations + +%start 
ROOT + +%union { + char *string; + double number; + int integer; + unsigned long long int llint; + struct brw_reg reg; + enum brw_reg_type reg_type; + struct brw_codegen *program; + struct predicate predicate; + struct condition condition; + struct options options; + struct instoption instoption; + struct msgdesc msgdesc; + struct tgl_swsb depinfo; + brw_inst *instruction; +} + +%token ABS +%token COLON +%token COMMA +%token DOT +%token LANGLE RANGLE +%token LCURLY RCURLY +%token LPAREN RPAREN +%token LSQUARE RSQUARE +%token PLUS MINUS +%token SEMICOLON +%token ASSIGN + +/* datatypes */ +%token TYPE_B TYPE_UB +%token TYPE_W TYPE_UW +%token TYPE_D TYPE_UD +%token TYPE_Q TYPE_UQ +%token TYPE_V TYPE_UV +%token TYPE_F TYPE_HF +%token TYPE_DF TYPE_NF +%token TYPE_VF + +/* label */ +%token JUMP_LABEL +%token JUMP_LABEL_TARGET + +/* opcodes */ +%token ADD ADD3 ADDC AND ASR AVG +%token BFE BFI1 BFI2 BFB BFREV BRC BRD BREAK +%token CALL CALLA CASE CBIT CMP CMPN CONT CSEL +%token DIM DO DPAS DPASW DP2 DP3 DP4 DP4A DPH +%token ELSE ENDIF F16TO32 F32TO16 FBH FBL FORK FRC +%token GOTO +%token HALT +%token IF IFF ILLEGAL +%token JMPI JOIN +%token LINE LRP LZD +%token MAC MACH MAD MADM MOV MOVI MUL MREST MSAVE +%token NENOP NOP NOT +%token OR +%token PLN POP PUSH +%token RET RNDD RNDE RNDU RNDZ ROL ROR +%token SAD2 SADA2 SEL SENDS SENDSC SHL SHR SMOV SUBB SYNC +%token SEND_GFX4 SENDC_GFX4 SEND_GFX12 SENDC_GFX12 +%token WAIT WHILE +%token XOR + +/* extended math functions */ +%token COS EXP FDIV INV INVM INTDIV INTDIVMOD INTMOD LOG POW RSQ +%token RSQRTM SIN SINCOS SQRT + +/* sync instruction */ +%token ALLRD ALLWR FENCE BAR HOST +%type sync_function +%type sync_arg + +/* shared functions for send */ +%token CONST CRE DATA DP_DATA_1 GATEWAY MATH PIXEL_INTERP READ RENDER SAMPLER +%token THREAD_SPAWNER URB VME WRITE DP_SAMPLER RT_ACCEL SLM TGM UGM + +/* message details for send */ +%token MSGDESC_BEGIN SRC1_LEN EX_BSO MSGDESC_END +%type msgdesc msgdesc_parts; + +/* Conditional 
modifiers */ +%token EQUAL GREATER GREATER_EQUAL LESS LESS_EQUAL NOT_EQUAL +%token NOT_ZERO OVERFLOW UNORDERED ZERO + +/* register Access Modes */ +%token ALIGN1 ALIGN16 + +/* accumulator write control */ +%token ACCWREN + +/* compaction control */ +%token CMPTCTRL + +/* compression control */ +%token COMPR COMPR4 SECHALF + +/* mask control (WeCtrl) */ +%token WECTRL + +/* debug control */ +%token BREAKPOINT + +/* dependency control */ +%token NODDCLR NODDCHK + +/* end of thread */ +%token EOT + +/* mask control */ +%token MASK_DISABLE; + +/* predicate control */ +%token ANYV ALLV ANY2H ALL2H ANY4H ALL4H ANY8H ALL8H ANY16H ALL16H +%token ANY32H ALL32H + +/* round instructions */ +%token ROUND_INCREMENT + +/* saturation */ +%token SATURATE + +/* thread control */ +%token ATOMIC SWITCH + +/* quarter control */ +%token QTR_2Q QTR_3Q QTR_4Q QTR_2H QTR_2N QTR_3N QTR_4N QTR_5N +%token QTR_6N QTR_7N QTR_8N + +/* channels */ +%token X Y Z W + +/* reg files */ +%token GENREGFILE MSGREGFILE + +/* vertical stride in register region */ +%token VxH + +/* register type */ +%token GENREG MSGREG ADDRREG ACCREG FLAGREG NOTIFYREG STATEREG +%token CONTROLREG IPREG PERFORMANCEREG THREADREG CHANNELENABLEREG +%token MASKREG + +%token INTEGER +%token LONG +%token NULL_TOKEN + +%nonassoc SUBREGNUM +%left PLUS MINUS +%nonassoc DOT +%nonassoc EMPTYEXECSIZE +%nonassoc LPAREN + +%type execsize simple_int exp +%type exp2 + +/* predicate control */ +%type predctrl predstate +%type predicate + +/* conditional modifier */ +%type cond_mod +%type condModifiers + +/* instruction options */ +%type instoptions instoption_list +%type instoption + +/* writemask */ +%type writemask_x writemask_y writemask_z writemask_w +%type writemask + +/* dst operand */ +%type dst dstoperand dstoperandex dstoperandex_typed dstreg +%type dstregion + +%type saturate relativelocation rellocation +%type relativelocation2 + +/* src operand */ +%type directsrcoperand directsrcaccoperand indirectsrcoperand srcacc +%type 
srcarcoperandex srcaccimm srcarcoperandex_typed srcimm +%type indirectgenreg indirectregion +%type immreg src reg32 payload directgenreg_list addrparam region +%type region_wh directgenreg directmsgreg indirectmsgreg +%type desc ex_desc reg32a +%type swizzle + +/* registers */ +%type accreg addrreg channelenablereg controlreg flagreg ipreg +%type notifyreg nullreg performancereg threadcontrolreg statereg maskreg +%type subregnum + +/* register types */ +%type reg_type imm_type + +/* immediate values */ +%type immval + +/* instruction opcodes */ +%type unaryopcodes binaryopcodes binaryaccopcodes ternaryopcodes +%type sendop sendsop +%type sendopcode sendsopcode + +%type negate abs chansel math_function sharedfunction + +%type jumplabeltarget +%type jumplabel + +/* SWSB */ +%token REG_DIST_CURRENT +%token REG_DIST_FLOAT +%token REG_DIST_INT +%token REG_DIST_LONG +%token REG_DIST_ALL +%token SBID_ALLOC +%token SBID_WAIT_SRC +%token SBID_WAIT_DST + +%type depinfo + +%code { + +static void +add_instruction_option(struct options *options, struct instoption opt) +{ + if (opt.type == INSTOPTION_DEP_INFO) { + if (opt.depinfo_value.regdist) { + options->depinfo.regdist = opt.depinfo_value.regdist; + options->depinfo.pipe = opt.depinfo_value.pipe; + } else { + options->depinfo.sbid = opt.depinfo_value.sbid; + options->depinfo.mode = opt.depinfo_value.mode; + } + return; + } + switch (opt.uint_value) { + case ALIGN1: + options->access_mode = BRW_ALIGN_1; + break; + case ALIGN16: + options->access_mode = BRW_ALIGN_16; + break; + case SECHALF: + options->qtr_ctrl |= BRW_COMPRESSION_2NDHALF; + break; + case COMPR: + options->qtr_ctrl |= BRW_COMPRESSION_COMPRESSED; + options->is_compr = true; + break; + case COMPR4: + options->qtr_ctrl |= BRW_COMPRESSION_COMPRESSED; + break; + case SWITCH: + options->thread_control |= BRW_THREAD_SWITCH; + break; + case ATOMIC: + options->thread_control |= BRW_THREAD_ATOMIC; + break; + case NODDCHK: + options->no_dd_check = true; + break; + case 
NODDCLR: + options->no_dd_clear = BRW_DEPENDENCY_NOTCLEARED; + break; + case MASK_DISABLE: + options->mask_control |= BRW_MASK_DISABLE; + break; + case BREAKPOINT: + options->debug_control = BRW_DEBUG_BREAKPOINT; + break; + case WECTRL: + options->mask_control |= BRW_WE_ALL; + break; + case CMPTCTRL: + options->compaction = true; + break; + case ACCWREN: + options->acc_wr_control = true; + break; + case EOT: + options->end_of_thread = true; + break; + /* TODO : Figure out how to set instruction group and get rid of + * code below + */ + case QTR_2Q: + options->qtr_ctrl = BRW_COMPRESSION_2NDHALF; + break; + case QTR_3Q: + options->qtr_ctrl = BRW_COMPRESSION_COMPRESSED; + break; + case QTR_4Q: + options->qtr_ctrl = 3; + break; + case QTR_2H: + options->qtr_ctrl = BRW_COMPRESSION_COMPRESSED; + break; + case QTR_2N: + options->qtr_ctrl = BRW_COMPRESSION_NONE; + options->nib_ctrl = true; + break; + case QTR_3N: + options->qtr_ctrl = BRW_COMPRESSION_2NDHALF; + break; + case QTR_4N: + options->qtr_ctrl = BRW_COMPRESSION_2NDHALF; + options->nib_ctrl = true; + break; + case QTR_5N: + options->qtr_ctrl = BRW_COMPRESSION_COMPRESSED; + break; + case QTR_6N: + options->qtr_ctrl = BRW_COMPRESSION_COMPRESSED; + options->nib_ctrl = true; + break; + case QTR_7N: + options->qtr_ctrl = 3; + break; + case QTR_8N: + options->qtr_ctrl = 3; + options->nib_ctrl = true; + break; + } +} +} +%% + +ROOT: + instrseq + ; + +instrseq: + instrseq instruction SEMICOLON + | instrseq relocatableinstruction SEMICOLON + | instruction SEMICOLON + | relocatableinstruction SEMICOLON + | instrseq jumplabeltarget + | jumplabeltarget + ; + +/* Instruction Group */ +instruction: + unaryinstruction + | binaryinstruction + | binaryaccinstruction + | mathinstruction + | nopinstruction + | waitinstruction + | ternaryinstruction + | sendinstruction + | illegalinstruction + | syncinstruction + ; + +relocatableinstruction: + jumpinstruction + | branchinstruction + | breakinstruction + | loopinstruction + ; + 
+illegalinstruction: + ILLEGAL execsize instoptions + { + brw_next_insn(p, $1); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $2); + i965_asm_set_instruction_options(p, $3); + } + ; + +/* Unary instruction */ +unaryinstruction: + predicate unaryopcodes saturate cond_mod execsize dst srcaccimm instoptions + { + i965_asm_set_dst_nr(p, &$6, $8); + brw_set_default_access_mode(p, $8.access_mode); + i965_asm_unary_instruction($2, p, $6, $7); + brw_pop_insn_state(p); + i965_asm_set_instruction_options(p, $8); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, + $4.cond_modifier); + + if (p->devinfo->ver >= 7 && $2 != BRW_OPCODE_DIM && + !brw_inst_flag_reg_nr(p->devinfo, brw_last_inst)) { + brw_inst_set_flag_reg_nr(p->devinfo, + brw_last_inst, + $4.flag_reg_nr); + brw_inst_set_flag_subreg_nr(p->devinfo, + brw_last_inst, + $4.flag_subreg_nr); + } + + if ($7.file != BRW_IMMEDIATE_VALUE) { + brw_inst_set_src0_vstride(p->devinfo, brw_last_inst, + $7.vstride); + } + brw_inst_set_saturate(p->devinfo, brw_last_inst, $3); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $5); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $8.qtr_ctrl); + + if (p->devinfo->ver >= 7) + brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $8.nib_ctrl); + } + ; + +unaryopcodes: + BFREV + | CBIT + | DIM + | F16TO32 + | F32TO16 + | FBH + | FBL + | FRC + | LZD + | MOV + | NOT + | RNDD + | RNDE + | RNDU + | RNDZ + ; + +/* Binary instruction */ +binaryinstruction: + predicate binaryopcodes saturate cond_mod execsize dst srcimm srcimm instoptions + { + i965_asm_set_dst_nr(p, &$6, $9); + brw_set_default_access_mode(p, $9.access_mode); + i965_asm_binary_instruction($2, p, $6, $7, $8); + i965_asm_set_instruction_options(p, $9); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, + $4.cond_modifier); + + if (p->devinfo->ver >= 7 && + !brw_inst_flag_reg_nr(p->devinfo, brw_last_inst)) { + brw_inst_set_flag_reg_nr(p->devinfo, 
brw_last_inst, + $4.flag_reg_nr); + brw_inst_set_flag_subreg_nr(p->devinfo, brw_last_inst, + $4.flag_subreg_nr); + } + + brw_inst_set_saturate(p->devinfo, brw_last_inst, $3); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $5); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $9.qtr_ctrl); + + if (p->devinfo->ver >= 7) + brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $9.nib_ctrl); + + brw_pop_insn_state(p); + } + ; + +binaryopcodes: + ADDC + | BFI1 + | DP2 + | DP3 + | DP4 + | DPH + | LINE + | MAC + | MACH + | MUL + | PLN + | ROL + | ROR + | SAD2 + | SADA2 + | SUBB + ; + +/* Binary acc instruction */ +binaryaccinstruction: + predicate binaryaccopcodes saturate cond_mod execsize dst srcacc srcimm instoptions + { + i965_asm_set_dst_nr(p, &$6, $9); + brw_set_default_access_mode(p, $9.access_mode); + i965_asm_binary_instruction($2, p, $6, $7, $8); + brw_pop_insn_state(p); + i965_asm_set_instruction_options(p, $9); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, + $4.cond_modifier); + + if (p->devinfo->ver >= 7 && + !brw_inst_flag_reg_nr(p->devinfo, brw_last_inst)) { + brw_inst_set_flag_reg_nr(p->devinfo, + brw_last_inst, + $4.flag_reg_nr); + brw_inst_set_flag_subreg_nr(p->devinfo, + brw_last_inst, + $4.flag_subreg_nr); + } + + brw_inst_set_saturate(p->devinfo, brw_last_inst, $3); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $5); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $9.qtr_ctrl); + + if (p->devinfo->ver >= 7) + brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $9.nib_ctrl); + } + ; + +binaryaccopcodes: + ADD + | AND + | ASR + | AVG + | CMP + | CMPN + | OR + | SEL + | SHL + | SHR + | XOR + ; + +/* Math instruction */ +mathinstruction: + predicate MATH saturate math_function execsize dst src srcimm instoptions + { + brw_set_default_access_mode(p, $9.access_mode); + gfx6_math(p, $6, $4, $7, $8); + 
i965_asm_set_instruction_options(p, $9); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $5); + brw_inst_set_saturate(p->devinfo, brw_last_inst, $3); + // TODO: set instruction group instead + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $9.qtr_ctrl); + + if (p->devinfo->ver >= 7) + brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $9.nib_ctrl); + + brw_pop_insn_state(p); + } + ; + +math_function: + COS + | EXP + | FDIV + | INV + | INVM + | INTDIV + | INTDIVMOD + | INTMOD + | LOG + | POW + | RSQ + | RSQRTM + | SIN + | SQRT + | SINCOS + ; + +/* NOP instruction */ +nopinstruction: + NOP + { + brw_NOP(p); + } + ; + +/* Ternary operand instruction */ +ternaryinstruction: + predicate ternaryopcodes saturate cond_mod execsize dst srcimm src srcimm instoptions + { + brw_set_default_access_mode(p, $10.access_mode); + i965_asm_ternary_instruction($2, p, $6, $7, $8, $9); + brw_pop_insn_state(p); + i965_asm_set_instruction_options(p, $10); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, + $4.cond_modifier); + + if (p->devinfo->ver >= 7 && p->devinfo->ver < 12) { + brw_inst_set_3src_a16_flag_reg_nr(p->devinfo, brw_last_inst, + $4.flag_reg_nr); + brw_inst_set_3src_a16_flag_subreg_nr(p->devinfo, brw_last_inst, + $4.flag_subreg_nr); + } + + brw_inst_set_saturate(p->devinfo, brw_last_inst, $3); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $5); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $10.qtr_ctrl); + + if (p->devinfo->ver >= 7) + brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $10.nib_ctrl); + } + ; + +ternaryopcodes: + CSEL + | BFE + | BFI2 + | LRP + | MAD + | DP4A + | ADD3 + ; + +/* Wait instruction */ +waitinstruction: + WAIT execsize dst instoptions + { + brw_next_insn(p, $1); + i965_asm_set_instruction_options(p, $4); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $2); + brw_set_default_access_mode(p, $4.access_mode); + struct brw_reg dest = $3; + 
dest.swizzle = brw_swizzle_for_mask(dest.writemask); + if (dest.file != ARF || dest.nr != BRW_ARF_NOTIFICATION_COUNT) + error(&@1, "WAIT must use the notification register\n"); + brw_set_dest(p, brw_last_inst, dest); + brw_set_src0(p, brw_last_inst, dest); + brw_set_src1(p, brw_last_inst, brw_null_reg()); + brw_inst_set_mask_control(p->devinfo, brw_last_inst, BRW_MASK_DISABLE); + } + ; + +/* Send instruction */ +sendinstruction: + predicate sendopcode execsize dst payload exp2 sharedfunction msgdesc instoptions + { + i965_asm_set_instruction_options(p, $9); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + brw_set_dest(p, brw_last_inst, $4); + brw_set_src0(p, brw_last_inst, $5); + brw_inst_set_bits(brw_last_inst, 127, 96, $6); + brw_inst_set_src1_file_type(p->devinfo, brw_last_inst, + BRW_IMMEDIATE_VALUE, + BRW_REGISTER_TYPE_UD); + brw_inst_set_sfid(p->devinfo, brw_last_inst, $7); + brw_inst_set_eot(p->devinfo, brw_last_inst, $9.end_of_thread); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $9.qtr_ctrl); + + if (p->devinfo->ver >= 7) + brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $9.nib_ctrl); + + brw_pop_insn_state(p); + } + | predicate sendopcode execsize exp dst payload exp2 sharedfunction msgdesc instoptions + { + assert(p->devinfo->ver < 6); + + i965_asm_set_instruction_options(p, $10); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + brw_inst_set_base_mrf(p->devinfo, brw_last_inst, $4); + brw_set_dest(p, brw_last_inst, $5); + brw_set_src0(p, brw_last_inst, $6); + brw_inst_set_bits(brw_last_inst, 127, 96, $7); + brw_inst_set_src1_file_type(p->devinfo, brw_last_inst, + BRW_IMMEDIATE_VALUE, + BRW_REGISTER_TYPE_UD); + brw_inst_set_sfid(p->devinfo, brw_last_inst, $8); + brw_inst_set_eot(p->devinfo, brw_last_inst, $10.end_of_thread); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $10.qtr_ctrl); + + 
brw_pop_insn_state(p); + } + | predicate sendopcode execsize dst payload payload exp2 sharedfunction msgdesc instoptions + { + assert(p->devinfo->ver >= 6 && p->devinfo->ver < 12); + + i965_asm_set_instruction_options(p, $10); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + brw_set_dest(p, brw_last_inst, $4); + brw_set_src0(p, brw_last_inst, $5); + brw_inst_set_bits(brw_last_inst, 127, 96, $7); + brw_inst_set_sfid(p->devinfo, brw_last_inst, $8); + brw_inst_set_eot(p->devinfo, brw_last_inst, $10.end_of_thread); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $10.qtr_ctrl); + + if (p->devinfo->ver >= 7) + brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $10.nib_ctrl); + + brw_pop_insn_state(p); + } + | predicate sendsopcode execsize dst payload payload desc ex_desc sharedfunction msgdesc instoptions + { + assert(p->devinfo->ver >= 9); + + i965_asm_set_instruction_options(p, $11); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + brw_set_dest(p, brw_last_inst, $4); + brw_set_src0(p, brw_last_inst, $5); + brw_set_src1(p, brw_last_inst, $6); + + if ($7.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_send_sel_reg32_desc(p->devinfo, brw_last_inst, 0); + brw_inst_set_send_desc(p->devinfo, brw_last_inst, $7.ud); + } else { + brw_inst_set_send_sel_reg32_desc(p->devinfo, brw_last_inst, 1); + } + + if ($8.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_send_sel_reg32_ex_desc(p->devinfo, brw_last_inst, 0); + brw_inst_set_sends_ex_desc(p->devinfo, brw_last_inst, $8.ud); + } else { + brw_inst_set_send_sel_reg32_ex_desc(p->devinfo, brw_last_inst, 1); + brw_inst_set_send_ex_desc_ia_subreg_nr(p->devinfo, brw_last_inst, $8.subnr >> 2); + } + + brw_inst_set_sfid(p->devinfo, brw_last_inst, $9); + brw_inst_set_eot(p->devinfo, brw_last_inst, $11.end_of_thread); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $11.qtr_ctrl); + + 
brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $11.nib_ctrl); + + if (p->devinfo->verx10 >= 125 && $10.ex_bso) { + brw_inst_set_send_ex_bso(p->devinfo, brw_last_inst, 1); + brw_inst_set_send_src1_len(p->devinfo, brw_last_inst, + $10.src1_len); + } + + brw_pop_insn_state(p); + } + ; + +sendop: + SEND_GFX4 + | SENDC_GFX4 + ; + +sendsop: + SEND_GFX12 + | SENDC_GFX12 + | SENDS + | SENDSC + ; + +sendopcode: + sendop { $$ = brw_next_insn(p, $1); } + ; + +sendsopcode: + sendsop { $$ = brw_next_insn(p, $1); } + ; + +sharedfunction: + NULL_TOKEN { $$ = BRW_SFID_NULL; } + | MATH { $$ = BRW_SFID_MATH; } + | GATEWAY { $$ = BRW_SFID_MESSAGE_GATEWAY; } + | READ { $$ = BRW_SFID_DATAPORT_READ; } + | WRITE { $$ = BRW_SFID_DATAPORT_WRITE; } + | URB { $$ = BRW_SFID_URB; } + | THREAD_SPAWNER { $$ = BRW_SFID_THREAD_SPAWNER; } + | VME { $$ = BRW_SFID_VME; } + | RENDER { $$ = GFX6_SFID_DATAPORT_RENDER_CACHE; } + | CONST { $$ = GFX6_SFID_DATAPORT_CONSTANT_CACHE; } + | DATA { $$ = GFX7_SFID_DATAPORT_DATA_CACHE; } + | PIXEL_INTERP { $$ = GFX7_SFID_PIXEL_INTERPOLATOR; } + | DP_DATA_1 { $$ = HSW_SFID_DATAPORT_DATA_CACHE_1; } + | CRE { $$ = HSW_SFID_CRE; } + | SAMPLER { $$ = BRW_SFID_SAMPLER; } + | DP_SAMPLER { $$ = GFX6_SFID_DATAPORT_SAMPLER_CACHE; } + | RT_ACCEL { $$ = GEN_RT_SFID_RAY_TRACE_ACCELERATOR; } + | SLM { $$ = GFX12_SFID_SLM; } + | TGM { $$ = GFX12_SFID_TGM; } + | UGM { $$ = GFX12_SFID_UGM; } + ; + +exp2: + LONG { $$ = $1; } + | MINUS LONG { $$ = -$2; } + ; + +desc: + reg32a + | exp2 + { + $$ = brw_imm_ud($1); + } + ; + +ex_desc: + reg32a + | exp2 + { + $$ = brw_imm_ud($1); + } + ; + +reg32a: + addrreg region reg_type + { + $$ = set_direct_src_operand(&$1, $3); + $$ = stride($$, $2.vstride, $2.width, $2.hstride); + } + ; + + +/* Jump instruction */ +jumpinstruction: + predicate JMPI execsize relativelocation2 instoptions + { + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + brw_set_dest(p, 
brw_last_inst, brw_ip_reg()); + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + brw_set_src1(p, brw_last_inst, $4); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, + brw_inst_pred_control(p->devinfo, + brw_last_inst)); + brw_pop_insn_state(p); + } + ; + +/* branch instruction */ +branchinstruction: + predicate ENDIF execsize JUMP_LABEL instoptions + { + add_label(p, $4, INSTR_LABEL_JIP); + + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + if (p->devinfo->ver == 6) { + brw_set_dest(p, brw_last_inst, brw_imm_w(0x0)); + brw_set_src0(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + } else if (p->devinfo->ver == 7) { + brw_set_dest(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src0(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, brw_imm_w(0x0)); + } else { + brw_set_src0(p, brw_last_inst, brw_imm_d(0x0)); + } + + brw_pop_insn_state(p); + } + | predicate ENDIF execsize relativelocation instoptions + { + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + brw_set_dest(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src0(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + brw_inst_set_gfx4_pop_count(p->devinfo, brw_last_inst, $4); + + brw_inst_set_thread_control(p->devinfo, brw_last_inst, + BRW_THREAD_SWITCH); + + brw_pop_insn_state(p); + } + | ELSE execsize JUMP_LABEL jumplabel instoptions + { + add_label(p, $3, INSTR_LABEL_JIP); + add_label(p, $4, INSTR_LABEL_UIP); + + brw_next_insn(p, $1); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $2); + + if (p->devinfo->ver == 6) { + brw_set_dest(p, 
brw_last_inst, brw_imm_w(0x0)); + brw_set_src0(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + } else if (p->devinfo->ver == 7) { + brw_set_dest(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src0(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, brw_imm_w(0)); + } else { + brw_set_dest(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + if (p->devinfo->ver < 12) + brw_set_src0(p, brw_last_inst, brw_imm_d(0)); + } + } + | ELSE execsize relativelocation rellocation instoptions + { + brw_next_insn(p, $1); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $2); + + brw_set_dest(p, brw_last_inst, brw_ip_reg()); + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + brw_inst_set_gfx4_jump_count(p->devinfo, brw_last_inst, $3); + brw_inst_set_gfx4_pop_count(p->devinfo, brw_last_inst, $4); + + if (!p->single_program_flow) + brw_inst_set_thread_control(p->devinfo, brw_last_inst, + BRW_THREAD_SWITCH); + } + | predicate IF execsize JUMP_LABEL jumplabel instoptions + { + add_label(p, $4, INSTR_LABEL_JIP); + add_label(p, $5, INSTR_LABEL_UIP); + + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + if (p->devinfo->ver == 6) { + brw_set_dest(p, brw_last_inst, brw_imm_w(0x0)); + brw_set_src0(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + brw_set_src1(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + } else if (p->devinfo->ver == 7) { + brw_set_dest(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + brw_set_src0(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + brw_set_src1(p, brw_last_inst, brw_imm_w(0x0)); + } else { + 
brw_set_dest(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + if (p->devinfo->ver < 12) + brw_set_src0(p, brw_last_inst, brw_imm_d(0x0)); + } + + brw_pop_insn_state(p); + } + | predicate IF execsize relativelocation rellocation instoptions + { + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + brw_set_dest(p, brw_last_inst, brw_ip_reg()); + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + brw_inst_set_gfx4_jump_count(p->devinfo, brw_last_inst, $4); + brw_inst_set_gfx4_pop_count(p->devinfo, brw_last_inst, $5); + + if (!p->single_program_flow) + brw_inst_set_thread_control(p->devinfo, brw_last_inst, + BRW_THREAD_SWITCH); + + brw_pop_insn_state(p); + } + | predicate IFF execsize JUMP_LABEL instoptions + { + add_label(p, $4, INSTR_LABEL_JIP); + + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + if (p->devinfo->ver == 6) { + brw_set_src0(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + brw_set_src1(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + } else if (p->devinfo->ver == 7) { + brw_set_dest(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + brw_set_src0(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + brw_set_src1(p, brw_last_inst, brw_imm_w(0x0)); + } else { + brw_set_dest(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + if (p->devinfo->ver < 12) + brw_set_src0(p, brw_last_inst, brw_imm_d(0x0)); + } + + brw_pop_insn_state(p); + } + | predicate IFF execsize relativelocation instoptions + { + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + brw_set_dest(p, brw_last_inst, brw_ip_reg()); + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + 
brw_inst_set_gfx4_jump_count(p->devinfo, brw_last_inst, $4); + brw_set_src1(p, brw_last_inst, brw_imm_d($4)); + + if (!p->single_program_flow) + brw_inst_set_thread_control(p->devinfo, brw_last_inst, + BRW_THREAD_SWITCH); + + brw_pop_insn_state(p); + } + ; + +/* break instruction */ +breakinstruction: + predicate BREAK execsize JUMP_LABEL JUMP_LABEL instoptions + { + add_label(p, $4, INSTR_LABEL_JIP); + add_label(p, $5, INSTR_LABEL_UIP); + + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + if (p->devinfo->ver >= 8) { + brw_set_dest(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src0(p, brw_last_inst, brw_imm_d(0x0)); + } else { + brw_set_dest(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src0(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + } + + brw_pop_insn_state(p); + } + | predicate BREAK execsize relativelocation relativelocation instoptions + { + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + brw_set_dest(p, brw_last_inst, brw_ip_reg()); + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + brw_inst_set_gfx4_jump_count(p->devinfo, brw_last_inst, $4); + brw_inst_set_gfx4_pop_count(p->devinfo, brw_last_inst, $5); + + brw_pop_insn_state(p); + } + | predicate HALT execsize JUMP_LABEL JUMP_LABEL instoptions + { + add_label(p, $4, INSTR_LABEL_JIP); + add_label(p, $5, INSTR_LABEL_UIP); + + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + brw_set_dest(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + + if (p->devinfo->ver < 8) { + brw_set_src0(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, 
brw_imm_d(0x0)); + } else if (p->devinfo->ver < 12) { + brw_set_src0(p, brw_last_inst, brw_imm_d(0x0)); + } + + brw_pop_insn_state(p); + } + | predicate CONT execsize JUMP_LABEL JUMP_LABEL instoptions + { + add_label(p, $4, INSTR_LABEL_JIP); + add_label(p, $5, INSTR_LABEL_UIP); + + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + brw_set_dest(p, brw_last_inst, brw_ip_reg()); + + if (p->devinfo->ver >= 8) { + brw_set_src0(p, brw_last_inst, brw_imm_d(0x0)); + } else { + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + } + + brw_pop_insn_state(p); + } + | predicate CONT execsize relativelocation relativelocation instoptions + { + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + brw_set_dest(p, brw_last_inst, brw_ip_reg()); + + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + + brw_inst_set_gfx4_jump_count(p->devinfo, brw_last_inst, $4); + brw_inst_set_gfx4_pop_count(p->devinfo, brw_last_inst, $5); + + brw_pop_insn_state(p); + } + ; + +/* loop instruction */ +loopinstruction: + predicate WHILE execsize JUMP_LABEL instoptions + { + add_label(p, $4, INSTR_LABEL_JIP); + + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + if (p->devinfo->ver >= 8) { + brw_set_dest(p, brw_last_inst, + retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + if (p->devinfo->ver < 12) + brw_set_src0(p, brw_last_inst, brw_imm_d(0x0)); + } else if (p->devinfo->ver == 7) { + brw_set_dest(p, brw_last_inst, + retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src0(p, brw_last_inst, + retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, + brw_imm_w(0x0)); + } else { + brw_set_dest(p, brw_last_inst, brw_imm_w(0x0)); + brw_set_src0(p, brw_last_inst, + 
retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, + retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + } + + brw_pop_insn_state(p); + } + | predicate WHILE execsize relativelocation instoptions + { + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + brw_set_dest(p, brw_last_inst, brw_ip_reg()); + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + brw_inst_set_gfx4_jump_count(p->devinfo, brw_last_inst, $4); + brw_inst_set_gfx4_pop_count(p->devinfo, brw_last_inst, 0); + + brw_pop_insn_state(p); + } + | DO execsize instoptions + { + brw_next_insn(p, $1); + if (p->devinfo->ver < 6) { + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $2); + i965_asm_set_instruction_options(p, $3); + brw_set_dest(p, brw_last_inst, brw_null_reg()); + brw_set_src0(p, brw_last_inst, brw_null_reg()); + brw_set_src1(p, brw_last_inst, brw_null_reg()); + + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, BRW_COMPRESSION_NONE); + } + } + ; + +/* sync instruction */ +syncinstruction: + predicate SYNC sync_function execsize sync_arg instoptions + { + if (p->devinfo->ver < 12) { + error(&@2, "sync instruction is supported only on gfx12+\n"); + } + + if ($5.file == BRW_IMMEDIATE_VALUE && + $3 != TGL_SYNC_ALLRD && + $3 != TGL_SYNC_ALLWR) { + error(&@2, "Only allrd and allwr support immediate argument\n"); + } + + brw_set_default_access_mode(p, $6.access_mode); + brw_SYNC(p, $3); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $4); + brw_set_src0(p, brw_last_inst, $5); + brw_inst_set_eot(p->devinfo, brw_last_inst, $6.end_of_thread); + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, $6.qtr_ctrl); + brw_inst_set_nib_control(p->devinfo, brw_last_inst, $6.nib_ctrl); + + brw_pop_insn_state(p); + } + ; + +sync_function: + NOP { $$ = TGL_SYNC_NOP; } + | ALLRD + | ALLWR + | FENCE + | BAR + | HOST + ; + 
+sync_arg: + nullreg region reg_type + { + $$ = $1; + $$.vstride = $2.vstride; + $$.width = $2.width; + $$.hstride = $2.hstride; + $$.type = $3; + } + | immreg + ; + +/* Relative location */ +relativelocation2: + immreg + | reg32 + ; + +simple_int: + INTEGER { $$ = $1; } + | MINUS INTEGER { $$ = -$2; } + | LONG { $$ = $1; } + | MINUS LONG { $$ = -$2; } + ; + +rellocation: + relativelocation + | /* empty */ { $$ = 0; } + ; + +relativelocation: + simple_int + { + $$ = $1; + } + ; + +jumplabel: + JUMP_LABEL { $$ = $1; } + | /* empty */ { $$ = NULL; } + ; + +jumplabeltarget: + JUMP_LABEL_TARGET + { + struct target_label *label = rzalloc(p->mem_ctx, struct target_label); + + label->name = ralloc_strdup(p->mem_ctx, $1); + label->offset = p->next_insn_offset; + + list_addtail(&label->link, &target_labels); + } + ; + +/* Destination register */ +dst: + dstoperand + | dstoperandex + ; + +dstoperand: + dstreg dstregion writemask reg_type + { + $$ = $1; + $$.vstride = BRW_VERTICAL_STRIDE_1; + $$.width = BRW_WIDTH_1; + $$.hstride = $2; + $$.type = $4; + $$.writemask = $3; + $$.swizzle = BRW_SWIZZLE_NOOP; + $$.subnr = $$.subnr * brw_reg_type_to_size($4); + } + ; + +dstoperandex: + dstoperandex_typed dstregion writemask reg_type + { + $$ = $1; + $$.hstride = $2; + $$.type = $4; + $$.writemask = $3; + $$.subnr = $$.subnr * brw_reg_type_to_size($4); + } + /* BSpec says "When the conditional modifier is present, updates + * to the selected flag register also occur. In this case, the + * register region fields of the ‘null’ operand are valid." 
+ */ + | nullreg dstregion writemask reg_type + { + $$ = $1; + $$.vstride = BRW_VERTICAL_STRIDE_1; + $$.width = BRW_WIDTH_1; + $$.hstride = $2; + $$.writemask = $3; + $$.type = $4; + } + | threadcontrolreg + { + $$ = $1; + $$.hstride = 1; + $$.type = BRW_REGISTER_TYPE_UW; + } + ; + +dstoperandex_typed: + accreg + | addrreg + | channelenablereg + | controlreg + | flagreg + | ipreg + | maskreg + | notifyreg + | performancereg + | statereg + ; + +dstreg: + directgenreg + { + $$ = $1; + $$.address_mode = BRW_ADDRESS_DIRECT; + } + | indirectgenreg + { + $$ = $1; + $$.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + } + | directmsgreg + { + $$ = $1; + $$.address_mode = BRW_ADDRESS_DIRECT; + } + | indirectmsgreg + { + $$ = $1; + $$.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + } + ; + +/* Source register */ +srcaccimm: + srcacc + | immreg + ; + +immreg: + immval imm_type + { + switch ($2) { + case BRW_REGISTER_TYPE_UD: + $$ = brw_imm_ud($1); + break; + case BRW_REGISTER_TYPE_D: + $$ = brw_imm_d($1); + break; + case BRW_REGISTER_TYPE_UW: + $$ = brw_imm_uw($1 | ($1 << 16)); + break; + case BRW_REGISTER_TYPE_W: + $$ = brw_imm_w($1); + break; + case BRW_REGISTER_TYPE_F: + $$ = brw_imm_reg(BRW_REGISTER_TYPE_F); + /* Set u64 instead of ud since DIM uses a 64-bit F-typed imm */ + $$.u64 = $1; + break; + case BRW_REGISTER_TYPE_V: + $$ = brw_imm_v($1); + break; + case BRW_REGISTER_TYPE_UV: + $$ = brw_imm_uv($1); + break; + case BRW_REGISTER_TYPE_VF: + $$ = brw_imm_vf($1); + break; + case BRW_REGISTER_TYPE_Q: + $$ = brw_imm_q($1); + break; + case BRW_REGISTER_TYPE_UQ: + $$ = brw_imm_uq($1); + break; + case BRW_REGISTER_TYPE_DF: + $$ = brw_imm_reg(BRW_REGISTER_TYPE_DF); + $$.d64 = $1; + break; + case BRW_REGISTER_TYPE_HF: + $$ = brw_imm_reg(BRW_REGISTER_TYPE_HF); + $$.ud = $1 | ($1 << 16); + break; + default: + error(&@2, "Unknown immediate type %s\n", + brw_reg_type_to_letters($2)); + } + } + ; + +reg32: + directgenreg region reg_type + { + $$ = 
set_direct_src_operand(&$1, $3); + $$ = stride($$, $2.vstride, $2.width, $2.hstride); + } + ; + +payload: + directsrcoperand + ; + +src: + directsrcoperand + | indirectsrcoperand + ; + +srcacc: + directsrcaccoperand + | indirectsrcoperand + ; + +srcimm: + directsrcoperand + | indirectsrcoperand + | immreg + ; + +directsrcaccoperand: + directsrcoperand + | negate abs accreg region reg_type + { + $$ = set_direct_src_operand(&$3, $5); + $$.negate = $1; + $$.abs = $2; + $$.vstride = $4.vstride; + $$.width = $4.width; + $$.hstride = $4.hstride; + } + ; + +srcarcoperandex: + srcarcoperandex_typed region reg_type + { + $$ = brw_reg($1.file, + $1.nr, + $1.subnr, + 0, + 0, + $3, + $2.vstride, + $2.width, + $2.hstride, + BRW_SWIZZLE_NOOP, + WRITEMASK_XYZW); + } + | nullreg region reg_type + { + $$ = set_direct_src_operand(&$1, $3); + $$.vstride = $2.vstride; + $$.width = $2.width; + $$.hstride = $2.hstride; + } + | threadcontrolreg + { + $$ = set_direct_src_operand(&$1, BRW_REGISTER_TYPE_UW); + } + ; + +srcarcoperandex_typed: + channelenablereg + | controlreg + | flagreg + | ipreg + | maskreg + | statereg + ; + +indirectsrcoperand: + negate abs indirectgenreg indirectregion swizzle reg_type + { + $$ = brw_reg($3.file, + 0, + $3.subnr, + $1, // negate + $2, // abs + $6, + $4.vstride, + $4.width, + $4.hstride, + $5, + WRITEMASK_X); + + $$.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + // brw_reg set indirect_offset to 0 so set it to valid value + $$.indirect_offset = $3.indirect_offset; + } + ; + +directgenreg_list: + directgenreg + | directmsgreg + | notifyreg + | addrreg + | performancereg + ; + +directsrcoperand: + negate abs directgenreg_list region swizzle reg_type + { + $$ = brw_reg($3.file, + $3.nr, + $3.subnr, + $1, + $2, + $6, + $4.vstride, + $4.width, + $4.hstride, + $5, + WRITEMASK_X); + } + | srcarcoperandex + ; + +/* Address register */ +addrparam: + addrreg exp + { + memset(&$$, '\0', sizeof($$)); + $$.subnr = $1.subnr; + $$.indirect_offset = $2; + } + 
| addrreg
+	;
+
+/* Register files and register numbers */
+exp:
+	INTEGER { $$ = $1; }
+	| LONG { $$ = $1; }
+	;
+
+subregnum:
+	DOT exp { $$ = $2; }
+	| /* empty */ %prec SUBREGNUM { $$ = 0; }
+	;
+
+directgenreg:
+	GENREG subregnum
+	{
+		memset(&$$, '\0', sizeof($$));
+		$$.file = BRW_GENERAL_REGISTER_FILE;
+		$$.nr = $1;
+		$$.subnr = $2;
+	}
+	;
+
+indirectgenreg:
+	GENREGFILE LSQUARE addrparam RSQUARE
+	{
+		memset(&$$, '\0', sizeof($$));
+		$$.file = BRW_GENERAL_REGISTER_FILE;
+		$$.subnr = $3.subnr;
+		$$.indirect_offset = $3.indirect_offset;
+	}
+	;
+
+directmsgreg:
+	MSGREG subregnum
+	{
+		/* Zero-initialize like the sibling register rules so fields this
+		 * action does not set (e.g. indirect_offset) are not left as
+		 * stack garbage.
+		 */
+		memset(&$$, '\0', sizeof($$));
+		$$.file = BRW_MESSAGE_REGISTER_FILE;
+		$$.nr = $1;
+		$$.subnr = $2;
+	}
+	;
+
+indirectmsgreg:
+	MSGREGFILE LSQUARE addrparam RSQUARE
+	{
+		memset(&$$, '\0', sizeof($$));
+		$$.file = BRW_MESSAGE_REGISTER_FILE;
+		$$.subnr = $3.subnr;
+		$$.indirect_offset = $3.indirect_offset;
+	}
+	;
+
+addrreg:
+	ADDRREG subregnum
+	{
+		int subnr = (p->devinfo->ver >= 8) ? 16 : 8;
+
+		/* Adjacent string literals concatenate: keep the leading space
+		 * in the second literal so the message reads "number %d out of
+		 * range", not "number %dout of range".
+		 */
+		if ($2 > subnr)
+			error(&@2, "Address sub register number %d"
+				   " out of range\n", $2);
+
+		memset(&$$, '\0', sizeof($$));
+		$$.file = BRW_ARCHITECTURE_REGISTER_FILE;
+		$$.nr = BRW_ARF_ADDRESS;
+		$$.subnr = $2;
+	}
+	;
+
+accreg:
+	ACCREG subregnum
+	{
+		/* Gfx8+ exposes more accumulator registers than older parts. */
+		int nr_reg;
+		if (p->devinfo->ver < 8)
+			nr_reg = 2;
+		else
+			nr_reg = 10;
+
+		if ($1 > nr_reg)
+			error(&@1, "Accumulator register number %d"
+				   " out of range\n", $1);
+
+		memset(&$$, '\0', sizeof($$));
+		$$.file = BRW_ARCHITECTURE_REGISTER_FILE;
+		$$.nr = BRW_ARF_ACCUMULATOR;
+		$$.subnr = $2;
+	}
+	;
+
+flagreg:
+	FLAGREG subregnum
+	{
+		// SNB = 1 flag reg and IVB+ = 2 flag reg
+		int nr_reg = (p->devinfo->ver >= 7) ?
2 : 1; + int subnr = nr_reg; + + if ($1 > nr_reg) + error(&@1, "Flag register number %d" + " out of range \n", $1); + if ($2 > subnr) + error(&@2, "Flag subregister number %d" + " out of range\n", $2); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = BRW_ARF_FLAG | $1; + $$.subnr = $2; + } + ; + +maskreg: + MASKREG subregnum + { + if ($1 > 0) + error(&@1, "Mask register number %d" + " out of range\n", $1); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = BRW_ARF_MASK; + $$.subnr = $2; + } + ; + +notifyreg: + NOTIFYREG subregnum + { + int subnr = (p->devinfo->ver >= 11) ? 2 : 3; + if ($2 > subnr) + error(&@2, "Notification sub register number %d" + " out of range\n", $2); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = BRW_ARF_NOTIFICATION_COUNT; + $$.subnr = $2; + } + ; + +statereg: + STATEREG subregnum + { + if ($1 > 2) + error(&@1, "State register number %d" + " out of range\n", $1); + + if ($2 > 4) + error(&@2, "State sub register number %d" + " out of range\n", $2); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = BRW_ARF_STATE; + $$.subnr = $2; + } + ; + +controlreg: + CONTROLREG subregnum + { + if ($2 > 3) + error(&@2, "control sub register number %d" + " out of range\n", $2); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = BRW_ARF_CONTROL; + $$.subnr = $2; + } + ; + +ipreg: + IPREG { $$ = brw_ip_reg(); } + ; + +nullreg: + NULL_TOKEN { $$ = brw_null_reg(); } + ; + +threadcontrolreg: + THREADREG subregnum + { + if ($2 > 7) + error(&@2, "Thread control sub register number %d" + " out of range\n", $2); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = BRW_ARF_TDR; + $$.subnr = $2; + } + ; + +performancereg: + PERFORMANCEREG subregnum + { + int subnr; + if (p->devinfo->ver >= 10) + subnr = 5; + else if (p->devinfo->ver <= 8) + subnr = 3; + else + subnr = 4; + + if ($2 > subnr) + error(&@2, "Performance sub register number %d" + " out of range\n", $2); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = 
BRW_ARF_TIMESTAMP; + $$.subnr = $2; + } + ; + +channelenablereg: + CHANNELENABLEREG subregnum + { + if ($1 > 0) + error(&@1, "Channel enable register number %d" + " out of range\n", $1); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = BRW_ARF_MASK; + $$.subnr = $2; + } + ; + +/* Immediate values */ +immval: + exp2 + { + $$ = $1; + } + | LSQUARE exp2 COMMA exp2 COMMA exp2 COMMA exp2 RSQUARE + { + $$ = ($2 << 0) | ($4 << 8) | ($6 << 16) | ($8 << 24); + } + ; + +/* Regions */ +dstregion: + /* empty */ + { + $$ = BRW_HORIZONTAL_STRIDE_1; + } + | LANGLE exp RANGLE + { + if ($2 != 0 && ($2 > 4 || !isPowerofTwo($2))) + error(&@2, "Invalid Horizontal stride %d\n", $2); + + $$ = ffs($2); + } + ; + +indirectregion: + region + | region_wh + ; + +region: + /* empty */ + { + $$ = stride($$, 0, 1, 0); + } + | LANGLE exp RANGLE + { + if ($2 != 0 && ($2 > 32 || !isPowerofTwo($2))) + error(&@2, "Invalid VertStride %d\n", $2); + + $$ = stride($$, $2, 1, 0); + } + | LANGLE exp COMMA exp COMMA exp RANGLE + { + + if ($2 != 0 && ($2 > 32 || !isPowerofTwo($2))) + error(&@2, "Invalid VertStride %d\n", $2); + + if ($4 > 16 || !isPowerofTwo($4)) + error(&@4, "Invalid width %d\n", $4); + + if ($6 != 0 && ($6 > 4 || !isPowerofTwo($6))) + error(&@6, "Invalid Horizontal stride in" + " region_wh %d\n", $6); + + $$ = stride($$, $2, $4, $6); + } + | LANGLE exp SEMICOLON exp COMMA exp RANGLE + { + if ($2 != 0 && ($2 > 32 || !isPowerofTwo($2))) + error(&@2, "Invalid VertStride %d\n", $2); + + if ($4 > 16 || !isPowerofTwo($4)) + error(&@4, "Invalid width %d\n", $4); + + if ($6 != 0 && ($6 > 4 || !isPowerofTwo($6))) + error(&@6, "Invalid Horizontal stride in" + " region_wh %d\n", $6); + + $$ = stride($$, $2, $4, $6); + } + | LANGLE VxH COMMA exp COMMA exp RANGLE + { + if ($4 > 16 || !isPowerofTwo($4)) + error(&@4, "Invalid width %d\n", $4); + + if ($6 != 0 && ($6 > 4 || !isPowerofTwo($6))) + error(&@6, "Invalid Horizontal stride in" + " region_wh %d\n", $6); + + $$ = brw_VxH_indirect(0, 0); + 
} + ; + +region_wh: + LANGLE exp COMMA exp RANGLE + { + if ($2 > 16 || !isPowerofTwo($2)) + error(&@2, "Invalid width %d\n", $2); + + if ($4 != 0 && ($4 > 4 || !isPowerofTwo($4))) + error(&@4, "Invalid Horizontal stride in" + " region_wh %d\n", $4); + + $$ = stride($$, 0, $2, $4); + $$.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL; + } + ; + +reg_type: + TYPE_F { $$ = BRW_REGISTER_TYPE_F; } + | TYPE_UD { $$ = BRW_REGISTER_TYPE_UD; } + | TYPE_D { $$ = BRW_REGISTER_TYPE_D; } + | TYPE_UW { $$ = BRW_REGISTER_TYPE_UW; } + | TYPE_W { $$ = BRW_REGISTER_TYPE_W; } + | TYPE_UB { $$ = BRW_REGISTER_TYPE_UB; } + | TYPE_B { $$ = BRW_REGISTER_TYPE_B; } + | TYPE_DF { $$ = BRW_REGISTER_TYPE_DF; } + | TYPE_UQ { $$ = BRW_REGISTER_TYPE_UQ; } + | TYPE_Q { $$ = BRW_REGISTER_TYPE_Q; } + | TYPE_HF { $$ = BRW_REGISTER_TYPE_HF; } + | TYPE_NF { $$ = BRW_REGISTER_TYPE_NF; } + ; + +imm_type: + reg_type { $$ = $1; } + | TYPE_V { $$ = BRW_REGISTER_TYPE_V; } + | TYPE_VF { $$ = BRW_REGISTER_TYPE_VF; } + | TYPE_UV { $$ = BRW_REGISTER_TYPE_UV; } + ; + +writemask: + /* empty */ + { + $$ = WRITEMASK_XYZW; + } + | DOT writemask_x writemask_y writemask_z writemask_w + { + $$ = $2 | $3 | $4 | $5; + } + ; + +writemask_x: + /* empty */ { $$ = 0; } + | X { $$ = 1 << BRW_CHANNEL_X; } + ; + +writemask_y: + /* empty */ { $$ = 0; } + | Y { $$ = 1 << BRW_CHANNEL_Y; } + ; + +writemask_z: + /* empty */ { $$ = 0; } + | Z { $$ = 1 << BRW_CHANNEL_Z; } + ; + +writemask_w: + /* empty */ { $$ = 0; } + | W { $$ = 1 << BRW_CHANNEL_W; } + ; + +swizzle: + /* empty */ + { + $$ = BRW_SWIZZLE_NOOP; + } + | DOT chansel + { + $$ = BRW_SWIZZLE4($2, $2, $2, $2); + } + | DOT chansel chansel chansel chansel + { + $$ = BRW_SWIZZLE4($2, $3, $4, $5); + } + ; + +chansel: + X + | Y + | Z + | W + ; + +/* Instruction prediction and modifiers */ +predicate: + /* empty */ + { + brw_push_insn_state(p); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_predicate_inverse(p, 
false); + } + | LPAREN predstate flagreg predctrl RPAREN + { + brw_push_insn_state(p); + brw_set_default_predicate_inverse(p, $2); + brw_set_default_flag_reg(p, $3.nr, $3.subnr); + brw_set_default_predicate_control(p, $4); + } + ; + +predstate: + /* empty */ { $$ = 0; } + | PLUS { $$ = 0; } + | MINUS { $$ = 1; } + ; + +predctrl: + /* empty */ { $$ = BRW_PREDICATE_NORMAL; } + | DOT X { $$ = BRW_PREDICATE_ALIGN16_REPLICATE_X; } + | DOT Y { $$ = BRW_PREDICATE_ALIGN16_REPLICATE_Y; } + | DOT Z { $$ = BRW_PREDICATE_ALIGN16_REPLICATE_Z; } + | DOT W { $$ = BRW_PREDICATE_ALIGN16_REPLICATE_W; } + | ANYV + | ALLV + | ANY2H + | ALL2H + | ANY4H + | ALL4H + | ANY8H + | ALL8H + | ANY16H + | ALL16H + | ANY32H + | ALL32H + ; + +/* Source Modification */ +negate: + /* empty */ { $$ = 0; } + | MINUS { $$ = 1; } + ; + +abs: + /* empty */ { $$ = 0; } + | ABS { $$ = 1; } + ; + +/* Flag (Conditional) Modifier */ +cond_mod: + condModifiers + { + $$.cond_modifier = $1; + $$.flag_reg_nr = 0; + $$.flag_subreg_nr = 0; + } + | condModifiers DOT flagreg + { + $$.cond_modifier = $1; + $$.flag_reg_nr = $3.nr; + $$.flag_subreg_nr = $3.subnr; + } + ; + +condModifiers: + /* empty */ { $$ = BRW_CONDITIONAL_NONE; } + | ZERO + | EQUAL + | NOT_ZERO + | NOT_EQUAL + | GREATER + | GREATER_EQUAL + | LESS + | LESS_EQUAL + | OVERFLOW + | ROUND_INCREMENT + | UNORDERED + ; + +/* message details for send */ +msgdesc: + MSGDESC_BEGIN msgdesc_parts MSGDESC_END { $$ = $2; } + ; + +msgdesc_parts: + SRC1_LEN ASSIGN INTEGER msgdesc_parts + { + $$ = $4; + $$.src1_len = $3; + } + | EX_BSO msgdesc_parts + { + $$ = $2; + $$.ex_bso = 1; + } + | INTEGER msgdesc_parts { $$ = $2; } + | ASSIGN msgdesc_parts { $$ = $2; } + | /* empty */ + { + memset(&$$, 0, sizeof($$)); + } + ; + +saturate: + /* empty */ { $$ = BRW_INSTRUCTION_NORMAL; } + | SATURATE { $$ = BRW_INSTRUCTION_SATURATE; } + ; + +/* Execution size */ +execsize: + /* empty */ %prec EMPTYEXECSIZE + { + $$ = 0; + } + | LPAREN exp2 RPAREN + { + if ($2 > 32 || 
!isPowerofTwo($2))
+			error(&@2, "Invalid execution size %llu\n", $2);
+
+		$$ = cvt($2) - 1;
+	}
+	;
+
+/* Instruction options */
+instoptions:
+	/* empty */
+	{
+		memset(&$$, 0, sizeof($$));
+	}
+	| LCURLY instoption_list RCURLY
+	{
+		memset(&$$, 0, sizeof($$));
+		$$ = $2;
+	}
+	;
+
+instoption_list:
+	instoption_list COMMA instoption
+	{
+		memset(&$$, 0, sizeof($$));
+		$$ = $1;
+		add_instruction_option(&$$, $3);
+	}
+	| instoption_list instoption
+	{
+		memset(&$$, 0, sizeof($$));
+		$$ = $1;
+		add_instruction_option(&$$, $2);
+	}
+	| /* empty */
+	{
+		memset(&$$, 0, sizeof($$));
+	}
+	;
+
+/* Gfx12+ software scoreboard (SWSB) dependency annotations. */
+depinfo:
+	REG_DIST_CURRENT
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.regdist = $1;
+		$$.pipe = TGL_PIPE_NONE;
+	}
+	| REG_DIST_FLOAT
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.regdist = $1;
+		$$.pipe = TGL_PIPE_FLOAT;
+	}
+	| REG_DIST_INT
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.regdist = $1;
+		$$.pipe = TGL_PIPE_INT;
+	}
+	| REG_DIST_LONG
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.regdist = $1;
+		$$.pipe = TGL_PIPE_LONG;
+	}
+	| REG_DIST_ALL
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.regdist = $1;
+		$$.pipe = TGL_PIPE_ALL;
+	}
+	| SBID_ALLOC
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.sbid = $1;
+		$$.mode = TGL_SBID_SET;
+	}
+	| SBID_WAIT_SRC
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.sbid = $1;
+		$$.mode = TGL_SBID_SRC;
+	}
+	| SBID_WAIT_DST
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.sbid = $1;
+		$$.mode = TGL_SBID_DST;
+	}
+	;
+
+instoption:
+	ALIGN1 { $$.type = INSTOPTION_FLAG; $$.uint_value = ALIGN1;}
+	| ALIGN16 { $$.type = INSTOPTION_FLAG; $$.uint_value = ALIGN16; }
+	| ACCWREN { $$.type = INSTOPTION_FLAG; $$.uint_value = ACCWREN; }
+	| SECHALF { $$.type = INSTOPTION_FLAG; $$.uint_value = SECHALF; }
+	| COMPR { $$.type = INSTOPTION_FLAG; $$.uint_value = COMPR; }
+	| COMPR4 { $$.type = INSTOPTION_FLAG; $$.uint_value = COMPR4; }
+	| BREAKPOINT { $$.type = INSTOPTION_FLAG; $$.uint_value = BREAKPOINT; }
+	| NODDCLR { $$.type = INSTOPTION_FLAG; $$.uint_value = NODDCLR; }
+	| NODDCHK { $$.type = 
INSTOPTION_FLAG; $$.uint_value = NODDCHK; } + | MASK_DISABLE { $$.type = INSTOPTION_FLAG; $$.uint_value = MASK_DISABLE; } + | EOT { $$.type = INSTOPTION_FLAG; $$.uint_value = EOT; } + | SWITCH { $$.type = INSTOPTION_FLAG; $$.uint_value = SWITCH; } + | ATOMIC { $$.type = INSTOPTION_FLAG; $$.uint_value = ATOMIC; } + | CMPTCTRL { $$.type = INSTOPTION_FLAG; $$.uint_value = CMPTCTRL; } + | WECTRL { $$.type = INSTOPTION_FLAG; $$.uint_value = WECTRL; } + | QTR_2Q { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_2Q; } + | QTR_3Q { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_3Q; } + | QTR_4Q { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_4Q; } + | QTR_2H { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_2H; } + | QTR_2N { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_2N; } + | QTR_3N { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_3N; } + | QTR_4N { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_4N; } + | QTR_5N { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_5N; } + | QTR_6N { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_6N; } + | QTR_7N { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_7N; } + | QTR_8N { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_8N; } + | depinfo { $$.type = INSTOPTION_DEP_INFO; $$.depinfo_value = $1; } + ; + +%% + +extern int yylineno; + +#ifdef YYBYACC +void +yyerror(YYLTYPE *ltype, char *msg) +#else +void +yyerror(char *msg) +#endif +{ + fprintf(stderr, "%s: %d: %s at \"%s\"\n", + input_filename, yylineno, msg, lex_text()); + ++errors; +} diff --git a/src/intel/compiler/elk/brw_inst.h b/src/intel/compiler/elk/brw_inst.h new file mode 100644 index 00000000000..c3a324257cb --- /dev/null +++ b/src/intel/compiler/elk/brw_inst.h @@ -0,0 +1,1732 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, 
copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file brw_inst.h + * + * A representation of i965 EU assembly instructions, with helper methods to + * get and set various fields. This is the actual hardware format. + */ + +#ifndef BRW_INST_H +#define BRW_INST_H + +#include +#include + +#include "brw_eu_defines.h" +#include "brw_isa_info.h" +#include "brw_reg_type.h" +#include "dev/intel_device_info.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* brw_context.h has a forward declaration of brw_inst, so name the struct. 
*/ +typedef struct brw_inst { + uint64_t data[2]; +} brw_inst; + +static inline uint64_t brw_inst_bits(const brw_inst *inst, + unsigned high, unsigned low); +static inline void brw_inst_set_bits(brw_inst *inst, + unsigned high, unsigned low, + uint64_t value); + +#define FC(name, hi4, lo4, hi12, lo12, assertions) \ +static inline void \ +brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t v) \ +{ \ + assert(assertions); \ + if (devinfo->ver >= 12) \ + brw_inst_set_bits(inst, hi12, lo12, v); \ + else \ + brw_inst_set_bits(inst, hi4, lo4, v); \ +} \ +static inline uint64_t \ +brw_inst_##name(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + assert(assertions); \ + if (devinfo->ver >= 12) \ + return brw_inst_bits(inst, hi12, lo12); \ + else \ + return brw_inst_bits(inst, hi4, lo4); \ +} + +/* A simple macro for fields which stay in the same place on all generations, + * except for Gfx12! + */ +#define F(name, hi4, lo4, hi12, lo12) FC(name, hi4, lo4, hi12, lo12, true) + +/* A simple macro for fields which stay in the same place on all generations, + * except for Gfx12 and Gfx20. 
+ */ +#define F20(name, hi4, lo4, hi12, lo12, hi20, lo20) \ + static inline void \ + brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t v) \ + { \ + if (devinfo->ver >= 20) \ + brw_inst_set_bits(inst, hi20, lo20, v); \ + else if (devinfo->ver >= 12) \ + brw_inst_set_bits(inst, hi12, lo12, v); \ + else \ + brw_inst_set_bits(inst, hi4, lo4, v); \ + } \ + static inline uint64_t \ + brw_inst_##name(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ + { \ + if (devinfo->ver >= 20) \ + return brw_inst_bits(inst, hi20, lo20); \ + else if (devinfo->ver >= 12) \ + return brw_inst_bits(inst, hi12, lo12); \ + else \ + return brw_inst_bits(inst, hi4, lo4); \ + } + +#define FV20(name, hi4, lo4, hi12, lo12, hi20, lo20) \ + static inline void \ + brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t v) \ + { \ + if (devinfo->ver >= 20) \ + brw_inst_set_bits(inst, hi20, lo20, v & 0x7); \ + else if (devinfo->ver >= 12) \ + brw_inst_set_bits(inst, hi12, lo12, v); \ + else \ + brw_inst_set_bits(inst, hi4, lo4, v); \ + } \ + static inline uint64_t \ + brw_inst_##name(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ + { \ + if (devinfo->ver >= 20) \ + return brw_inst_bits(inst, hi20, lo20) == 0x7 ? 
0xF : \ + brw_inst_bits(inst, hi20, lo20); \ + else if (devinfo->ver >= 12) \ + return brw_inst_bits(inst, hi12, lo12); \ + else \ + return brw_inst_bits(inst, hi4, lo4); \ + } + +#define FD20(name, hi4, lo4, hi12, lo12, hi20, lo20, zero20) \ + static inline void \ + brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t v) \ + { \ + if (devinfo->ver >= 20) { \ + brw_inst_set_bits(inst, hi20, lo20, v >> 1); \ + if (zero20 == -1) \ + assert((v & 1) == 0); \ + else \ + brw_inst_set_bits(inst, zero20, zero20, v & 1); \ + } else if (devinfo->ver >= 12) \ + brw_inst_set_bits(inst, hi12, lo12, v); \ + else \ + brw_inst_set_bits(inst, hi4, lo4, v); \ + } \ + static inline uint64_t \ + brw_inst_##name(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ + { \ + if (devinfo->ver >= 20) \ + return (brw_inst_bits(inst, hi20, lo20) << 1) | \ + (zero20 == -1 ? 0 : \ + brw_inst_bits(inst, zero20, zero20)); \ + else if (devinfo->ver >= 12) \ + return brw_inst_bits(inst, hi12, lo12); \ + else \ + return brw_inst_bits(inst, hi4, lo4); \ + } + +#define BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12, lo12, hi20, lo20) \ + unsigned high, low; \ + if (devinfo->ver >= 20) { \ + high = hi20; low = lo20; \ + } else if (devinfo->ver >= 12) { \ + high = hi12; low = lo12; \ + } else if (devinfo->ver >= 8) { \ + high = hi8; low = lo8; \ + } else if (devinfo->ver >= 7) { \ + high = hi7; low = lo7; \ + } else if (devinfo->ver >= 6) { \ + high = hi6; low = lo6; \ + } else if (devinfo->ver >= 5) { \ + high = hi5; low = lo5; \ + } else if (devinfo->verx10 >= 45) { \ + high = hi45; low = lo45; \ + } else { \ + high = hi4; low = lo4; \ + } \ + assert(((int) high) != -1 && ((int) low) != -1); + +/* A general macro for cases where the field has moved to several different + * bit locations across generations. GCC appears to combine cases where the + * bits are identical, removing some of the inefficiency. 
+ */ +#define FF(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12, lo12, hi20, lo20) \ +static inline void \ +brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t value) \ +{ \ + BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12, lo12, hi20, lo20) \ + brw_inst_set_bits(inst, high, low, value); \ +} \ +static inline uint64_t \ +brw_inst_##name(const struct intel_device_info *devinfo, const brw_inst *inst)\ +{ \ + BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12, lo12, hi20, lo20) \ + return brw_inst_bits(inst, high, low); \ +} + +/* A macro for fields which moved as of Gfx8+. */ +#define F8(name, gfx4_high, gfx4_low, gfx8_high, gfx8_low, \ + gfx12_high, gfx12_low) \ +FF(name, \ + /* 4: */ gfx4_high, gfx4_low, \ + /* 4.5: */ gfx4_high, gfx4_low, \ + /* 5: */ gfx4_high, gfx4_low, \ + /* 6: */ gfx4_high, gfx4_low, \ + /* 7: */ gfx4_high, gfx4_low, \ + /* 8: */ gfx8_high, gfx8_low, \ + /* 12: */ gfx12_high, gfx12_low, \ + /* 20: */ gfx12_high, gfx12_low); + +/* Macro for fields that gained extra discontiguous MSBs in Gfx12 (specified + * by hi12ex-lo12ex). 
+ */ +#define FFDC(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12ex, lo12ex, hi12, lo12, assertions) \ +static inline void \ +brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t value) \ +{ \ + assert(assertions); \ + if (devinfo->ver >= 12) { \ + const unsigned k = hi12 - lo12 + 1; \ + if (hi12ex != -1 && lo12ex != -1) \ + brw_inst_set_bits(inst, hi12ex, lo12ex, value >> k); \ + brw_inst_set_bits(inst, hi12, lo12, value & ((1ull << k) - 1)); \ + } else { \ + BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, -1, -1, -1, -1); \ + brw_inst_set_bits(inst, high, low, value); \ + } \ +} \ +static inline uint64_t \ +brw_inst_##name(const struct intel_device_info *devinfo, const brw_inst *inst)\ +{ \ + assert(assertions); \ + if (devinfo->ver >= 12) { \ + const unsigned k = hi12 - lo12 + 1; \ + return (hi12ex == -1 || lo12ex == -1 ? 0 : \ + brw_inst_bits(inst, hi12ex, lo12ex) << k) | \ + brw_inst_bits(inst, hi12, lo12); \ + } else { \ + BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, -1, -1, -1, -1); \ + return brw_inst_bits(inst, high, low); \ + } \ +} + +#define FD(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12ex, lo12ex, hi12, lo12) \ + FFDC(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12ex, lo12ex, hi12, lo12, true) + +/* Macro for fields that didn't move across generations until Gfx12, and then + * gained extra discontiguous bits. + */ +#define FDC(name, hi4, lo4, hi12ex, lo12ex, hi12, lo12, assertions) \ + FFDC(name, hi4, lo4, hi4, lo4, hi4, lo4, hi4, lo4, \ + hi4, lo4, hi4, lo4, hi12ex, lo12ex, hi12, lo12, assertions) + + +/* Macro for the 2-bit register file field, which on Gfx12+ is stored as the + * variable length combination of an IsImm (hi12) bit and an additional file + * (lo12) bit. 
+ */ +#define FI(name, hi4, lo4, hi8, lo8, hi12, lo12) \ +static inline void \ +brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t value) \ +{ \ + if (devinfo->ver >= 12) { \ + brw_inst_set_bits(inst, hi12, hi12, value >> 1); \ + if ((value >> 1) == 0) \ + brw_inst_set_bits(inst, lo12, lo12, value & 1); \ + } else { \ + BOUNDS(hi4, lo4, hi4, lo4, hi4, lo4, hi4, lo4, \ + hi4, lo4, hi8, lo8, -1, -1, -1, -1); \ + brw_inst_set_bits(inst, high, low, value); \ + } \ +} \ +static inline uint64_t \ +brw_inst_##name(const struct intel_device_info *devinfo, const brw_inst *inst)\ +{ \ + if (devinfo->ver >= 12) { \ + return (brw_inst_bits(inst, hi12, hi12) << 1) | \ + (brw_inst_bits(inst, hi12, hi12) == 0 ? \ + brw_inst_bits(inst, lo12, lo12) : 1); \ + } else { \ + BOUNDS(hi4, lo4, hi4, lo4, hi4, lo4, hi4, lo4, \ + hi4, lo4, hi8, lo8, -1, -1, -1, -1); \ + return brw_inst_bits(inst, high, low); \ + } \ +} + +/* Macro for fields that become a constant in Gfx12+ not actually represented + * in the instruction. 
+ */ +#define FK(name, hi4, lo4, const12) \ +static inline void \ +brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t v) \ +{ \ + if (devinfo->ver >= 12) \ + assert(v == (const12)); \ + else \ + brw_inst_set_bits(inst, hi4, lo4, v); \ +} \ +static inline uint64_t \ +brw_inst_##name(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + if (devinfo->ver >= 12) \ + return (const12); \ + else \ + return brw_inst_bits(inst, hi4, lo4); \ +} + +FV20(src1_vstride, /* 4+ */ 120, 117, /* 12+ */ 119, 116, /* 20+ */ 118, 116) +F(src1_width, /* 4+ */ 116, 114, /* 12+ */ 115, 113) +F(src1_da16_swiz_w, /* 4+ */ 115, 114, /* 12+ */ -1, -1) +F(src1_da16_swiz_z, /* 4+ */ 113, 112, /* 12+ */ -1, -1) +F(src1_hstride, /* 4+ */ 113, 112, /* 12+ */ 97, 96) +F(src1_address_mode, /* 4+ */ 111, 111, /* 12+ */ 112, 112) +/** Src1.SrcMod @{ */ +F(src1_negate, /* 4+ */ 110, 110, /* 12+ */ 121, 121) +F(src1_abs, /* 4+ */ 109, 109, /* 12+ */ 120, 120) +/** @} */ +F8(src1_ia_subreg_nr, /* 4+ */ 108, 106, /* 8+ */ 108, 105, /* 12+ */ 111, 108) +F(src1_da_reg_nr, /* 4+ */ 108, 101, /* 12+ */ 111, 104) +F(src1_da16_subreg_nr, /* 4+ */ 100, 100, /* 12+ */ -1, -1) +FD20(src1_da1_subreg_nr, /* 4+ */ 100, 96, /* 12+ */ 103, 99, /* 20+ */ 103, 99, -1) +F(src1_da16_swiz_y, /* 4+ */ 99, 98, /* 12+ */ -1, -1) +F(src1_da16_swiz_x, /* 4+ */ 97, 96, /* 12+ */ -1, -1) +F8(src1_reg_hw_type, /* 4+ */ 46, 44, /* 8+ */ 94, 91, /* 12+ */ 91, 88) +FI(src1_reg_file, /* 4+ */ 43, 42, /* 8+ */ 90, 89, /* 12+ */ 47, 98) +F(src1_is_imm, /* 4+ */ -1, -1, /* 12+ */ 47, 47) +FV20(src0_vstride, /* 4+ */ 88, 85, /* 12+ */ 87, 84, /* 20+ */ 86, 84) +F(src0_width, /* 4+ */ 84, 82, /* 12+ */ 83, 81) +F(src0_da16_swiz_w, /* 4+ */ 83, 82, /* 12+ */ -1, -1) +F(src0_da16_swiz_z, /* 4+ */ 81, 80, /* 12+ */ -1, -1) +F(src0_hstride, /* 4+ */ 81, 80, /* 12+ */ 65, 64) +F(src0_address_mode, /* 4+ */ 79, 79, /* 12+ */ 80, 80) +/** Src0.SrcMod @{ */ +F(src0_negate, /* 4+ */ 78, 
78, /* 12+ */ 45, 45) +F(src0_abs, /* 4+ */ 77, 77, /* 12+ */ 44, 44) +/** @} */ +F8(src0_ia_subreg_nr, /* 4+ */ 76, 74, /* 8+ */ 76, 73, /* 12+ */ 79, 76) +F(src0_da_reg_nr, /* 4+ */ 76, 69, /* 12+ */ 79, 72) +F(src0_da16_subreg_nr, /* 4+ */ 68, 68, /* 12+ */ -1, -1) +FD20(src0_da1_subreg_nr, /* 4+ */ 68, 64, /* 12+ */ 71, 67, /* 20+ */ 71, 67, 87) +F(src0_da16_swiz_y, /* 4+ */ 67, 66, /* 12+ */ -1, -1) +F(src0_da16_swiz_x, /* 4+ */ 65, 64, /* 12+ */ -1, -1) +F(dst_address_mode, /* 4+ */ 63, 63, /* 12+ */ 35, 35) +F(dst_hstride, /* 4+ */ 62, 61, /* 12+ */ 49, 48) +F8(dst_ia_subreg_nr, /* 4+ */ 60, 58, /* 8+ */ 60, 57, /* 12+ */ 63, 60) +F(dst_da_reg_nr, /* 4+ */ 60, 53, /* 12+ */ 63, 56) +F(dst_da16_subreg_nr, /* 4+ */ 52, 52, /* 12+ */ -1, -1) +FD20(dst_da1_subreg_nr, /* 4+ */ 52, 48, /* 12+ */ 55, 51, /* 20+ */ 55, 51, 33) +F(da16_writemask, /* 4+ */ 51, 48, /* 12+ */ -1, -1) /* Dst.ChanEn */ +F8(src0_reg_hw_type, /* 4+ */ 41, 39, /* 8+ */ 46, 43, /* 12+ */ 43, 40) +FI(src0_reg_file, /* 4+ */ 38, 37, /* 8+ */ 42, 41, /* 12+ */ 46, 66) +F(src0_is_imm, /* 4+ */ -1, -1, /* 12+ */ 46, 46) +F8(dst_reg_hw_type, /* 4+ */ 36, 34, /* 8+ */ 40, 37, /* 12+ */ 39, 36) +F8(dst_reg_file, /* 4+ */ 33, 32, /* 8+ */ 36, 35, /* 12+ */ 50, 50) +F8(mask_control, /* 4+ */ 9, 9, /* 8+ */ 34, 34, /* 12+ */ 31, 31) +FF(flag_reg_nr, + /* 4-6: doesn't exist */ -1, -1, -1, -1, -1, -1, -1, -1, + /* 7: */ 90, 90, + /* 8: */ 33, 33, + /* 12: */ 23, 23, + /* 20: */ 23, 22) +FF(flag_subreg_nr, + /* 4-7: */ 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, + /* 8: */ 32, 32, + /* 12: */ 22, 22, + /* 20: */ 21, 21) +F(saturate, /* 4+ */ 31, 31, /* 12+ */ 34, 34) +F(debug_control, /* 4+ */ 30, 30, /* 12+ */ 30, 30) +F(cmpt_control, /* 4+ */ 29, 29, /* 12+ */ 29, 29) +FC(branch_control, /* 4+ */ 28, 28, /* 12+ */ 33, 33, devinfo->ver >= 8) +FC(acc_wr_control, /* 4+ */ 28, 28, /* 12+ */ 33, 33, devinfo->ver >= 6 && devinfo->ver < 20) +FC(mask_control_ex, /* 4+ */ 28, 28, /* 12+ */ -1, -1, devinfo->verx10 == 
45 || + devinfo->ver == 5) +F(cond_modifier, /* 4+ */ 27, 24, /* 12+ */ 95, 92) +FC(math_function, /* 4+ */ 27, 24, /* 12+ */ 95, 92, devinfo->ver >= 6) +F20(exec_size, /* 4+ */ 23, 21, /* 12+ */ 18, 16, /* 20+ */ 20, 18) +F(pred_inv, /* 4+ */ 20, 20, /* 12+ */ 28, 28) +F20(pred_control, /* 4+ */ 19, 16, /* 12+ */ 27, 24, /* 20+ */ 27, 26) +F(thread_control, /* 4+ */ 15, 14, /* 12+ */ -1, -1) +F(atomic_control, /* 4+ */ -1, -1, /* 12+ */ 32, 32) +F20(qtr_control, /* 4+ */ 13, 12, /* 12+ */ 21, 20, /* 20+ */ 25, 24) +FF(nib_control, + /* 4-6: doesn't exist */ -1, -1, -1, -1, -1, -1, -1, -1, + /* 7: */ 47, 47, + /* 8: */ 11, 11, + /* 12: */ 19, 19, + /* 20: */ -1, -1) +F8(no_dd_check, /* 4+ */ 11, 11, /* 8+ */ 10, 10, /* 12+ */ -1, -1) +F8(no_dd_clear, /* 4+ */ 10, 10, /* 8+ */ 9, 9, /* 12+ */ -1, -1) +F20(swsb, /* 4+ */ -1, -1, /* 12+ */ 15, 8, /* 20+ */ 17, 8) +FK(access_mode, /* 4+ */ 8, 8, /* 12+ */ BRW_ALIGN_1) +/* Bit 7 is Reserved (for future Opcode expansion) */ +F(hw_opcode, /* 4+ */ 6, 0, /* 12+ */ 6, 0) + +/** + * Three-source instructions: + * @{ + */ +F(3src_src2_reg_nr, /* 4+ */ 125, 118, /* 12+ */ 127, 120) /* same in align1 */ +F(3src_a16_src2_subreg_nr, /* 4+ */ 117, 115, /* 12+ */ -1, -1) /* Extra discontiguous bit on CHV? */ +F(3src_a16_src2_swizzle, /* 4+ */ 114, 107, /* 12+ */ -1, -1) +F(3src_a16_src2_rep_ctrl, /* 4+ */ 106, 106, /* 12+ */ -1, -1) +F(3src_src1_reg_nr, /* 4+ */ 104, 97, /* 12+ */ 111, 104) /* same in align1 */ +F(3src_a16_src1_subreg_nr, /* 4+ */ 96, 94, /* 12+ */ -1, -1) /* Extra discontiguous bit on CHV? */ +F(3src_a16_src1_swizzle, /* 4+ */ 93, 86, /* 12+ */ -1, -1) +F(3src_a16_src1_rep_ctrl, /* 4+ */ 85, 85, /* 12+ */ -1, -1) +F(3src_src0_reg_nr, /* 4+ */ 83, 76, /* 12+ */ 79, 72) /* same in align1 */ +F(3src_a16_src0_subreg_nr, /* 4+ */ 75, 73, /* 12+ */ -1, -1) /* Extra discontiguous bit on CHV? 
*/ +F(3src_a16_src0_swizzle, /* 4+ */ 72, 65, /* 12+ */ -1, -1) +F(3src_a16_src0_rep_ctrl, /* 4+ */ 64, 64, /* 12+ */ -1, -1) +F(3src_dst_reg_nr, /* 4+ */ 63, 56, /* 12+ */ 63, 56) /* same in align1 */ +F(3src_a16_dst_subreg_nr, /* 4+ */ 55, 53, /* 12+ */ -1, -1) +F(3src_a16_dst_writemask, /* 4+ */ 52, 49, /* 12+ */ -1, -1) +F8(3src_a16_nib_ctrl, /* 4+ */ 47, 47, /* 8+ */ 11, 11, /* 12+ */ -1, -1) /* only exists on IVB+ */ +F8(3src_a16_dst_hw_type, /* 4+ */ 45, 44, /* 8+ */ 48, 46, /* 12+ */ -1, -1) /* only exists on IVB+ */ +F8(3src_a16_src_hw_type, /* 4+ */ 43, 42, /* 8+ */ 45, 43, /* 12+ */ -1, -1) +F8(3src_src2_negate, /* 4+ */ 41, 41, /* 8+ */ 42, 42, /* 12+ */ 85, 85) +F8(3src_src2_abs, /* 4+ */ 40, 40, /* 8+ */ 41, 41, /* 12+ */ 84, 84) +F8(3src_src1_negate, /* 4+ */ 39, 39, /* 8+ */ 40, 40, /* 12+ */ 87, 87) +F8(3src_src1_abs, /* 4+ */ 38, 38, /* 8+ */ 39, 39, /* 12+ */ 86, 86) +F8(3src_src0_negate, /* 4+ */ 37, 37, /* 8+ */ 38, 38, /* 12+ */ 45, 45) +F8(3src_src0_abs, /* 4+ */ 36, 36, /* 8+ */ 37, 37, /* 12+ */ 44, 44) +F8(3src_a16_src1_type, /* 4+ */ -1, -1, /* 8+ */ 36, 36, /* 12+ */ -1, -1) +F8(3src_a16_src2_type, /* 4+ */ -1, -1, /* 8+ */ 35, 35, /* 12+ */ -1, -1) +F8(3src_a16_flag_reg_nr, /* 4+ */ 34, 34, /* 8+ */ 33, 33, /* 12+ */ -1, -1) +F8(3src_a16_flag_subreg_nr, /* 4+ */ 33, 33, /* 8+ */ 32, 32, /* 12+ */ -1, -1) +FF(3src_a16_dst_reg_file, + /* 4-5: doesn't exist - no 3-source instructions */ -1, -1, -1, -1, -1, -1, + /* 6: */ 32, 32, + /* 7-8: doesn't exist - no MRFs */ -1, -1, -1, -1, + /* 12: */ -1, -1, + /* 20: */ -1, -1) +F(3src_saturate, /* 4+ */ 31, 31, /* 12+ */ 34, 34) +F(3src_debug_control, /* 4+ */ 30, 30, /* 12+ */ 30, 30) +F(3src_cmpt_control, /* 4+ */ 29, 29, /* 12+ */ 29, 29) +FC(3src_acc_wr_control, /* 4+ */ 28, 28, /* 12+ */ 33, 33, devinfo->ver < 20) +F(3src_cond_modifier, /* 4+ */ 27, 24, /* 12+ */ 95, 92) +F(3src_exec_size, /* 4+ */ 23, 21, /* 12+ */ 18, 16) +F(3src_pred_inv, /* 4+ */ 20, 20, /* 12+ */ 28, 28) 
+F20(3src_pred_control, /* 4+ */ 19, 16, /* 12+ */ 27, 24, /* 20+ */ 27, 26) +F(3src_thread_control, /* 4+ */ 15, 14, /* 12+ */ -1, -1) +F(3src_atomic_control, /* 4+ */ -1, -1, /* 12+ */ 32, 32) +F20(3src_qtr_control, /* 4+ */ 13, 12, /* 12+ */ 21, 20, /* 20+ */ 25, 24) +F8(3src_no_dd_check, /* 4+ */ 11, 11, /* 8+ */ 10, 10, /* 12+ */ -1, -1) +F8(3src_no_dd_clear, /* 4+ */ 10, 10, /* 8+ */ 9, 9, /* 12+ */ -1, -1) +F8(3src_mask_control, /* 4+ */ 9, 9, /* 8+ */ 34, 34, /* 12+ */ 31, 31) +FK(3src_access_mode, /* 4+ */ 8, 8, /* 12+ */ BRW_ALIGN_1) +F(3src_swsb, /* 4+ */ -1, -1, /* 12+ */ 15, 8) +/* Bit 7 is Reserved (for future Opcode expansion) */ +F(3src_hw_opcode, /* 4+ */ 6, 0, /* 12+ */ 6, 0) +/** @} */ + +#define REG_TYPE(reg) \ +static inline void \ +brw_inst_set_3src_a16_##reg##_type(const struct intel_device_info *devinfo, \ + brw_inst *inst, enum brw_reg_type type) \ +{ \ + unsigned hw_type = brw_reg_type_to_a16_hw_3src_type(devinfo, type); \ + brw_inst_set_3src_a16_##reg##_hw_type(devinfo, inst, hw_type); \ +} \ + \ +static inline enum brw_reg_type \ +brw_inst_3src_a16_##reg##_type(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + unsigned hw_type = brw_inst_3src_a16_##reg##_hw_type(devinfo, inst); \ + return brw_a16_hw_3src_type_to_reg_type(devinfo, hw_type); \ +} + +REG_TYPE(dst) +REG_TYPE(src) +#undef REG_TYPE + +/** + * Three-source align1 instructions: + * @{ + */ +/* Reserved 127:126 */ +/* src2_reg_nr same in align16 */ +FD20(3src_a1_src2_subreg_nr,/* 4+ */ 117, 113, /* 12+ */ 119, 115, /* 20+ */ 119, 115, -1) +FC(3src_a1_src2_hstride, /* 4+ */ 112, 111, /* 12+ */ 113, 112, devinfo->ver >= 10) +/* Reserved 110:109. 
src2 vstride is an implied parameter */ +FC(3src_a1_src2_hw_type, /* 4+ */ 108, 106, /* 12+ */ 82, 80, devinfo->ver >= 10) +/* Reserved 105 */ +/* src1_reg_nr same in align16 */ +FD20(3src_a1_src1_subreg_nr, /* 4+ */ 96, 92, /* 12+ */ 103, 99, /* 20+ */ 103, 99, -1) +FC(3src_a1_src1_hstride, /* 4+ */ 91, 90, /* 12+ */ 97, 96, devinfo->ver >= 10) +FDC(3src_a1_src1_vstride, /* 4+ */ 89, 88, /* 12+ */ 91, 91, 83, 83, devinfo->ver >= 10) +FC(3src_a1_src1_hw_type, /* 4+ */ 87, 85, /* 12+ */ 90, 88, devinfo->ver >= 10) +/* Reserved 84 */ +/* src0_reg_nr same in align16 */ +FD20(3src_a1_src0_subreg_nr, /* 4+ */ 75, 71, /* 12+ */ 71, 67, /* 20+ */ 71, 67, -1) +FC(3src_a1_src0_hstride, /* 4+ */ 70, 69, /* 12+ */ 65, 64, devinfo->ver >= 10) +FDC(3src_a1_src0_vstride, /* 4+ */ 68, 67, /* 12+ */ 43, 43, 35, 35, devinfo->ver >= 10) +FC(3src_a1_src0_hw_type, /* 4+ */ 66, 64, /* 12+ */ 42, 40, devinfo->ver >= 10) +/* dst_reg_nr same in align16 */ +FC(3src_a1_dst_subreg_nr, /* 4+ */ 55, 54, /* 12+ */ 55, 54, devinfo->ver >= 10) +FC(3src_a1_special_acc, /* 4+ */ 55, 52, /* 12+ */ 54, 51, devinfo->ver >= 10) /* aliases dst_subreg_nr */ +/* Reserved 51:50 */ +FC(3src_a1_dst_hstride, /* 4+ */ 49, 49, /* 12+ */ 48, 48, devinfo->ver >= 10) +FC(3src_a1_dst_hw_type, /* 4+ */ 48, 46, /* 12+ */ 38, 36, devinfo->ver >= 10) +FI(3src_a1_src2_reg_file, /* 4+ */ -1, -1, /* 8+ */ 45, 45, /* 12+ */ 47, 114) +FC(3src_a1_src1_reg_file, /* 4+ */ 44, 44, /* 12+ */ 98, 98, devinfo->ver >= 10) +FI(3src_a1_src0_reg_file, /* 4+ */ -1, -1, /* 8+ */ 43, 43, /* 12+ */ 46, 66) + +F(3src_a1_src2_is_imm, /* 4+ */ -1, -1, /* 12+ */ 47, 47) +F(3src_a1_src0_is_imm, /* 4+ */ -1, -1, /* 12+ */ 46, 46) + +/* Source Modifier fields same in align16 */ +FC(3src_a1_dst_reg_file, /* 4+ */ 36, 36, /* 12+ */ 50, 50, devinfo->ver >= 10) +FC(3src_a1_exec_type, /* 4+ */ 35, 35, /* 12+ */ 39, 39, devinfo->ver >= 10) +/* Fields below this same in align16 */ +/** @} */ + +#define REG_TYPE(reg) \ +static inline void \ 
+brw_inst_set_3src_a1_##reg##_type(const struct intel_device_info *devinfo, \ + brw_inst *inst, enum brw_reg_type type) \ +{ \ + UNUSED enum gfx10_align1_3src_exec_type exec_type = \ + (enum gfx10_align1_3src_exec_type) brw_inst_3src_a1_exec_type(devinfo, \ + inst); \ + if (brw_reg_type_is_floating_point(type)) { \ + assert(exec_type == BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT); \ + } else { \ + assert(exec_type == BRW_ALIGN1_3SRC_EXEC_TYPE_INT); \ + } \ + unsigned hw_type = brw_reg_type_to_a1_hw_3src_type(devinfo, type); \ + brw_inst_set_3src_a1_##reg##_hw_type(devinfo, inst, hw_type); \ +} \ + \ +static inline enum brw_reg_type \ +brw_inst_3src_a1_##reg##_type(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + enum gfx10_align1_3src_exec_type exec_type = \ + (enum gfx10_align1_3src_exec_type) brw_inst_3src_a1_exec_type(devinfo, \ + inst); \ + unsigned hw_type = brw_inst_3src_a1_##reg##_hw_type(devinfo, inst); \ + return brw_a1_hw_3src_type_to_reg_type(devinfo, hw_type, exec_type); \ +} + +REG_TYPE(dst) +REG_TYPE(src0) +REG_TYPE(src1) +REG_TYPE(src2) +#undef REG_TYPE + +/** + * Three-source align1 instruction immediates: + * @{ + */ +static inline uint16_t +brw_inst_3src_a1_src0_imm(ASSERTED const struct intel_device_info *devinfo, + const brw_inst *insn) +{ + assert(devinfo->ver >= 10); + if (devinfo->ver >= 12) + return brw_inst_bits(insn, 79, 64); + else + return brw_inst_bits(insn, 82, 67); +} + +static inline uint16_t +brw_inst_3src_a1_src2_imm(ASSERTED const struct intel_device_info *devinfo, + const brw_inst *insn) +{ + assert(devinfo->ver >= 10); + if (devinfo->ver >= 12) + return brw_inst_bits(insn, 127, 112); + else + return brw_inst_bits(insn, 124, 109); +} + +static inline void +brw_inst_set_3src_a1_src0_imm(ASSERTED const struct intel_device_info *devinfo, + brw_inst *insn, uint16_t value) +{ + assert(devinfo->ver >= 10); + if (devinfo->ver >= 12) + brw_inst_set_bits(insn, 79, 64, value); + else + brw_inst_set_bits(insn, 82, 67, 
value); +} + +static inline void +brw_inst_set_3src_a1_src2_imm(ASSERTED const struct intel_device_info *devinfo, + brw_inst *insn, uint16_t value) +{ + assert(devinfo->ver >= 10); + if (devinfo->ver >= 12) + brw_inst_set_bits(insn, 127, 112, value); + else + brw_inst_set_bits(insn, 124, 109, value); +} +/** @} */ + +/** + * Three-source systolic instructions: + * @{ + */ +F(dpas_3src_src2_reg_nr, /* 4+ */ -1, -1, /* 12+ */ 127, 120) +F(dpas_3src_src2_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 119, 115) +F(dpas_3src_src2_reg_file, /* 4+ */ -1, -1, /* 12+ */ 114, 114) +F(dpas_3src_src1_reg_nr, /* 4+ */ -1, -1, /* 12+ */ 111, 104) +F(dpas_3src_src1_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 103, 99) +F(dpas_3src_src1_reg_file, /* 4+ */ -1, -1, /* 12+ */ 98, 98) +F(dpas_3src_src1_hw_type, /* 4+ */ -1, -1, /* 12+ */ 90, 88) +F(dpas_3src_src1_subbyte, /* 4+ */ -1, -1, /* 12+ */ 87, 86) +F(dpas_3src_src2_subbyte, /* 4+ */ -1, -1, /* 12+ */ 85, 84) +F(dpas_3src_src2_hw_type, /* 4+ */ -1, -1, /* 12+ */ 82, 80) +F(dpas_3src_src0_reg_nr, /* 4+ */ -1, -1, /* 12+ */ 79, 72) +F(dpas_3src_src0_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 71, 67) +F(dpas_3src_src0_reg_file, /* 4+ */ -1, -1, /* 12+ */ 66, 66) +F(dpas_3src_dst_reg_nr, /* 4+ */ -1, -1, /* 12+ */ 63, 56) +F(dpas_3src_dst_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 55, 51) +F(dpas_3src_dst_reg_file, /* 4+ */ -1, -1, /* 12+ */ 50, 50) +F(dpas_3src_sdepth, /* 4+ */ -1, -1, /* 12+ */ 49, 48) +F(dpas_3src_rcount, /* 4+ */ -1, -1, /* 12+ */ 45, 43) +F(dpas_3src_src0_hw_type, /* 4+ */ -1, -1, /* 12+ */ 42, 40) +F(dpas_3src_exec_type, /* 4+ */ -1, -1, /* 12+ */ 39, 39) +F(dpas_3src_dst_hw_type, /* 4+ */ -1, -1, /* 12+ */ 38, 36) +/** @} */ + +#define REG_TYPE(reg) \ +static inline void \ +brw_inst_set_dpas_3src_##reg##_type(const struct intel_device_info *devinfo, \ + brw_inst *inst, enum brw_reg_type type) \ +{ \ + UNUSED enum gfx10_align1_3src_exec_type exec_type = \ + (enum gfx10_align1_3src_exec_type) brw_inst_dpas_3src_exec_type(devinfo,\ + 
inst); \ + if (brw_reg_type_is_floating_point(type)) { \ + assert(exec_type == BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT); \ + } else { \ + assert(exec_type == BRW_ALIGN1_3SRC_EXEC_TYPE_INT); \ + } \ + unsigned hw_type = brw_reg_type_to_a1_hw_3src_type(devinfo, type); \ + brw_inst_set_dpas_3src_##reg##_hw_type(devinfo, inst, hw_type); \ +} \ + \ +static inline enum brw_reg_type \ +brw_inst_dpas_3src_##reg##_type(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + enum gfx10_align1_3src_exec_type exec_type = \ + (enum gfx10_align1_3src_exec_type) brw_inst_dpas_3src_exec_type(devinfo,\ + inst); \ + unsigned hw_type = brw_inst_dpas_3src_##reg##_hw_type(devinfo, inst); \ + return brw_a1_hw_3src_type_to_reg_type(devinfo, hw_type, exec_type); \ +} + +REG_TYPE(dst) +REG_TYPE(src0) +REG_TYPE(src1) +REG_TYPE(src2) +#undef REG_TYPE + +/** + * Flow control instruction bits: + * @{ + */ +static inline void +brw_inst_set_uip(const struct intel_device_info *devinfo, + brw_inst *inst, int32_t value) +{ + assert(devinfo->ver >= 6); + + if (devinfo->ver >= 12) + brw_inst_set_src1_is_imm(devinfo, inst, 1); + + if (devinfo->ver >= 8) { + brw_inst_set_bits(inst, 95, 64, (uint32_t)value); + } else { + assert(value <= (1 << 16) - 1); + assert(value > -(1 << 16)); + brw_inst_set_bits(inst, 127, 112, (uint16_t)value); + } +} + +static inline int32_t +brw_inst_uip(const struct intel_device_info *devinfo, const brw_inst *inst) +{ + assert(devinfo->ver >= 6); + + if (devinfo->ver >= 8) { + return brw_inst_bits(inst, 95, 64); + } else { + return (int16_t)brw_inst_bits(inst, 127, 112); + } +} + +static inline void +brw_inst_set_jip(const struct intel_device_info *devinfo, + brw_inst *inst, int32_t value) +{ + assert(devinfo->ver >= 6); + + if (devinfo->ver >= 12) + brw_inst_set_src0_is_imm(devinfo, inst, 1); + + if (devinfo->ver >= 8) { + brw_inst_set_bits(inst, 127, 96, (uint32_t)value); + } else { + assert(value <= (1 << 15) - 1); + assert(value >= -(1 << 15)); + 
brw_inst_set_bits(inst, 111, 96, (uint16_t)value); + } +} + +static inline int32_t +brw_inst_jip(const struct intel_device_info *devinfo, const brw_inst *inst) +{ + assert(devinfo->ver >= 6); + + if (devinfo->ver >= 8) { + return brw_inst_bits(inst, 127, 96); + } else { + return (int16_t)brw_inst_bits(inst, 111, 96); + } +} + +/** Like FC, but using int16_t to handle negative jump targets. */ +#define FJ(name, high, low, assertions) \ +static inline void \ +brw_inst_set_##name(const struct intel_device_info *devinfo, brw_inst *inst, int16_t v) \ +{ \ + assert(assertions); \ + (void) devinfo; \ + brw_inst_set_bits(inst, high, low, (uint16_t) v); \ +} \ +static inline int16_t \ +brw_inst_##name(const struct intel_device_info *devinfo, const brw_inst *inst)\ +{ \ + assert(assertions); \ + (void) devinfo; \ + return brw_inst_bits(inst, high, low); \ +} + +FJ(gfx6_jump_count, 63, 48, devinfo->ver == 6) +FJ(gfx4_jump_count, 111, 96, devinfo->ver < 6) +FC(gfx4_pop_count, /* 4+ */ 115, 112, /* 12+ */ -1, -1, devinfo->ver < 6) +/** @} */ + +/** + * SEND instructions: + * @{ + */ +FC(send_ex_desc_ia_subreg_nr, /* 4+ */ 82, 80, /* 12+ */ 42, 40, devinfo->ver >= 9) +FC(send_src0_address_mode, /* 4+ */ 79, 79, /* 12+ */ -1, -1, devinfo->ver >= 9) +FC(send_sel_reg32_desc, /* 4+ */ 77, 77, /* 12+ */ 48, 48, devinfo->ver >= 9) +FC(send_sel_reg32_ex_desc, /* 4+ */ 61, 61, /* 12+ */ 49, 49, devinfo->ver >= 9) +F8(send_src0_reg_file, /* 4+ */ 38, 37, /* 8+ */ 42, 41, /* 12+ */ 66, 66) +FC(send_src1_reg_nr, /* 4+ */ 51, 44, /* 12+ */ 111, 104, devinfo->ver >= 9) +FC(send_src1_len, /* 4+ */ -1, -1, /* 12+ */ 103, 99, devinfo->verx10 >= 125) +FC(send_src1_reg_file, /* 4+ */ 36, 36, /* 12+ */ 98, 98, devinfo->ver >= 9) +FC(send_dst_reg_file, /* 4+ */ 35, 35, /* 12+ */ 50, 50, devinfo->ver >= 9) +FC(send_ex_bso, /* 4+ */ -1, -1, /* 12+ */ 39, 39, devinfo->verx10 >= 125) +/** @} */ + +/* Message descriptor bits */ +#define MD(x) ((x) + 96) +#define MD12(x) ((x) >= 30 ? 
(x) - 30 + 122 : \ + (x) >= 25 ? (x) - 25 + 67 : \ + (x) >= 20 ? (x) - 20 + 51 : \ + (x) >= 11 ? (x) - 11 + 113 : \ + (x) - 0 + 81) + +/** + * Set the SEND(C) message descriptor immediate. + * + * This doesn't include the SFID nor the EOT field that were considered to be + * part of the message descriptor by ancient versions of the BSpec, because + * they are present in the instruction even if the message descriptor is + * provided indirectly in the address register, so we want to specify them + * separately. + */ +static inline void +brw_inst_set_send_desc(const struct intel_device_info *devinfo, + brw_inst *inst, uint32_t value) +{ + if (devinfo->ver >= 12) { + brw_inst_set_bits(inst, 123, 122, GET_BITS(value, 31, 30)); + brw_inst_set_bits(inst, 71, 67, GET_BITS(value, 29, 25)); + brw_inst_set_bits(inst, 55, 51, GET_BITS(value, 24, 20)); + brw_inst_set_bits(inst, 121, 113, GET_BITS(value, 19, 11)); + brw_inst_set_bits(inst, 91, 81, GET_BITS(value, 10, 0)); + } else if (devinfo->ver >= 9) { + brw_inst_set_bits(inst, 126, 96, value); + assert(value >> 31 == 0); + } else if (devinfo->ver >= 5) { + brw_inst_set_bits(inst, 124, 96, value); + assert(value >> 29 == 0); + } else { + brw_inst_set_bits(inst, 119, 96, value); + assert(value >> 24 == 0); + } +} + +/** + * Get the SEND(C) message descriptor immediate. + * + * \sa brw_inst_set_send_desc(). + */ +static inline uint32_t +brw_inst_send_desc(const struct intel_device_info *devinfo, + const brw_inst *inst) +{ + if (devinfo->ver >= 12) { + return (brw_inst_bits(inst, 123, 122) << 30 | + brw_inst_bits(inst, 71, 67) << 25 | + brw_inst_bits(inst, 55, 51) << 20 | + brw_inst_bits(inst, 121, 113) << 11 | + brw_inst_bits(inst, 91, 81)); + } else if (devinfo->ver >= 9) { + return brw_inst_bits(inst, 126, 96); + } else if (devinfo->ver >= 5) { + return brw_inst_bits(inst, 124, 96); + } else { + return brw_inst_bits(inst, 119, 96); + } +} + +/** + * Set the SEND(C) message extended descriptor immediate. 
+ * + * This doesn't include the SFID nor the EOT field that were considered to be + * part of the extended message descriptor by some versions of the BSpec, + * because they are present in the instruction even if the extended message + * descriptor is provided indirectly in a register, so we want to specify them + * separately. + */ +static inline void +brw_inst_set_send_ex_desc(const struct intel_device_info *devinfo, + brw_inst *inst, uint32_t value) +{ + if (devinfo->ver >= 12) { + brw_inst_set_bits(inst, 127, 124, GET_BITS(value, 31, 28)); + brw_inst_set_bits(inst, 97, 96, GET_BITS(value, 27, 26)); + brw_inst_set_bits(inst, 65, 64, GET_BITS(value, 25, 24)); + brw_inst_set_bits(inst, 47, 35, GET_BITS(value, 23, 11)); + brw_inst_set_bits(inst, 103, 99, GET_BITS(value, 10, 6)); + assert(GET_BITS(value, 5, 0) == 0); + } else { + assert(devinfo->ver >= 9); + brw_inst_set_bits(inst, 94, 91, GET_BITS(value, 31, 28)); + brw_inst_set_bits(inst, 88, 85, GET_BITS(value, 27, 24)); + brw_inst_set_bits(inst, 83, 80, GET_BITS(value, 23, 20)); + brw_inst_set_bits(inst, 67, 64, GET_BITS(value, 19, 16)); + assert(GET_BITS(value, 15, 0) == 0); + } +} + +/** + * Set the SENDS(C) message extended descriptor immediate. + * + * This doesn't include the SFID nor the EOT field that were considered to be + * part of the extended message descriptor by some versions of the BSpec, + * because they are present in the instruction even if the extended message + * descriptor is provided indirectly in a register, so we want to specify them + * separately. 
+ */ +static inline void +brw_inst_set_sends_ex_desc(const struct intel_device_info *devinfo, + brw_inst *inst, uint32_t value) +{ + if (devinfo->ver >= 12) { + brw_inst_set_send_ex_desc(devinfo, inst, value); + } else { + brw_inst_set_bits(inst, 95, 80, GET_BITS(value, 31, 16)); + assert(GET_BITS(value, 15, 10) == 0); + brw_inst_set_bits(inst, 67, 64, GET_BITS(value, 9, 6)); + assert(GET_BITS(value, 5, 0) == 0); + } +} + +/** + * Get the SEND(C) message extended descriptor immediate. + * + * \sa brw_inst_set_send_ex_desc(). + */ +static inline uint32_t +brw_inst_send_ex_desc(const struct intel_device_info *devinfo, + const brw_inst *inst) +{ + if (devinfo->ver >= 12) { + return (brw_inst_bits(inst, 127, 124) << 28 | + brw_inst_bits(inst, 97, 96) << 26 | + brw_inst_bits(inst, 65, 64) << 24 | + brw_inst_bits(inst, 47, 35) << 11 | + brw_inst_bits(inst, 103, 99) << 6); + } else { + assert(devinfo->ver >= 9); + return (brw_inst_bits(inst, 94, 91) << 28 | + brw_inst_bits(inst, 88, 85) << 24 | + brw_inst_bits(inst, 83, 80) << 20 | + brw_inst_bits(inst, 67, 64) << 16); + } +} + +/** + * Get the SENDS(C) message extended descriptor immediate. + * + * \sa brw_inst_set_send_ex_desc(). 
+ */ +static inline uint32_t +brw_inst_sends_ex_desc(const struct intel_device_info *devinfo, + const brw_inst *inst) +{ + if (devinfo->ver >= 12) { + return brw_inst_send_ex_desc(devinfo, inst); + } else { + return (brw_inst_bits(inst, 95, 80) << 16 | + brw_inst_bits(inst, 67, 64) << 6); + } +} + +/** + * Fields for SEND messages: + * @{ + */ +F(eot, /* 4+ */ 127, 127, /* 12+ */ 34, 34) +FF(mlen, + /* 4: */ 119, 116, + /* 4.5: */ 119, 116, + /* 5: */ 124, 121, + /* 6: */ 124, 121, + /* 7: */ 124, 121, + /* 8: */ 124, 121, + /* 12: */ MD12(28), MD12(25), + /* 20: */ MD12(28), MD12(25)); +FF(rlen, + /* 4: */ 115, 112, + /* 4.5: */ 115, 112, + /* 5: */ 120, 116, + /* 6: */ 120, 116, + /* 7: */ 120, 116, + /* 8: */ 120, 116, + /* 12: */ MD12(24), MD12(20), + /* 20: */ MD12(24), MD12(20)); +FF(header_present, + /* 4: doesn't exist */ -1, -1, -1, -1, + /* 5: */ 115, 115, + /* 6: */ 115, 115, + /* 7: */ 115, 115, + /* 8: */ 115, 115, + /* 12: */ MD12(19), MD12(19), + /* 20: */ MD12(19), MD12(19)) +F(gateway_notify, /* 4+ */ MD(16), MD(15), /* 12+ */ -1, -1) +FD(function_control, + /* 4: */ 111, 96, + /* 4.5: */ 111, 96, + /* 5: */ 114, 96, + /* 6: */ 114, 96, + /* 7: */ 114, 96, + /* 8: */ 114, 96, + /* 12: */ MD12(18), MD12(11), MD12(10), MD12(0)) +FF(gateway_subfuncid, + /* 4: */ MD(1), MD(0), + /* 4.5: */ MD(1), MD(0), + /* 5: */ MD(1), MD(0), /* 2:0, but bit 2 is reserved MBZ */ + /* 6: */ MD(2), MD(0), + /* 7: */ MD(2), MD(0), + /* 8: */ MD(2), MD(0), + /* 12: */ MD12(2), MD12(0), + /* 20: */ MD12(2), MD12(0)) +FF(sfid, + /* 4: */ 123, 120, /* called msg_target */ + /* 4.5 */ 123, 120, + /* 5: */ 95, 92, + /* 6: */ 27, 24, + /* 7: */ 27, 24, + /* 8: */ 27, 24, + /* 12: */ 95, 92, + /* 20: */ 95, 92) +FF(null_rt, + /* 4-7: */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + /* 8: */ 80, 80, + /* 12: */ 44, 44, + /* 20: */ 44, 44) /* actually only Gfx11+ */ +FC(base_mrf, /* 4+ */ 27, 24, /* 12+ */ -1, -1, devinfo->ver < 6); +FF(send_rta_index, + /* 4: */ -1, -1, + /* 4.5 */ 
-1, -1, + /* 5: */ -1, -1, + /* 6: */ -1, -1, + /* 7: */ -1, -1, + /* 8: */ -1, -1, + /* 12: */ 38, 36, + /* 20: */ 38, 36) +/** @} */ + +/** + * URB message function control bits: + * @{ + */ +FF(urb_per_slot_offset, + /* 4-6: */ -1, -1, -1, -1, -1, -1, -1, -1, + /* 7: */ MD(16), MD(16), + /* 8: */ MD(17), MD(17), + /* 12: */ MD12(17), MD12(17), + /* 20: */ MD12(17), MD12(17)) +FC(urb_channel_mask_present, /* 4+ */ MD(15), MD(15), /* 12+ */ MD12(15), MD12(15), devinfo->ver >= 8) +FC(urb_complete, /* 4+ */ MD(15), MD(15), /* 12+ */ -1, -1, devinfo->ver < 8) +FC(urb_used, /* 4+ */ MD(14), MD(14), /* 12+ */ -1, -1, devinfo->ver < 7) +FC(urb_allocate, /* 4+ */ MD(13), MD(13), /* 12+ */ -1, -1, devinfo->ver < 7) +FF(urb_swizzle_control, + /* 4: */ MD(11), MD(10), + /* 4.5: */ MD(11), MD(10), + /* 5: */ MD(11), MD(10), + /* 6: */ MD(11), MD(10), + /* 7: */ MD(14), MD(14), + /* 8: */ MD(15), MD(15), + /* 12: */ -1, -1, + /* 20: */ -1, -1) +FD(urb_global_offset, + /* 4: */ MD( 9), MD(4), + /* 4.5: */ MD( 9), MD(4), + /* 5: */ MD( 9), MD(4), + /* 6: */ MD( 9), MD(4), + /* 7: */ MD(13), MD(3), + /* 8: */ MD(14), MD(4), + /* 12: */ MD12(14), MD12(11), MD12(10), MD12(4)) +FF(urb_opcode, + /* 4: */ MD( 3), MD(0), + /* 4.5: */ MD( 3), MD(0), + /* 5: */ MD( 3), MD(0), + /* 6: */ MD( 3), MD(0), + /* 7: */ MD( 2), MD(0), + /* 8: */ MD( 3), MD(0), + /* 12: */ MD12(3), MD12(0), + /* 20: */ MD12(3), MD12(0)) +/** @} */ + +/** + * Gfx4-5 math messages: + * @{ + */ +FC(math_msg_data_type, /* 4+ */ MD(7), MD(7), /* 12+ */ -1, -1, devinfo->ver < 6) +FC(math_msg_saturate, /* 4+ */ MD(6), MD(6), /* 12+ */ -1, -1, devinfo->ver < 6) +FC(math_msg_precision, /* 4+ */ MD(5), MD(5), /* 12+ */ -1, -1, devinfo->ver < 6) +FC(math_msg_signed_int, /* 4+ */ MD(4), MD(4), /* 12+ */ -1, -1, devinfo->ver < 6) +FC(math_msg_function, /* 4+ */ MD(3), MD(0), /* 12+ */ -1, -1, devinfo->ver < 6) +/** @} */ + +/** + * Sampler message function control bits: + * @{ + */ +FF(sampler_simd_mode, + /* 4: doesn't 
exist */ -1, -1, -1, -1, + /* 5: */ MD(17), MD(16), + /* 6: */ MD(17), MD(16), + /* 7: */ MD(18), MD(17), + /* 8: */ MD(18), MD(17), + /* 12: */ MD12(18), MD12(17), + /* 20: */ MD12(18), MD12(17)) +FF(sampler_msg_type, + /* 4: */ MD(15), MD(14), + /* 4.5: */ MD(15), MD(12), + /* 5: */ MD(15), MD(12), + /* 6: */ MD(15), MD(12), + /* 7: */ MD(16), MD(12), + /* 8: */ MD(16), MD(12), + /* 12: */ MD12(16), MD12(12), + /* 20: */ MD12(16), MD12(12)) +FC(sampler_return_format, /* 4+ */ MD(13), MD(12), /* 12+ */ -1, -1, devinfo->verx10 == 40) +FD(sampler, + /* 4: */ MD(11), MD(8), + /* 4.5: */ MD(11), MD(8), + /* 5: */ MD(11), MD(8), + /* 6: */ MD(11), MD(8), + /* 7: */ MD(11), MD(8), + /* 8: */ MD(11), MD(8), + /* 12: */ MD12(11), MD12(11), MD12(10), MD12(8)) +F(binding_table_index, /* 4+ */ MD(7), MD(0), /* 12+ */ MD12(7), MD12(0)) /* also used by other messages */ +/** @} */ + +/** + * Data port message function control bits: + * @{ + */ +FC(dp_category, /* 4+ */ MD(18), MD(18), /* 12+ */ MD12(18), MD12(18), devinfo->ver >= 7) + +/* Gfx4-5 store fields in different bits for read/write messages. 
*/ +FF(dp_read_msg_type, + /* 4: */ MD(13), MD(12), + /* 4.5: */ MD(13), MD(11), + /* 5: */ MD(13), MD(11), + /* 6: */ MD(16), MD(13), + /* 7: */ MD(17), MD(14), + /* 8: */ MD(17), MD(14), + /* 12: */ MD12(17), MD12(14), + /* 20: */ MD12(17), MD12(14)) +FF(dp_write_msg_type, + /* 4: */ MD(14), MD(12), + /* 4.5: */ MD(14), MD(12), + /* 5: */ MD(14), MD(12), + /* 6: */ MD(16), MD(13), + /* 7: */ MD(17), MD(14), + /* 8: */ MD(17), MD(14), + /* 12: */ MD12(17), MD12(14), + /* 20: */ MD12(17), MD12(14)) +FD(dp_read_msg_control, + /* 4: */ MD(11), MD( 8), + /* 4.5: */ MD(10), MD( 8), + /* 5: */ MD(10), MD( 8), + /* 6: */ MD(12), MD( 8), + /* 7: */ MD(13), MD( 8), + /* 8: */ MD(13), MD( 8), + /* 12: */ MD12(13), MD12(11), MD12(10), MD12(8)) +FD(dp_write_msg_control, + /* 4: */ MD(11), MD( 8), + /* 4.5: */ MD(11), MD( 8), + /* 5: */ MD(11), MD( 8), + /* 6: */ MD(12), MD( 8), + /* 7: */ MD(13), MD( 8), + /* 8: */ MD(13), MD( 8), + /* 12: */ MD12(13), MD12(11), MD12(10), MD12(8)) +FC(dp_read_target_cache, /* 4+ */ MD(15), MD(14), /* 12+ */ -1, -1, devinfo->ver < 6); + +FF(dp_write_commit, + /* 4: */ MD(15), MD(15), + /* 4.5: */ MD(15), MD(15), + /* 5: */ MD(15), MD(15), + /* 6: */ MD(17), MD(17), + /* 7+: does not exist */ -1, -1, -1, -1, + /* 12: */ -1, -1, + /* 20: */ -1, -1) + +/* Gfx6+ use the same bit locations for everything. 
*/ +FF(dp_msg_type, + /* 4-5: use dp_read_msg_type or dp_write_msg_type instead */ + -1, -1, -1, -1, -1, -1, + /* 6: */ MD(16), MD(13), + /* 7: */ MD(17), MD(14), + /* 8: */ MD(18), MD(14), + /* 12: */ MD12(18), MD12(14), + /* 20: */ MD12(18), MD12(14)) +FD(dp_msg_control, + /* 4: */ MD(11), MD( 8), + /* 4.5-5: use dp_read_msg_control or dp_write_msg_control */ -1, -1, -1, -1, + /* 6: */ MD(12), MD( 8), + /* 7: */ MD(13), MD( 8), + /* 8: */ MD(13), MD( 8), + /* 12: */ MD12(13), MD12(11), MD12(10), MD12(8)) +/** @} */ + +/** + * Scratch message bits (Gfx7+): + * @{ + */ +FC(scratch_read_write, /* 4+ */ MD(17), MD(17), /* 12+ */ MD12(17), MD12(17), devinfo->ver >= 7) /* 0 = read, 1 = write */ +FC(scratch_type, /* 4+ */ MD(16), MD(16), /* 12+ */ -1, -1, devinfo->ver >= 7) /* 0 = OWord, 1 = DWord */ +FC(scratch_invalidate_after_read, /* 4+ */ MD(15), MD(15), /* 12+ */ MD12(15), MD12(15), devinfo->ver >= 7) +FC(scratch_block_size, /* 4+ */ MD(13), MD(12), /* 12+ */ MD12(13), MD12(12), devinfo->ver >= 7) +FD(scratch_addr_offset, + /* 4: */ -1, -1, + /* 4.5: */ -1, -1, + /* 5: */ -1, -1, + /* 6: */ -1, -1, + /* 7: */ MD(11), MD(0), + /* 8: */ MD(11), MD(0), + /* 12: */ MD12(11), MD12(11), MD12(10), MD12(0)) +/** @} */ + +/** + * Render Target message function control bits: + * @{ + */ +FF(rt_last, + /* 4: */ MD(11), MD(11), + /* 4.5: */ MD(11), MD(11), + /* 5: */ MD(11), MD(11), + /* 6: */ MD(12), MD(12), + /* 7: */ MD(12), MD(12), + /* 8: */ MD(12), MD(12), + /* 12: */ MD12(12), MD12(12), + /* 20: */ MD12(12), MD12(12)) +FC(rt_slot_group, /* 4+ */ MD(11), MD(11), /* 12+ */ MD12(11), MD12(11), devinfo->ver >= 6) +F(rt_message_type, /* 4+ */ MD(10), MD( 8), /* 12+ */ MD12(10), MD12(8)) +/** @} */ + +/** + * Thread Spawn message function control bits: + * @{ + */ +FC(ts_resource_select, /* 4+ */ MD( 4), MD( 4), /* 12+ */ -1, -1, devinfo->ver < 11) +FC(ts_request_type, /* 4+ */ MD( 1), MD( 1), /* 12+ */ -1, -1, devinfo->ver < 11) +F(ts_opcode, /* 4+ */ MD( 0), MD( 0), /* 12+ 
*/ MD12(0), MD12(0)) +/** @} */ + +/** + * Pixel Interpolator message function control bits: + * @{ + */ +F(pi_simd_mode, /* 4+ */ MD(16), MD(16), /* 12+ */ MD12(16), MD12(16)) +F(pi_nopersp, /* 4+ */ MD(14), MD(14), /* 12+ */ MD12(14), MD12(14)) +F(pi_message_type, /* 4+ */ MD(13), MD(12), /* 12+ */ MD12(13), MD12(12)) +F(pi_slot_group, /* 4+ */ MD(11), MD(11), /* 12+ */ MD12(11), MD12(11)) +F(pi_message_data, /* 4+ */ MD(7), MD(0), /* 12+ */ MD12(7), MD12(0)) +/** @} */ + +/** + * Immediates: + * @{ + */ +static inline int +brw_inst_imm_d(const struct intel_device_info *devinfo, const brw_inst *insn) +{ + (void) devinfo; + return brw_inst_bits(insn, 127, 96); +} + +static inline unsigned +brw_inst_imm_ud(const struct intel_device_info *devinfo, const brw_inst *insn) +{ + (void) devinfo; + return brw_inst_bits(insn, 127, 96); +} + +static inline uint64_t +brw_inst_imm_uq(const struct intel_device_info *devinfo, + const brw_inst *insn) +{ + if (devinfo->ver >= 12) { + return brw_inst_bits(insn, 95, 64) << 32 | + brw_inst_bits(insn, 127, 96); + } else { + assert(devinfo->ver >= 8); + return brw_inst_bits(insn, 127, 64); + } +} + +static inline float +brw_inst_imm_f(const struct intel_device_info *devinfo, const brw_inst *insn) +{ + union { + float f; + uint32_t u; + } ft; + (void) devinfo; + ft.u = brw_inst_bits(insn, 127, 96); + return ft.f; +} + +static inline double +brw_inst_imm_df(const struct intel_device_info *devinfo, const brw_inst *insn) +{ + union { + double d; + uint64_t u; + } dt; + dt.u = brw_inst_imm_uq(devinfo, insn); + return dt.d; +} + +static inline void +brw_inst_set_imm_d(const struct intel_device_info *devinfo, + brw_inst *insn, int value) +{ + (void) devinfo; + return brw_inst_set_bits(insn, 127, 96, value); +} + +static inline void +brw_inst_set_imm_ud(const struct intel_device_info *devinfo, + brw_inst *insn, unsigned value) +{ + (void) devinfo; + return brw_inst_set_bits(insn, 127, 96, value); +} + +static inline void 
+brw_inst_set_imm_f(const struct intel_device_info *devinfo, + brw_inst *insn, float value) +{ + union { + float f; + uint32_t u; + } ft; + (void) devinfo; + ft.f = value; + brw_inst_set_bits(insn, 127, 96, ft.u); +} + +static inline void +brw_inst_set_imm_df(const struct intel_device_info *devinfo, + brw_inst *insn, double value) +{ + union { + double d; + uint64_t u; + } dt; + (void) devinfo; + dt.d = value; + + if (devinfo->ver >= 12) { + brw_inst_set_bits(insn, 95, 64, dt.u >> 32); + brw_inst_set_bits(insn, 127, 96, dt.u & 0xFFFFFFFF); + } else { + brw_inst_set_bits(insn, 127, 64, dt.u); + } +} + +static inline void +brw_inst_set_imm_uq(const struct intel_device_info *devinfo, + brw_inst *insn, uint64_t value) +{ + (void) devinfo; + if (devinfo->ver >= 12) { + brw_inst_set_bits(insn, 95, 64, value >> 32); + brw_inst_set_bits(insn, 127, 96, value & 0xFFFFFFFF); + } else { + brw_inst_set_bits(insn, 127, 64, value); + } +} + +/** @} */ + +#define REG_TYPE(reg) \ +static inline void \ +brw_inst_set_##reg##_file_type(const struct intel_device_info *devinfo, \ + brw_inst *inst, enum brw_reg_file file, \ + enum brw_reg_type type) \ +{ \ + assert(file <= BRW_IMMEDIATE_VALUE); \ + unsigned hw_type = brw_reg_type_to_hw_type(devinfo, file, type); \ + brw_inst_set_##reg##_reg_file(devinfo, inst, file); \ + brw_inst_set_##reg##_reg_hw_type(devinfo, inst, hw_type); \ +} \ + \ +static inline enum brw_reg_type \ +brw_inst_##reg##_type(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + unsigned file = __builtin_strcmp("dst", #reg) == 0 ? 
\ + (unsigned) BRW_GENERAL_REGISTER_FILE : \ + brw_inst_##reg##_reg_file(devinfo, inst); \ + unsigned hw_type = brw_inst_##reg##_reg_hw_type(devinfo, inst); \ + return brw_hw_type_to_reg_type(devinfo, (enum brw_reg_file)file, hw_type); \ +} + +REG_TYPE(dst) +REG_TYPE(src0) +REG_TYPE(src1) +#undef REG_TYPE + + +/* The AddrImm fields are split into two discontiguous sections on Gfx8+ */ +#define BRW_IA1_ADDR_IMM(reg, g4_high, g4_low, g8_nine, g8_high, g8_low, \ + g12_high, g12_low, g20_high, g20_low, g20_zero) \ +static inline void \ +brw_inst_set_##reg##_ia1_addr_imm(const struct \ + intel_device_info *devinfo, \ + brw_inst *inst, \ + unsigned value) \ +{ \ + if (devinfo->ver >= 20) { \ + assert((value & ~0x7ff) == 0); \ + brw_inst_set_bits(inst, g20_high, g20_low, value >> 1); \ + if (g20_zero == -1) \ + assert((value & 1) == 0); \ + else \ + brw_inst_set_bits(inst, g20_zero, g20_zero, value & 1); \ + } else if (devinfo->ver >= 12) { \ + assert((value & ~0x3ff) == 0); \ + brw_inst_set_bits(inst, g12_high, g12_low, value); \ + } else if (devinfo->ver >= 8) { \ + assert((value & ~0x3ff) == 0); \ + brw_inst_set_bits(inst, g8_high, g8_low, value & 0x1ff); \ + brw_inst_set_bits(inst, g8_nine, g8_nine, value >> 9); \ + } else { \ + assert((value & ~0x3ff) == 0); \ + brw_inst_set_bits(inst, g4_high, g4_low, value); \ + } \ +} \ +static inline unsigned \ +brw_inst_##reg##_ia1_addr_imm(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + if (devinfo->ver >= 20) { \ + return brw_inst_bits(inst, g20_high, g20_low) << 1 | \ + (g20_zero == -1 ? 
0 : \ + brw_inst_bits(inst, g20_zero, g20_zero)); \ + } else if (devinfo->ver >= 12) { \ + return brw_inst_bits(inst, g12_high, g12_low); \ + } else if (devinfo->ver >= 8) { \ + return brw_inst_bits(inst, g8_high, g8_low) | \ + (brw_inst_bits(inst, g8_nine, g8_nine) << 9); \ + } else { \ + return brw_inst_bits(inst, g4_high, g4_low); \ + } \ +} + +/* AddrImm for Align1 Indirect Addressing */ +/* -Gen 4- ----Gfx8---- -Gfx12- ---Gfx20--- */ +BRW_IA1_ADDR_IMM(src1, 105, 96, 121, 104, 96, 107, 98, 107, 98, -1) +BRW_IA1_ADDR_IMM(src0, 73, 64, 95, 72, 64, 75, 66, 75, 66, 87) +BRW_IA1_ADDR_IMM(dst, 57, 48, 47, 56, 48, 59, 50, 59, 50, 33) + +#define BRW_IA16_ADDR_IMM(reg, g4_high, g4_low, g8_nine, g8_high, g8_low) \ +static inline void \ +brw_inst_set_##reg##_ia16_addr_imm(const struct \ + intel_device_info *devinfo, \ + brw_inst *inst, unsigned value) \ +{ \ + assert(devinfo->ver < 12); \ + assert((value & ~0x3ff) == 0); \ + if (devinfo->ver >= 8) { \ + assert(GET_BITS(value, 3, 0) == 0); \ + brw_inst_set_bits(inst, g8_high, g8_low, GET_BITS(value, 8, 4)); \ + brw_inst_set_bits(inst, g8_nine, g8_nine, GET_BITS(value, 9, 9)); \ + } else { \ + brw_inst_set_bits(inst, g4_high, g4_low, value); \ + } \ +} \ +static inline unsigned \ +brw_inst_##reg##_ia16_addr_imm(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + assert(devinfo->ver < 12); \ + if (devinfo->ver >= 8) { \ + return (brw_inst_bits(inst, g8_high, g8_low) << 4) | \ + (brw_inst_bits(inst, g8_nine, g8_nine) << 9); \ + } else { \ + return brw_inst_bits(inst, g4_high, g4_low); \ + } \ +} + +/* AddrImm[9:0] for Align16 Indirect Addressing: + * Compared to Align1, these are missing the low 4 bits. 
+ * -Gen 4- ----Gfx8---- + */ +BRW_IA16_ADDR_IMM(src1, 105, 96, 121, 104, 100) +BRW_IA16_ADDR_IMM(src0, 73, 64, 95, 72, 68) +BRW_IA16_ADDR_IMM(dst, 57, 52, 47, 56, 52) +BRW_IA16_ADDR_IMM(send_src0, -1, -1, 78, 72, 68) +BRW_IA16_ADDR_IMM(send_dst, -1, -1, 62, 56, 52) + +/** + * Fetch a set of contiguous bits from the instruction. + * + * Bits indices range from 0..127; fields may not cross 64-bit boundaries. + */ +static inline uint64_t +brw_inst_bits(const brw_inst *inst, unsigned high, unsigned low) +{ + assume(high < 128); + assume(high >= low); + /* We assume the field doesn't cross 64-bit boundaries. */ + const unsigned word = high / 64; + assert(word == low / 64); + + high %= 64; + low %= 64; + + const uint64_t mask = (~0ull >> (64 - (high - low + 1))); + + return (inst->data[word] >> low) & mask; +} + +/** + * Set bits in the instruction, with proper shifting and masking. + * + * Bits indices range from 0..127; fields may not cross 64-bit boundaries. + */ +static inline void +brw_inst_set_bits(brw_inst *inst, unsigned high, unsigned low, uint64_t value) +{ + assume(high < 128); + assume(high >= low); + const unsigned word = high / 64; + assert(word == low / 64); + + high %= 64; + low %= 64; + + const uint64_t mask = (~0ull >> (64 - (high - low + 1))) << low; + + /* Make sure the supplied value actually fits in the given bitfield. */ + assert((value & (mask >> low)) == value); + + inst->data[word] = (inst->data[word] & ~mask) | (value << low); +} + +#undef BRW_IA16_ADDR_IMM +#undef BRW_IA1_ADDR_IMM +#undef MD +#undef F8 +#undef FF +#undef BOUNDS +#undef F +#undef FC +#undef F20 +#undef FD20 + +typedef struct { + uint64_t data; +} brw_compact_inst; + +/** + * Fetch a set of contiguous bits from the compacted instruction. + * + * Bits indices range from 0..63. 
+ */ +static inline unsigned +brw_compact_inst_bits(const brw_compact_inst *inst, unsigned high, unsigned low) +{ + assume(high < 64); + assume(high >= low); + const uint64_t mask = (1ull << (high - low + 1)) - 1; + + return (inst->data >> low) & mask; +} + +/** + * Set bits in the compacted instruction. + * + * Bits indices range from 0..63. + */ +static inline void +brw_compact_inst_set_bits(brw_compact_inst *inst, unsigned high, unsigned low, + uint64_t value) +{ + assume(high < 64); + assume(high >= low); + const uint64_t mask = ((1ull << (high - low + 1)) - 1) << low; + + /* Make sure the supplied value actually fits in the given bitfield. */ + assert((value & (mask >> low)) == value); + + inst->data = (inst->data & ~mask) | (value << low); +} + +#define FC(name, high, low, gfx12_high, gfx12_low, assertions) \ +static inline void \ +brw_compact_inst_set_##name(const struct \ + intel_device_info *devinfo, \ + brw_compact_inst *inst, unsigned v) \ +{ \ + assert(assertions); \ + if (devinfo->ver >= 12) \ + brw_compact_inst_set_bits(inst, gfx12_high, gfx12_low, v); \ + else \ + brw_compact_inst_set_bits(inst, high, low, v); \ +} \ +static inline unsigned \ +brw_compact_inst_##name(const struct intel_device_info *devinfo, \ + const brw_compact_inst *inst) \ +{ \ + assert(assertions); \ + if (devinfo->ver >= 12) \ + return brw_compact_inst_bits(inst, gfx12_high, gfx12_low); \ + else \ + return brw_compact_inst_bits(inst, high, low); \ +} + +/* A simple macro for fields which stay in the same place on all generations + * except for Gfx12. + */ +#define F(name, high, low, gfx12_high, gfx12_low) \ + FC(name, high, low, gfx12_high, gfx12_low, true) + +/* A macro for fields which moved to several different locations + * across generations. 
+ */ +#define F20(name, high, low, hi8, lo8, hi12, lo12, hi20, lo20) \ +static inline void \ +brw_compact_inst_set_##name(const struct \ + intel_device_info *devinfo, \ + brw_compact_inst *inst, unsigned v) \ +{ \ + if (devinfo->ver >= 20) \ + brw_compact_inst_set_bits(inst, hi20, lo20, v); \ + else if (devinfo->ver >= 12) \ + brw_compact_inst_set_bits(inst, hi12, lo12, v); \ + else if (devinfo->ver >= 8) \ + brw_compact_inst_set_bits(inst, hi8, lo8, v); \ + else \ + brw_compact_inst_set_bits(inst, high, low, v); \ +} \ +static inline unsigned \ +brw_compact_inst_##name(const struct intel_device_info *devinfo, \ + const brw_compact_inst *inst) \ +{ \ + if (devinfo->ver >= 20) \ + return brw_compact_inst_bits(inst, hi20, lo20); \ + else if (devinfo->ver >= 12) \ + return brw_compact_inst_bits(inst, hi12, lo12); \ + else if (devinfo->ver >= 8) \ + return brw_compact_inst_bits(inst, hi8, lo8); \ + else \ + return brw_compact_inst_bits(inst, high, low); \ +} + +/* A macro for fields which gained extra discontiguous bits in Gfx20 + * (specified by hi20ex-lo20ex). 
+ */ +#define FD20(name, high, low, hi8, lo8, hi12, lo12, \ + hi20, lo20, hi20ex, lo20ex) \ + static inline void \ +brw_compact_inst_set_##name(const struct \ + intel_device_info *devinfo, \ + brw_compact_inst *inst, unsigned v) \ +{ \ + if (devinfo->ver >= 20) { \ + const unsigned k = hi20 - lo20 + 1; \ + brw_compact_inst_set_bits(inst, hi20ex, lo20ex, v >> k); \ + brw_compact_inst_set_bits(inst, hi20, lo20, v & ((1u << k) - 1)); \ + } else if (devinfo->ver >= 12) { \ + brw_compact_inst_set_bits(inst, hi12, lo12, v); \ + } else if (devinfo->ver >= 8) { \ + brw_compact_inst_set_bits(inst, hi8, lo8, v); \ + } else { \ + brw_compact_inst_set_bits(inst, high, low, v); \ + } \ +} \ +static inline unsigned \ +brw_compact_inst_##name(const struct intel_device_info *devinfo, \ + const brw_compact_inst *inst) \ +{ \ + if (devinfo->ver >= 20) { \ + const unsigned k = hi20 - lo20 + 1; \ + return (brw_compact_inst_bits(inst, hi20ex, lo20ex) << k | \ + brw_compact_inst_bits(inst, hi20, lo20)); \ + } else if (devinfo->ver >= 12) { \ + return brw_compact_inst_bits(inst, hi12, lo12); \ + } else if (devinfo->ver >= 8) { \ + return brw_compact_inst_bits(inst, hi8, lo8); \ + } else { \ + return brw_compact_inst_bits(inst, high, low); \ + } \ +} + +F(src1_reg_nr, /* 4+ */ 63, 56, /* 12+ */ 63, 56) +F(src0_reg_nr, /* 4+ */ 55, 48, /* 12+ */ 47, 40) +F20(dst_reg_nr, /* 4+ */ 47, 40, /* 8+ */ 47, 40, /* 12+ */ 23, 16, /* 20+ */ 39, 32) +F(src1_index, /* 4+ */ 39, 35, /* 12+ */ 55, 52) +F20(src0_index, /* 4+ */ 34, 30, /* 8+ */ 34, 30, /* 12+ */ 51, 48, /* 20+ */ 25, 23) +F(cmpt_control, /* 4+ */ 29, 29, /* 12+ */ 29, 29) /* Same location as brw_inst */ +FC(flag_subreg_nr, /* 4+ */ 28, 28, /* 12+ */ -1, -1, devinfo->ver <= 6) +F(cond_modifier, /* 4+ */ 27, 24, /* 12+ */ -1, -1) /* Same location as brw_inst */ +FC(acc_wr_control, /* 4+ */ 23, 23, /* 12+ */ -1, -1, devinfo->ver >= 6) +FC(mask_control_ex, /* 4+ */ 23, 23, /* 12+ */ -1, -1, devinfo->verx10 == 45 || devinfo->ver == 5) 
+F20(subreg_index, /* 4+ */ 22, 18, /* 8+ */ 22, 18, /* 12+ */ 39, 35, /* 20+ */ 51, 48) +FD20(datatype_index, /* 4+ */ 17, 13, /* 8+ */ 17, 13, /* 12+ */ 34, 30, /* 20+ */ 28, 26, 31, 30) +F20(control_index, /* 4+ */ 12, 8, /* 8+ */ 12, 8, /* 12+ */ 28, 24, /* 20+ */ 22, 18) +F20(swsb, /* 4+ */ -1, -1, /* 8+ */ -1, -1, /* 12+ */ 15, 8, /* 20+ */ 17, 8) +F(debug_control, /* 4+ */ 7, 7, /* 12+ */ 7, 7) +F(hw_opcode, /* 4+ */ 6, 0, /* 12+ */ 6, 0) /* Same location as brw_inst */ + +static inline unsigned +brw_compact_inst_imm(const struct intel_device_info *devinfo, + const brw_compact_inst *inst) +{ + if (devinfo->ver >= 12) { + return brw_compact_inst_bits(inst, 63, 52); + } else { + return (brw_compact_inst_bits(inst, 39, 35) << 8) | + (brw_compact_inst_bits(inst, 63, 56)); + } +} + +/** + * (Gfx8+) Compacted three-source instructions: + * @{ + */ +FC(3src_src2_reg_nr, /* 4+ */ 63, 57, /* 12+ */ 55, 48, devinfo->ver >= 8) +FC(3src_src1_reg_nr, /* 4+ */ 56, 50, /* 12+ */ 63, 56, devinfo->ver >= 8) +FC(3src_src0_reg_nr, /* 4+ */ 49, 43, /* 12+ */ 47, 40, devinfo->ver >= 8) +FC(3src_src2_subreg_nr, /* 4+ */ 42, 40, /* 12+ */ -1, -1, devinfo->ver >= 8) +FC(3src_src1_subreg_nr, /* 4+ */ 39, 37, /* 12+ */ -1, -1, devinfo->ver >= 8) +FC(3src_src0_subreg_nr, /* 4+ */ 36, 34, /* 12+ */ -1, -1, devinfo->ver >= 8) +FC(3src_src2_rep_ctrl, /* 4+ */ 33, 33, /* 12+ */ -1, -1, devinfo->ver >= 8) +FC(3src_src1_rep_ctrl, /* 4+ */ 32, 32, /* 12+ */ -1, -1, devinfo->ver >= 8) +FC(3src_saturate, /* 4+ */ 31, 31, /* 12+ */ -1, -1, devinfo->ver >= 8) +FC(3src_debug_control, /* 4+ */ 30, 30, /* 12+ */ 7, 7, devinfo->ver >= 8) +FC(3src_cmpt_control, /* 4+ */ 29, 29, /* 12+ */ 29, 29, devinfo->ver >= 8) +FC(3src_src0_rep_ctrl, /* 4+ */ 28, 28, /* 12+ */ -1, -1, devinfo->ver >= 8) +/* Reserved */ +F20(3src_dst_reg_nr, /* 4+ */ 18, 12, /* 8+ */ 18, 12, /* 12+ */ 23, 16, /* 20+ */ 39, 32) +F20(3src_source_index, /* 4+ */ -1, -1, /* 8+ */ 11, 10, /* 12+ */ 34, 30, /* 20+ */ 25, 22) 
+FD20(3src_subreg_index, /* 4+ */ -1, -1, /* 8+ */ -1, -1, /* 12+ */ 39, 35, /* 20+ */ 28, 26, 31, 30) +F20(3src_control_index, /* 4+ */ -1, -1, /* 8+ */ 9, 8, /* 12+ */ 28, 24, /* 20+ */ 21, 18) +F20(3src_swsb, /* 4+ */ -1, -1, /* 8+ */ -1, -1, /* 12+ */ 15, 8, /* 20+ */ 17, 8) +/* Bit 7 is Reserved (for future Opcode expansion) */ +FC(3src_hw_opcode, /* 4+ */ 6, 0, /* 12+ */ 6, 0, devinfo->ver >= 8) +/** @} */ + +#undef F + +static inline void +brw_inst_set_opcode(const struct brw_isa_info *isa, + struct brw_inst *inst, enum opcode opcode) +{ + brw_inst_set_hw_opcode(isa->devinfo, inst, brw_opcode_encode(isa, opcode)); +} + +static inline enum opcode +brw_inst_opcode(const struct brw_isa_info *isa, + const struct brw_inst *inst) +{ + return brw_opcode_decode(isa, brw_inst_hw_opcode(isa->devinfo, inst)); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/intel/compiler/elk/brw_interpolation_map.c b/src/intel/compiler/elk/brw_interpolation_map.c new file mode 100644 index 00000000000..bdda1ad5d48 --- /dev/null +++ b/src/intel/compiler/elk/brw_interpolation_map.c @@ -0,0 +1,108 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_compiler.h" +#include "compiler/nir/nir.h" + +static char const *get_qual_name(int mode) +{ + switch (mode) { + case INTERP_MODE_NONE: return "none"; + case INTERP_MODE_FLAT: return "flat"; + case INTERP_MODE_SMOOTH: return "smooth"; + case INTERP_MODE_NOPERSPECTIVE: return "nopersp"; + default: return "???"; + } +} + +static void +gfx4_frag_prog_set_interp_modes(struct brw_wm_prog_data *prog_data, + const struct intel_vue_map *vue_map, + unsigned location, unsigned slot_count, + enum glsl_interp_mode interp) +{ + for (unsigned k = 0; k < slot_count; k++) { + unsigned slot = vue_map->varying_to_slot[location + k]; + if (slot != -1 && prog_data->interp_mode[slot] == INTERP_MODE_NONE) { + prog_data->interp_mode[slot] = interp; + + if (prog_data->interp_mode[slot] == INTERP_MODE_FLAT) { + prog_data->contains_flat_varying = true; + } else if (prog_data->interp_mode[slot] == INTERP_MODE_NOPERSPECTIVE) { + prog_data->contains_noperspective_varying = true; + } + } + } +} + +/* Set up interpolation modes for every element in the VUE */ +void +brw_setup_vue_interpolation(const struct intel_vue_map *vue_map, nir_shader *nir, + struct brw_wm_prog_data *prog_data) +{ + /* Initialise interp_mode. INTERP_MODE_NONE == 0 */ + memset(prog_data->interp_mode, 0, sizeof(prog_data->interp_mode)); + + if (!vue_map) + return; + + /* HPOS always wants noperspective. setting it up here allows + * us to not need special handling in the SF program. 
+ */ + unsigned pos_slot = vue_map->varying_to_slot[VARYING_SLOT_POS]; + if (pos_slot != -1) { + prog_data->interp_mode[pos_slot] = INTERP_MODE_NOPERSPECTIVE; + prog_data->contains_noperspective_varying = true; + } + + nir_foreach_shader_in_variable(var, nir) { + unsigned location = var->data.location; + unsigned slot_count = glsl_count_attribute_slots(var->type, false); + + gfx4_frag_prog_set_interp_modes(prog_data, vue_map, location, slot_count, + var->data.interpolation); + + if (location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1) { + location = location + VARYING_SLOT_BFC0 - VARYING_SLOT_COL0; + gfx4_frag_prog_set_interp_modes(prog_data, vue_map, location, + slot_count, var->data.interpolation); + } + } + + const bool debug = false; + if (debug) { + fprintf(stderr, "VUE map:\n"); + for (int i = 0; i < vue_map->num_slots; i++) { + int varying = vue_map->slot_to_varying[i]; + if (varying == -1) { + fprintf(stderr, "%d: --\n", i); + continue; + } + + fprintf(stderr, "%d: %d %s ofs %d\n", + i, varying, + get_qual_name(prog_data->interp_mode[i]), + brw_vue_slot_to_offset(i)); + } + } +} diff --git a/src/intel/compiler/elk/brw_ir.h b/src/intel/compiler/elk/brw_ir.h new file mode 100644 index 00000000000..3b4b19c244a --- /dev/null +++ b/src/intel/compiler/elk/brw_ir.h @@ -0,0 +1,216 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2010-2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions 
of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_IR_H +#define BRW_IR_H + +#include <assert.h> +#include "brw_reg.h" +#include "compiler/glsl/list.h" + +#define MAX_SAMPLER_MESSAGE_SIZE 11 + +/* The sampler can return a vec5 when sampling with sparse residency. In + * SIMD32, each component takes up 4 GRFs, so we need to allow up to size-20 + * VGRFs to hold the result. + */ +#define MAX_VGRF_SIZE(devinfo) ((devinfo)->ver >= 20 ? 40 : 20) + +#ifdef __cplusplus +struct backend_reg : private brw_reg +{ + backend_reg() {} + backend_reg(const struct brw_reg &reg) : brw_reg(reg), offset(0) {} + + const brw_reg &as_brw_reg() const + { + assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM); + assert(offset == 0); + return static_cast<const brw_reg &>(*this); + } + + brw_reg &as_brw_reg() + { + assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM); + assert(offset == 0); + return static_cast<brw_reg &>(*this); + } + + bool equals(const backend_reg &r) const; + bool negative_equals(const backend_reg &r) const; + + bool is_zero() const; + bool is_one() const; + bool is_negative_one() const; + bool is_null() const; + bool is_accumulator() const; + + /** Offset from the start of the (virtual) register in bytes. 
*/ + uint16_t offset; + + using brw_reg::type; + using brw_reg::file; + using brw_reg::negate; + using brw_reg::abs; + using brw_reg::address_mode; + using brw_reg::subnr; + using brw_reg::nr; + + using brw_reg::swizzle; + using brw_reg::writemask; + using brw_reg::indirect_offset; + using brw_reg::vstride; + using brw_reg::width; + using brw_reg::hstride; + + using brw_reg::df; + using brw_reg::f; + using brw_reg::d; + using brw_reg::ud; + using brw_reg::d64; + using brw_reg::u64; +}; + +struct bblock_t; + +struct backend_instruction : public exec_node { + bool is_3src(const struct brw_compiler *compiler) const; + bool is_math() const; + bool is_control_flow_begin() const; + bool is_control_flow_end() const; + bool is_control_flow() const; + bool is_commutative() const; + bool can_do_source_mods() const; + bool can_do_saturate() const; + bool can_do_cmod() const; + bool reads_accumulator_implicitly() const; + bool writes_accumulator_implicitly(const struct intel_device_info *devinfo) const; + + /** + * Instructions that use indirect addressing have additional register + * regioning restrictions. + */ + bool uses_indirect_addressing() const; + + void remove(bblock_t *block, bool defer_later_block_ip_updates = false); + void insert_after(bblock_t *block, backend_instruction *inst); + void insert_before(bblock_t *block, backend_instruction *inst); + + /** + * True if the instruction has side effects other than writing to + * its destination registers. You are expected not to reorder or + * optimize these out unless you know what you are doing. + */ + bool has_side_effects() const; + + /** + * True if the instruction might be affected by side effects of other + * instructions. + */ + bool is_volatile() const; +#else +struct backend_instruction { + struct exec_node link; +#endif + /** @{ + * Annotation for the generated IR. One of the two can be set. + */ + const void *ir; + const char *annotation; + /** @} */ + + /** + * Execution size of the instruction. 
This is used by the generator to + * generate the correct binary for the given instruction. Current valid + * values are 1, 4, 8, 16, 32. + */ + uint8_t exec_size; + + /** + * Channel group from the hardware execution and predication mask that + * should be applied to the instruction. The subset of channel enable + * signals (calculated from the EU control flow and predication state) + * given by [group, group + exec_size) will be used to mask GRF writes and + * any other side effects of the instruction. + */ + uint8_t group; + + uint32_t offset; /**< spill/unspill offset or texture offset bitfield */ + uint8_t mlen; /**< SEND message length */ + uint8_t ex_mlen; /**< SENDS extended message length */ + int8_t base_mrf; /**< First MRF in the SEND message, if mlen is nonzero. */ + uint8_t target; /**< MRT target. */ + uint8_t sfid; /**< SFID for SEND instructions */ + uint32_t desc; /**< SEND[S] message descriptor immediate */ + uint32_t ex_desc; /**< SEND[S] extended message descriptor immediate */ + unsigned size_written; /**< Data written to the destination register in bytes. 
*/ + + enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */ + enum brw_conditional_mod conditional_mod; /**< BRW_CONDITIONAL_* */ + enum brw_predicate predicate; + bool predicate_inverse:1; + bool writes_accumulator:1; /**< instruction implicitly writes accumulator */ + bool force_writemask_all:1; + bool no_dd_clear:1; + bool no_dd_check:1; + bool saturate:1; + bool shadow_compare:1; + bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */ + bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */ + bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */ + bool send_ex_desc_scratch:1; /**< Only valid for SHADER_OPCODE_SEND, use + * the scratch surface offset to build + * extended descriptor + */ + bool send_ex_bso:1; /**< Only for SHADER_OPCODE_SEND, use extended bindless + * surface offset (26bits instead of 20bits) + */ + bool predicate_trivial:1; /**< The predication mask applied to this + * instruction is guaranteed to be uniform and + * a superset of the execution mask of the + * present block, no currently enabled channels + * will be disabled by the predicate. + */ + bool eot:1; + + /* Chooses which flag subregister (f0.0 to f3.1) is used for conditional + * mod and predication. + */ + unsigned flag_subreg:3; + + /** + * Systolic depth used by DPAS instruction. + */ + unsigned sdepth:4; + + /** + * Repeat count used by DPAS instruction. + */ + unsigned rcount:4; + + /** The number of hardware registers used for a message header. 
*/ + uint8_t header_size; +}; + +#endif diff --git a/src/intel/compiler/elk/brw_ir_allocator.h b/src/intel/compiler/elk/brw_ir_allocator.h new file mode 100644 index 00000000000..4722ae4a4a5 --- /dev/null +++ b/src/intel/compiler/elk/brw_ir_allocator.h @@ -0,0 +1,92 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2010-2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_IR_ALLOCATOR_H +#define BRW_IR_ALLOCATOR_H + +#include "util/compiler.h" +#include "util/glheader.h" +#include "util/macros.h" +#include "util/rounding.h" +#include "util/u_math.h" + +namespace brw { + /** + * Simple allocator used to keep track of virtual GRFs. 
+ */ + class simple_allocator { + public: + simple_allocator() : + sizes(NULL), offsets(NULL), count(0), total_size(0), capacity(0) + { + } + + ~simple_allocator() + { + free(offsets); + free(sizes); + } + + unsigned + allocate(unsigned size) + { + assert(size > 0); + if (capacity <= count) { + capacity = MAX2(16, capacity * 2); + sizes = (unsigned *)realloc(sizes, capacity * sizeof(unsigned)); + offsets = (unsigned *)realloc(offsets, capacity * sizeof(unsigned)); + } + + sizes[count] = size; + offsets[count] = total_size; + total_size += size; + + return count++; + } + + /** + * Array of sizes for each allocation. The allocation unit is up to the + * back-end, but it's expected to be one scalar value in the FS back-end + * and one vec4 in the VEC4 back-end. + */ + unsigned *sizes; + + /** + * Array of offsets from the start of the VGRF space in allocation + * units. + */ + unsigned *offsets; + + /** Total number of VGRFs allocated. */ + unsigned count; + + /** Cumulative size in allocation units. 
*/ + unsigned total_size; + + private: + unsigned capacity; + }; +} + +#endif diff --git a/src/intel/compiler/elk/brw_ir_analysis.h b/src/intel/compiler/elk/brw_ir_analysis.h new file mode 100644 index 00000000000..33b8f5178a6 --- /dev/null +++ b/src/intel/compiler/elk/brw_ir_analysis.h @@ -0,0 +1,192 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_IR_ANALYSIS_H +#define BRW_IR_ANALYSIS_H + +namespace brw { + /** + * Bitset of state categories that can influence the result of IR analysis + * passes. + */ + enum analysis_dependency_class { + /** + * The analysis doesn't depend on the IR, its result is effectively a + * constant during the compilation. + */ + DEPENDENCY_NOTHING = 0, + /** + * The analysis depends on the set of instructions in the program and + * their naming. 
Note that because instructions are named sequentially + * by IP this implies a dependency on the control flow edges between + * instructions. This will be signaled whenever instructions are + * inserted, removed or reordered in the program. + */ + DEPENDENCY_INSTRUCTION_IDENTITY = 0x1, + /** + * The analysis is sensitive to the detailed semantics of instructions + * in the program, where "detailed" means any change in the instruction + * data structures other than the linked-list pointers (which are + * already covered by DEPENDENCY_INSTRUCTION_IDENTITY). E.g. changing + * the negate or abs flags of an instruction source would signal this + * flag alone because it would preserve all other instruction dependency + * classes. + */ + DEPENDENCY_INSTRUCTION_DETAIL = 0x2, + /** + * The analysis depends on the set of data flow edges between + * instructions. This will be signaled whenever the dataflow relation + * between instructions has potentially changed, e.g. when the VGRF + * index of an instruction source or destination changes (in which case + * it will appear in combination with DEPENDENCY_INSTRUCTION_DETAIL), or + * when data-dependent instructions are reordered (in which case it will + * appear in combination with DEPENDENCY_INSTRUCTION_IDENTITY). + */ + DEPENDENCY_INSTRUCTION_DATA_FLOW = 0x4, + /** + * The analysis depends on all instruction dependency classes. These + * will typically be signaled simultaneously when inserting or removing + * instructions in the program (or if you're feeling too lazy to read + * through your optimization pass to figure out which of the instruction + * dependency classes above it invalidates). + */ + DEPENDENCY_INSTRUCTIONS = 0x7, + /** + * The analysis depends on the set of VGRFs in the program and their + * naming. This will be signaled when VGRFs are allocated or released. + */ + DEPENDENCY_VARIABLES = 0x8, + /** + * The analysis depends on the set of basic blocks in the program, their + * control flow edges and naming. 
+ */ + DEPENDENCY_BLOCKS = 0x10, + /** + * The analysis depends on the program being literally the same (good + * luck...), any change in the input invalidates previous analysis + * computations. + */ + DEPENDENCY_EVERYTHING = ~0 + }; + + inline analysis_dependency_class + operator|(analysis_dependency_class x, analysis_dependency_class y) + { + return static_cast<analysis_dependency_class>( + static_cast<unsigned>(x) | static_cast<unsigned>(y)); + } +} + +/** + * Instantiate a program analysis class \p L which can calculate an object of + * type \p T as result. \p C is a closure that encapsulates whatever + * information is required as argument to run the analysis pass. The purpose + * of this class is to make sure that: + * + * - The analysis pass is executed lazily whenever it's needed and multiple + * executions are optimized out as long as the cached result remains marked + * up-to-date. + * + * - There is no way to access the cached analysis result without first + * calling L::require(), which makes sure that the analysis pass is rerun + * if necessary. + * + * - The cached result doesn't become inconsistent with the program for as + * long as it remains marked up-to-date. (This is only enforced in debug + * builds for performance reasons) + * + * The requirements on \p T are the following: + * + * - Constructible with a single argument, as in 'x = T(c)' for \p c of type + * \p C. + * + * - 'x.dependency_class()' on const \p x returns a bitset of + * brw::analysis_dependency_class specifying the set of IR objects that are + * required to remain invariant for the cached analysis result to be + * considered valid. + * + * - 'x.validate(c)' on const \p x returns a boolean result specifying + * whether the analysis result \p x is consistent with the input IR. This + * is currently only used for validation in debug builds. + */ +template<class T, class C> +class brw_analysis { +public: + /** + * Construct a program analysis. 
\p c is an arbitrary object + * passed as argument to the constructor of the analysis result + * object of type \p T. + */ + brw_analysis(const C *c) : c(c), p(NULL) {} + + /** + * Destroy a program analysis. + */ + ~brw_analysis() + { + delete p; + } + + /** + * Obtain the result of a program analysis. This gives a + * guaranteed up-to-date result, the analysis pass will be + * rerun implicitly if it has become stale. + */ + T & + require() + { + if (p) + assert(p->validate(c)); + else + p = new T(c); + + return *p; + } + + const T & + require() const + { + return const_cast<brw_analysis<T, C> *>(this)->require(); + } + + /** + * Report that dependencies of the analysis pass may have changed + * since the last calculation and the cached analysis result may + * have to be discarded. + */ + void + invalidate(brw::analysis_dependency_class c) + { + if (p && (c & p->dependency_class())) { + delete p; + p = NULL; + } + } + +private: + const C *c; + T *p; +}; + +#endif diff --git a/src/intel/compiler/elk/brw_ir_fs.h b/src/intel/compiler/elk/brw_ir_fs.h new file mode 100644 index 00000000000..169449bbab8 --- /dev/null +++ b/src/intel/compiler/elk/brw_ir_fs.h @@ -0,0 +1,737 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2010-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_IR_FS_H +#define BRW_IR_FS_H + +#include "brw_shader.h" + +class fs_inst; + +class fs_reg : public backend_reg { +public: + DECLARE_RALLOC_CXX_OPERATORS(fs_reg) + + void init(); + + fs_reg(); + fs_reg(struct ::brw_reg reg); + fs_reg(enum brw_reg_file file, unsigned nr); + fs_reg(enum brw_reg_file file, unsigned nr, enum brw_reg_type type); + + bool equals(const fs_reg &r) const; + bool negative_equals(const fs_reg &r) const; + bool is_contiguous() const; + + /** + * Return the size in bytes of a single logical component of the + * register assuming the given execution width. 
+ */ + unsigned component_size(unsigned width) const; + + /** Register region horizontal stride */ + uint8_t stride; +}; + +static inline fs_reg +negate(fs_reg reg) +{ + assert(reg.file != IMM); + reg.negate = !reg.negate; + return reg; +} + +static inline fs_reg +retype(fs_reg reg, enum brw_reg_type type) +{ + reg.type = type; + return reg; +} + +static inline fs_reg +byte_offset(fs_reg reg, unsigned delta) +{ + switch (reg.file) { + case BAD_FILE: + break; + case VGRF: + case ATTR: + case UNIFORM: + reg.offset += delta; + break; + case MRF: { + const unsigned suboffset = reg.offset + delta; + reg.nr += suboffset / REG_SIZE; + reg.offset = suboffset % REG_SIZE; + break; + } + case ARF: + case FIXED_GRF: { + const unsigned suboffset = reg.subnr + delta; + reg.nr += suboffset / REG_SIZE; + reg.subnr = suboffset % REG_SIZE; + break; + } + case IMM: + default: + assert(delta == 0); + } + return reg; +} + +static inline fs_reg +horiz_offset(const fs_reg ®, unsigned delta) +{ + switch (reg.file) { + case BAD_FILE: + case UNIFORM: + case IMM: + /* These only have a single component that is implicitly splatted. A + * horizontal offset should be a harmless no-op. + * XXX - Handle vector immediates correctly. + */ + return reg; + case VGRF: + case MRF: + case ATTR: + return byte_offset(reg, delta * reg.stride * type_sz(reg.type)); + case ARF: + case FIXED_GRF: + if (reg.is_null()) { + return reg; + } else { + const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0; + const unsigned vstride = reg.vstride ? 
1 << (reg.vstride - 1) : 0; + const unsigned width = 1 << reg.width; + + if (delta % width == 0) { + return byte_offset(reg, delta / width * vstride * type_sz(reg.type)); + } else { + assert(vstride == hstride * width); + return byte_offset(reg, delta * hstride * type_sz(reg.type)); + } + } + } + unreachable("Invalid register file"); +} + +static inline fs_reg +offset(fs_reg reg, unsigned width, unsigned delta) +{ + switch (reg.file) { + case BAD_FILE: + break; + case ARF: + case FIXED_GRF: + case MRF: + case VGRF: + case ATTR: + case UNIFORM: + return byte_offset(reg, delta * reg.component_size(width)); + case IMM: + assert(delta == 0); + } + return reg; +} + +/** + * Get the scalar channel of \p reg given by \p idx and replicate it to all + * channels of the result. + */ +static inline fs_reg +component(fs_reg reg, unsigned idx) +{ + reg = horiz_offset(reg, idx); + reg.stride = 0; + if (reg.file == ARF || reg.file == FIXED_GRF) { + reg.vstride = BRW_VERTICAL_STRIDE_0; + reg.width = BRW_WIDTH_1; + reg.hstride = BRW_HORIZONTAL_STRIDE_0; + } + return reg; +} + +/** + * Return an integer identifying the discrete address space a register is + * contained in. A register is by definition fully contained in the single + * reg_space it belongs to, so two registers with different reg_space ids are + * guaranteed not to overlap. Most register files are a single reg_space of + * its own, only the VGRF and ATTR files are composed of multiple discrete + * address spaces, one for each allocation and input attribute respectively. + */ +static inline uint32_t +reg_space(const fs_reg &r) +{ + return r.file << 16 | (r.file == VGRF || r.file == ATTR ? r.nr : 0); +} + +/** + * Return the base offset in bytes of a register relative to the start of its + * reg_space(). + */ +static inline unsigned +reg_offset(const fs_reg &r) +{ + return (r.file == VGRF || r.file == IMM || r.file == ATTR ? 0 : r.nr) * + (r.file == UNIFORM ? 
4 : REG_SIZE) + r.offset + + (r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0); +} + +/** + * Return the amount of padding in bytes left unused between individual + * components of register \p r due to a (horizontal) stride value greater than + * one, or zero if components are tightly packed in the register file. + */ +static inline unsigned +reg_padding(const fs_reg &r) +{ + const unsigned stride = ((r.file != ARF && r.file != FIXED_GRF) ? r.stride : + r.hstride == 0 ? 0 : + 1 << (r.hstride - 1)); + return (MAX2(1, stride) - 1) * type_sz(r.type); +} + +/* Do not call this directly. Call regions_overlap() instead. */ +static inline bool +regions_overlap_MRF(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds) +{ + if (r.nr & BRW_MRF_COMPR4) { + fs_reg t = r; + t.nr &= ~BRW_MRF_COMPR4; + /* COMPR4 regions are translated by the hardware during decompression + * into two separate half-regions 4 MRFs apart from each other. + * + * Note: swapping s and t in this parameter list eliminates one possible + * level of recursion (since the s in the called versions of + * regions_overlap_MRF can't be COMPR4), and that makes the compiled + * code a lot smaller. + */ + return regions_overlap_MRF(s, ds, t, dr / 2) || + regions_overlap_MRF(s, ds, byte_offset(t, 4 * REG_SIZE), dr / 2); + } else if (s.nr & BRW_MRF_COMPR4) { + return regions_overlap_MRF(s, ds, r, dr); + } + + return !((r.nr * REG_SIZE + r.offset + dr) <= (s.nr * REG_SIZE + s.offset) || + (s.nr * REG_SIZE + s.offset + ds) <= (r.nr * REG_SIZE + r.offset)); +} + +/** + * Return whether the register region starting at \p r and spanning \p dr + * bytes could potentially overlap the register region starting at \p s and + * spanning \p ds bytes. 
+ */ +static inline bool +regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds) +{ + if (r.file != s.file) + return false; + + if (r.file == VGRF) { + return r.nr == s.nr && + !(r.offset + dr <= s.offset || s.offset + ds <= r.offset); + } else if (r.file != MRF) { + return !(reg_offset(r) + dr <= reg_offset(s) || + reg_offset(s) + ds <= reg_offset(r)); + } else { + return regions_overlap_MRF(r, dr, s, ds); + } +} + +/** + * Check that the register region given by r [r.offset, r.offset + dr[ + * is fully contained inside the register region given by s + * [s.offset, s.offset + ds[. + */ +static inline bool +region_contained_in(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds) +{ + return reg_space(r) == reg_space(s) && + reg_offset(r) >= reg_offset(s) && + reg_offset(r) + dr <= reg_offset(s) + ds; +} + +/** + * Return whether the given register region is n-periodic, i.e. whether the + * original region remains invariant after shifting it by \p n scalar + * channels. + */ +static inline bool +is_periodic(const fs_reg ®, unsigned n) +{ + if (reg.file == BAD_FILE || reg.is_null()) { + return true; + + } else if (reg.file == IMM) { + const unsigned period = (reg.type == BRW_REGISTER_TYPE_UV || + reg.type == BRW_REGISTER_TYPE_V ? 8 : + reg.type == BRW_REGISTER_TYPE_VF ? 4 : + 1); + return n % period == 0; + + } else if (reg.file == ARF || reg.file == FIXED_GRF) { + const unsigned period = (reg.hstride == 0 && reg.vstride == 0 ? 1 : + reg.vstride == 0 ? 1 << reg.width : + ~0); + return n % period == 0; + + } else { + return reg.stride == 0; + } +} + +static inline bool +is_uniform(const fs_reg ®) +{ + return is_periodic(reg, 1); +} + +/** + * Get the specified 8-component quarter of a register. 
+ */ +static inline fs_reg +quarter(const fs_reg ®, unsigned idx) +{ + assert(idx < 4); + return horiz_offset(reg, 8 * idx); +} + +/** + * Reinterpret each channel of register \p reg as a vector of values of the + * given smaller type and take the i-th subcomponent from each. + */ +static inline fs_reg +subscript(fs_reg reg, brw_reg_type type, unsigned i) +{ + assert((i + 1) * type_sz(type) <= type_sz(reg.type)); + + if (reg.file == ARF || reg.file == FIXED_GRF) { + /* The stride is encoded inconsistently for fixed GRF and ARF registers + * as the log2 of the actual vertical and horizontal strides. + */ + const int delta = util_logbase2(type_sz(reg.type)) - + util_logbase2(type_sz(type)); + reg.hstride += (reg.hstride ? delta : 0); + reg.vstride += (reg.vstride ? delta : 0); + + } else if (reg.file == IMM) { + unsigned bit_size = type_sz(type) * 8; + reg.u64 >>= i * bit_size; + reg.u64 &= BITFIELD64_MASK(bit_size); + if (bit_size <= 16) + reg.u64 |= reg.u64 << 16; + return retype(reg, type); + } else { + reg.stride *= type_sz(reg.type) / type_sz(type); + } + + return byte_offset(retype(reg, type), i * type_sz(type)); +} + +static inline fs_reg +horiz_stride(fs_reg reg, unsigned s) +{ + reg.stride *= s; + return reg; +} + +static const fs_reg reg_undef; + +class fs_inst : public backend_instruction { + fs_inst &operator=(const fs_inst &); + + void init(enum opcode opcode, uint8_t exec_width, const fs_reg &dst, + const fs_reg *src, unsigned sources); + +public: + DECLARE_RALLOC_CXX_OPERATORS(fs_inst) + + fs_inst(); + fs_inst(enum opcode opcode, uint8_t exec_size); + fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst); + fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0); + fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0, const fs_reg &src1); + fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0, const fs_reg &src1, const fs_reg &src2); + 
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg src[], unsigned sources); + fs_inst(const fs_inst &that); + ~fs_inst(); + + void resize_sources(uint8_t num_sources); + + bool is_send_from_grf() const; + bool is_payload(unsigned arg) const; + bool is_partial_write() const; + unsigned components_read(unsigned i) const; + unsigned size_read(int arg) const; + bool can_do_source_mods(const struct intel_device_info *devinfo) const; + bool can_do_cmod(); + bool can_change_types() const; + bool has_source_and_destination_hazard() const; + unsigned implied_mrf_writes() const; + + /** + * Return whether \p arg is a control source of a virtual instruction which + * shouldn't contribute to the execution type and usual regioning + * restriction calculations of arithmetic instructions. + */ + bool is_control_source(unsigned arg) const; + + /** + * Return the subset of flag registers read by the instruction as a bitset + * with byte granularity. + */ + unsigned flags_read(const intel_device_info *devinfo) const; + + /** + * Return the subset of flag registers updated by the instruction (either + * partially or fully) as a bitset with byte granularity. + */ + unsigned flags_written(const intel_device_info *devinfo) const; + + /** + * Return true if this instruction is a sampler message gathering residency + * data. + */ + bool has_sampler_residency() const; + + fs_reg dst; + fs_reg *src; + + uint8_t sources; /**< Number of fs_reg sources. */ + + bool last_rt:1; + bool pi_noperspective:1; /**< Pixel interpolator noperspective flag */ + bool keep_payload_trailing_zeros; + + tgl_swsb sched; /**< Scheduling info. */ +}; + +/** + * Make the execution of \p inst dependent on the evaluation of a possibly + * inverted predicate. 
+ */ +static inline fs_inst * +set_predicate_inv(enum brw_predicate pred, bool inverse, + fs_inst *inst) +{ + inst->predicate = pred; + inst->predicate_inverse = inverse; + return inst; +} + +/** + * Make the execution of \p inst dependent on the evaluation of a predicate. + */ +static inline fs_inst * +set_predicate(enum brw_predicate pred, fs_inst *inst) +{ + return set_predicate_inv(pred, false, inst); +} + +/** + * Write the result of evaluating the condition given by \p mod to a flag + * register. + */ +static inline fs_inst * +set_condmod(enum brw_conditional_mod mod, fs_inst *inst) +{ + inst->conditional_mod = mod; + return inst; +} + +/** + * Clamp the result of \p inst to the saturation range of its destination + * datatype. + */ +static inline fs_inst * +set_saturate(bool saturate, fs_inst *inst) +{ + inst->saturate = saturate; + return inst; +} + +/** + * Return the number of dataflow registers written by the instruction (either + * fully or partially) counted from 'floor(reg_offset(inst->dst) / + * register_size)'. The somewhat arbitrary register size unit is 4B for the + * UNIFORM and IMM files and 32B for all other files. + */ +inline unsigned +regs_written(const fs_inst *inst) +{ + assert(inst->dst.file != UNIFORM && inst->dst.file != IMM); + return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE + + inst->size_written - + MIN2(inst->size_written, reg_padding(inst->dst)), + REG_SIZE); +} + +/** + * Return the number of dataflow registers read by the instruction (either + * fully or partially) counted from 'floor(reg_offset(inst->src[i]) / + * register_size)'. The somewhat arbitrary register size unit is 4B for the + * UNIFORM files and 32B for all other files. + */ +inline unsigned +regs_read(const fs_inst *inst, unsigned i) +{ + if (inst->src[i].file == IMM) + return 1; + + const unsigned reg_size = inst->src[i].file == UNIFORM ? 
4 : REG_SIZE; + return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + + inst->size_read(i) - + MIN2(inst->size_read(i), reg_padding(inst->src[i])), + reg_size); +} + +static inline enum brw_reg_type +get_exec_type(const fs_inst *inst) +{ + brw_reg_type exec_type = BRW_REGISTER_TYPE_B; + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file != BAD_FILE && + !inst->is_control_source(i)) { + const brw_reg_type t = get_exec_type(inst->src[i].type); + if (type_sz(t) > type_sz(exec_type)) + exec_type = t; + else if (type_sz(t) == type_sz(exec_type) && + brw_reg_type_is_floating_point(t)) + exec_type = t; + } + } + + if (exec_type == BRW_REGISTER_TYPE_B) + exec_type = inst->dst.type; + + assert(exec_type != BRW_REGISTER_TYPE_B); + + /* Promotion of the execution type to 32-bit for conversions from or to + * half-float seems to be consistent with the following text from the + * Cherryview PRM Vol. 7, "Execution Data Type": + * + * "When single precision and half precision floats are mixed between + * source operands or between source and destination operand [..] single + * precision float is the execution datatype." + * + * and from "Register Region Restrictions": + * + * "Conversion between Integer and HF (Half Float) must be DWord aligned + * and strided by a DWord on the destination." + */ + if (type_sz(exec_type) == 2 && + inst->dst.type != exec_type) { + if (exec_type == BRW_REGISTER_TYPE_HF) + exec_type = BRW_REGISTER_TYPE_F; + else if (inst->dst.type == BRW_REGISTER_TYPE_HF) + exec_type = BRW_REGISTER_TYPE_D; + } + + return exec_type; +} + +static inline unsigned +get_exec_type_size(const fs_inst *inst) +{ + return type_sz(get_exec_type(inst)); +} + +static inline bool +is_send(const fs_inst *inst) +{ + return inst->mlen || inst->is_send_from_grf(); +} + +/** + * Return whether the instruction isn't an ALU instruction and cannot be + * assumed to complete in-order. 
+ */ +static inline bool +is_unordered(const intel_device_info *devinfo, const fs_inst *inst) +{ + return is_send(inst) || (devinfo->ver < 20 && inst->is_math()) || + inst->opcode == BRW_OPCODE_DPAS || + (devinfo->has_64bit_float_via_math_pipe && + (get_exec_type(inst) == BRW_REGISTER_TYPE_DF || + inst->dst.type == BRW_REGISTER_TYPE_DF)); +} + +/** + * Return whether the following regioning restriction applies to the specified + * instruction. From the Cherryview PRM Vol 7. "Register Region + * Restrictions": + * + * "When source or destination datatype is 64b or operation is integer DWord + * multiply, regioning in Align1 must follow these rules: + * + * 1. Source and Destination horizontal stride must be aligned to the same qword. + * 2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride. + * 3. Source and Destination offset must be the same, except the case of + * scalar source." + */ +static inline bool +has_dst_aligned_region_restriction(const intel_device_info *devinfo, + const fs_inst *inst, + brw_reg_type dst_type) +{ + const brw_reg_type exec_type = get_exec_type(inst); + /* Even though the hardware spec claims that "integer DWord multiply" + * operations are restricted, empirical evidence and the behavior of the + * simulator suggest that only 32x32-bit integer multiplication is + * restricted. 
+ */ + const bool is_dword_multiply = !brw_reg_type_is_floating_point(exec_type) && + ((inst->opcode == BRW_OPCODE_MUL && + MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) || + (inst->opcode == BRW_OPCODE_MAD && + MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4)); + + if (type_sz(dst_type) > 4 || type_sz(exec_type) > 4 || + (type_sz(exec_type) == 4 && is_dword_multiply)) + return devinfo->platform == INTEL_PLATFORM_CHV || + intel_device_info_is_9lp(devinfo) || + devinfo->verx10 >= 125; + + else if (brw_reg_type_is_floating_point(dst_type)) + return devinfo->verx10 >= 125; + + else + return false; +} + +static inline bool +has_dst_aligned_region_restriction(const intel_device_info *devinfo, + const fs_inst *inst) +{ + return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type); +} + +/** + * Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from + * the specified register file into a VGRF. + * + * This implies identity register regions without any source-destination + * overlap, but otherwise has no implications on the location of sources and + * destination in the register file: Gathering any number of portions from + * multiple virtual registers in any order is allowed. 
+ */ +inline bool +is_copy_payload(brw_reg_file file, const fs_inst *inst) +{ + if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD || + inst->is_partial_write() || inst->saturate || + inst->dst.file != VGRF) + return false; + + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file != file || + inst->src[i].abs || inst->src[i].negate) + return false; + + if (!inst->src[i].is_contiguous()) + return false; + + if (regions_overlap(inst->dst, inst->size_written, + inst->src[i], inst->size_read(i))) + return false; + } + + return true; +} + +/** + * Like is_copy_payload(), but the instruction is required to copy a single + * contiguous block of registers from the given register file into the + * destination without any reordering. + */ +inline bool +is_identity_payload(brw_reg_file file, const fs_inst *inst) { + if (is_copy_payload(file, inst)) { + fs_reg reg = inst->src[0]; + + for (unsigned i = 0; i < inst->sources; i++) { + reg.type = inst->src[i].type; + if (!inst->src[i].equals(reg)) + return false; + + reg = byte_offset(reg, inst->size_read(i)); + } + + return true; + } else { + return false; + } +} + +/** + * Like is_copy_payload(), but the instruction is required to source data from + * at least two disjoint VGRFs. + * + * This doesn't necessarily rule out the elimination of this instruction + * through register coalescing, but due to limitations of the register + * coalesce pass it might be impossible to do so directly until a later stage, + * when the LOAD_PAYLOAD instruction is unrolled into a sequence of MOV + * instructions. + */ +inline bool +is_multi_copy_payload(const fs_inst *inst) { + if (is_copy_payload(VGRF, inst)) { + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].nr != inst->src[0].nr) + return true; + } + } + + return false; +} + +/** + * Like is_identity_payload(), but the instruction is required to copy the + * whole contents of a single VGRF into the destination. 
+ * + * This means that there is a good chance that the instruction will be + * eliminated through register coalescing, but it's neither a necessary nor a + * sufficient condition for that to happen -- E.g. consider the case where + * source and destination registers diverge due to other instructions in the + * program overwriting part of their contents, which isn't something we can + * predict up front based on a cheap strictly local test of the copy + * instruction. + */ +inline bool +is_coalescing_payload(const brw::simple_allocator &alloc, const fs_inst *inst) +{ + return is_identity_payload(VGRF, inst) && + inst->src[0].offset == 0 && + alloc.sizes[inst->src[0].nr] * REG_SIZE == inst->size_written; +} + +bool +has_bank_conflict(const struct brw_isa_info *isa, const fs_inst *inst); + +#endif diff --git a/src/intel/compiler/elk/brw_ir_performance.cpp b/src/intel/compiler/elk/brw_ir_performance.cpp new file mode 100644 index 00000000000..d50e63bfdb1 --- /dev/null +++ b/src/intel/compiler/elk/brw_ir_performance.cpp @@ -0,0 +1,1698 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
 IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_vec4.h"
#include "brw_cfg.h"

using namespace brw;

namespace {
   /**
    * Enumeration representing the various asynchronous units that can run
    * computations in parallel on behalf of a shader thread.
    */
   enum intel_eu_unit {
      /** EU front-end. */
      EU_UNIT_FE,
      /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
      EU_UNIT_FPU,
      /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
      EU_UNIT_EM,
      /** Sampler shared function. */
      EU_UNIT_SAMPLER,
      /** Pixel Interpolator shared function. */
      EU_UNIT_PI,
      /** Unified Return Buffer shared function. */
      EU_UNIT_URB,
      /** Data Port Data Cache shared function. */
      EU_UNIT_DP_DC,
      /** Data Port Render Cache shared function. */
      EU_UNIT_DP_RC,
      /** Data Port Constant Cache shared function. */
      EU_UNIT_DP_CC,
      /** Message Gateway shared function. */
      EU_UNIT_GATEWAY,
      /** Thread Spawner shared function. */
      EU_UNIT_SPAWNER,
      /* EU_UNIT_VME, */
      /* EU_UNIT_CRE, */
      /** Number of asynchronous units currently tracked. */
      EU_NUM_UNITS,
      /** Dummy unit for instructions that don't consume runtime from the above. */
      EU_UNIT_NULL = EU_NUM_UNITS
   };

   /**
    * Enumeration representing a computation result another computation can
    * potentially depend on.
    *
    * The IDs form a flat index space: each register file gets a contiguous
    * range of slots, one per register it can hold.
    */
   enum intel_eu_dependency_id {
      /* Register part of the GRF. */
      EU_DEPENDENCY_ID_GRF0 = 0,
      /* Register part of the MRF.  Only used on Gfx4-6. */
      EU_DEPENDENCY_ID_MRF0 = EU_DEPENDENCY_ID_GRF0 + XE2_MAX_GRF,
      /* Address register part of the ARF.  24 MRF slots are reserved above. */
      EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_MRF0 + 24,
      /* Accumulator register part of the ARF. */
      EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
      /* Flag register part of the ARF. */
      EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
      /* SBID token write completion.  Only used on Gfx12+. */
      EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
      /* SBID token read completion.  Only used on Gfx12+. */
      EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 32,
      /* Number of computation dependencies currently tracked. */
      EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 32
   };

   /**
    * State of our modeling of the program execution.
    */
   struct state {
      /* Value-initializes the arrays to zero; weight starts at 1.0 so early
       * instructions are fully accounted in the utilization estimate.
       */
      state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
      /**
       * Time at which a given unit will be ready to execute the next
       * computation, in clock units.
       */
      unsigned unit_ready[EU_NUM_UNITS];
      /**
       * Time at which an instruction dependent on a given dependency ID will
       * be ready to execute, in clock units.
       */
      unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
      /**
       * Aggregated utilization of a given unit excluding idle cycles,
       * in clock units.
       */
      float unit_busy[EU_NUM_UNITS];
      /**
       * Factor of the overhead of a computation accounted for in the
       * aggregated utilization calculation.
       */
      float weight;
   };

   /**
    * Information derived from an IR instruction used to compute performance
    * estimates.  Allows the timing calculation to work on both FS and VEC4
    * instructions.
    */
   struct instruction_info {
      /* FS flavor.  Note that sc (bank-conflict penalty) is set to the full
       * destination size whenever has_bank_conflict() reports a conflict.
       */
      instruction_info(const struct brw_isa_info *isa, const fs_inst *inst) :
         isa(isa), devinfo(isa->devinfo), op(inst->opcode),
         td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
         tx(get_exec_type(inst)), sx(0), ss(0),
         sc(has_bank_conflict(isa, inst) ? sd : 0),
         desc(inst->desc), sfid(inst->sfid)
      {
         /* We typically want the maximum source size, except for split send
          * messages which require the total size.
          */
         if (inst->opcode == SHADER_OPCODE_SEND) {
            /* Split sends carry their two payloads in src[2] and src[3]. */
            ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
                 DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
         } else {
            for (unsigned i = 0; i < inst->sources; i++)
               ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
         }

         /* Convert the execution size to GRF units. */
         sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);

         /* 32x32 integer multiplication has half the usual ALU throughput.
          * Treat it as double-precision.
          */
         if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
             !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
             type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
            tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);

         /* rcount is only meaningful for DPAS; zero otherwise. */
         rcount = inst->opcode == BRW_OPCODE_DPAS ? inst->rcount : 0;
      }

      /* VEC4 flavor.  No bank-conflict modeling (sc = 0) and no DPAS
       * (rcount = 0) on this path.
       */
      instruction_info(const struct brw_isa_info *isa,
                       const vec4_instruction *inst) :
         isa(isa), devinfo(isa->devinfo), op(inst->opcode),
         td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
         tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
         desc(inst->desc), sfid(inst->sfid), rcount(0)
      {
         /* Compute the maximum source size. */
         for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
            ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));

         /* Convert the execution size to GRF units. */
         sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);

         /* 32x32 integer multiplication has half the usual ALU throughput.
          * Treat it as double-precision.
          */
         if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
             !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
             type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
            tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
      }

      /** ISA encoding information */
      const struct brw_isa_info *isa;
      /** Device information. */
      const struct intel_device_info *devinfo;
      /** Instruction opcode. */
      opcode op;
      /** Destination type. */
      brw_reg_type td;
      /** Destination size in GRF units. */
      unsigned sd;
      /** Execution type. */
      brw_reg_type tx;
      /** Execution size in GRF units. */
      unsigned sx;
      /** Source size. */
      unsigned ss;
      /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
      unsigned sc;
      /** Send message descriptor. */
      uint32_t desc;
      /** Send message shared function ID. */
      uint8_t sfid;
      /** Repeat count for DPAS instructions. */
      uint8_t rcount;
   };

   /**
    * Timing information of an instruction used to estimate the performance of
    * the program.
    */
   struct perf_desc {
      perf_desc(enum intel_eu_unit u, int df, int db,
                int ls, int ld, int la, int lf) :
         u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}

      /**
       * Back-end unit its runtime shall be accounted to, in addition to the
       * EU front-end which is always assumed to be involved.
       */
      enum intel_eu_unit u;
      /**
       * Overhead cycles from the time that the EU front-end starts executing
       * the instruction until it's ready to execute the next instruction.
       */
      int df;
      /**
       * Overhead cycles from the time that the back-end starts executing the
       * instruction until it's ready to execute the next instruction.
       */
      int db;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its sources have been read from the register file.
       */
      int ls;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its regular destination has been written to the
       * register file.
       */
      int ld;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its accumulator destination has been written to the
       * ARF file.
       *
       * Note that this is an approximation of the real behavior of
       * accumulating instructions in the hardware: Instead of modeling a pair
       * of back-to-back accumulating instructions as a first computation with
       * latency equal to ld followed by another computation with a
       * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
       * model the stall as if it occurred at the top of the pipeline, with
       * the latency of the accumulator computation offset accordingly.
       */
      int la;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its flag destination has been written to the ARF
       * file.
       */
      int lf;
   };

   /**
    * Compute the timing information of an instruction based on any relevant
    * information from the IR and a number of parameters specifying a linear
    * approximation: Parameter X_Y specifies the derivative of timing X
    * relative to info field Y, while X_1 specifies the independent term of
    * the approximation of timing X.
    */
   perf_desc
   calculate_desc(const instruction_info &info, enum intel_eu_unit u,
                  int df_1, int df_sd, int df_sc,
                  int db_1, int db_sx,
                  int ls_1, int ld_1, int la_1, int lf_1,
                  int l_ss, int l_sd)
   {
      /* Note that la and lf take only their independent terms -- the
       * accumulator and flag latencies are not scaled by operand sizes here.
       */
      return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
                       db_1 + db_sx * int(info.sx),
                       ls_1 + l_ss * int(info.ss),
                       ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
                       la_1, lf_1);
   }

   /**
    * Compute the timing information of an instruction based on any relevant
    * information from the IR and a number of linear approximation parameters
    * hard-coded for each IR instruction.
    *
    * Most timing parameters are obtained from the multivariate linear
    * regression of a sample of empirical timings measured using the tm0
    * register (as can be done today by using the shader_time debugging
    * option).  The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
    * "Shared Functions - Extended Math", Section 3.2 "Performance".
+ * Parameters marked XXX shall be considered low-quality, they're possibly + * high variance or completely guessed in cases where experimental data was + * unavailable. + */ + const perf_desc + instruction_desc(const instruction_info &info) + { + const struct intel_device_info *devinfo = info.devinfo; + + switch (info.op) { + case BRW_OPCODE_SYNC: + case BRW_OPCODE_SEL: + case BRW_OPCODE_NOT: + case BRW_OPCODE_AND: + case BRW_OPCODE_OR: + case BRW_OPCODE_XOR: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_DIM: + case BRW_OPCODE_ASR: + case BRW_OPCODE_CMPN: + case BRW_OPCODE_F16TO32: + case BRW_OPCODE_BFREV: + case BRW_OPCODE_BFI1: + case BRW_OPCODE_AVG: + case BRW_OPCODE_FRC: + case BRW_OPCODE_RNDU: + case BRW_OPCODE_RNDD: + case BRW_OPCODE_RNDE: + case BRW_OPCODE_RNDZ: + case BRW_OPCODE_MAC: + case BRW_OPCODE_MACH: + case BRW_OPCODE_LZD: + case BRW_OPCODE_FBH: + case BRW_OPCODE_FBL: + case BRW_OPCODE_CBIT: + case BRW_OPCODE_ADDC: + case BRW_OPCODE_ROR: + case BRW_OPCODE_ROL: + case BRW_OPCODE_SUBB: + case BRW_OPCODE_SAD2: + case BRW_OPCODE_SADA2: + case BRW_OPCODE_LINE: + case BRW_OPCODE_NOP: + case SHADER_OPCODE_CLUSTER_BROADCAST: + case SHADER_OPCODE_SCRATCH_HEADER: + case FS_OPCODE_DDX_COARSE: + case FS_OPCODE_DDX_FINE: + case FS_OPCODE_DDY_COARSE: + case FS_OPCODE_PIXEL_X: + case FS_OPCODE_PIXEL_Y: + case FS_OPCODE_SET_SAMPLE_ID: + case VEC4_OPCODE_MOV_BYTES: + case VEC4_OPCODE_UNPACK_UNIFORM: + case VEC4_OPCODE_DOUBLE_TO_F32: + case VEC4_OPCODE_DOUBLE_TO_D32: + case VEC4_OPCODE_DOUBLE_TO_U32: + case VEC4_OPCODE_TO_DOUBLE: + case VEC4_OPCODE_PICK_LOW_32BIT: + case VEC4_OPCODE_PICK_HIGH_32BIT: + case VEC4_OPCODE_SET_LOW_32BIT: + case VEC4_OPCODE_SET_HIGH_32BIT: + case VEC4_OPCODE_ZERO_OOB_PUSH_REGS: + case GS_OPCODE_SET_DWORD_2: + case GS_OPCODE_SET_WRITE_OFFSET: + case GS_OPCODE_SET_VERTEX_COUNT: + case GS_OPCODE_PREPARE_CHANNEL_MASKS: + case GS_OPCODE_SET_CHANNEL_MASKS: + case GS_OPCODE_GET_INSTANCE_ID: + case GS_OPCODE_SET_PRIMITIVE_ID: + 
case GS_OPCODE_SVB_SET_DST_INDEX: + case TCS_OPCODE_SRC0_010_IS_ZERO: + case TCS_OPCODE_GET_PRIMITIVE_ID: + case TES_OPCODE_GET_PRIMITIVE_ID: + case SHADER_OPCODE_READ_SR_REG: + if (devinfo->ver >= 11) { + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 10, 6 /* XXX */, 14, 0, 0); + } else if (devinfo->ver >= 8) { + if (type_sz(info.tx) > 4) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 8, 4, 12, 0, 0); + } else if (devinfo->verx10 >= 75) { + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 10, 6 /* XXX */, 16, 0, 0); + } else { + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18, 0, 0); + } + + case BRW_OPCODE_MOV: + case BRW_OPCODE_CMP: + case BRW_OPCODE_ADD: + case BRW_OPCODE_ADD3: + case BRW_OPCODE_MUL: + case SHADER_OPCODE_MOV_RELOC_IMM: + case VEC4_OPCODE_MOV_FOR_SCRATCH: + if (devinfo->ver >= 11) { + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 10, 6, 14, 0, 0); + } else if (devinfo->ver >= 8) { + if (type_sz(info.tx) > 4) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 8, 4, 12, 0, 0); + } else if (devinfo->verx10 >= 75) { + if (info.tx == BRW_REGISTER_TYPE_F) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 10, 6 /* XXX */, 16, 0, 0); + } else if (devinfo->ver >= 7) { + if (info.tx == BRW_REGISTER_TYPE_F) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 14, 10 /* XXX */, 20, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18, 0, 0); + } else { + return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0, + 0, 2 /* XXX */, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 
0); + } + + case BRW_OPCODE_BFE: + case BRW_OPCODE_BFI2: + case BRW_OPCODE_CSEL: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + abort(); + + case BRW_OPCODE_MAD: + if (devinfo->ver >= 11) { + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + } else if (devinfo->ver >= 8) { + if (type_sz(info.tx) > 4) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + } else if (devinfo->verx10 >= 75) { + if (info.tx == BRW_REGISTER_TYPE_F) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 12, 8 /* XXX */, 18, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 16, 0, 0); + } else if (devinfo->ver >= 7) { + if (info.tx == BRW_REGISTER_TYPE_F) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 14, 10 /* XXX */, 20, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 12, 8 /* XXX */, 18, 0, 0); + } else if (devinfo->ver >= 6) { + return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 1 /* XXX */, + 0, 2 /* XXX */, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 0); + } else { + abort(); + } + + case BRW_OPCODE_F32TO16: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, 
EU_UNIT_FPU, 0, 4, 0, 0, 4, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + abort(); + + case BRW_OPCODE_DP4: + case BRW_OPCODE_DPH: + case BRW_OPCODE_DP3: + case BRW_OPCODE_DP2: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + + case BRW_OPCODE_DP4A: + if (devinfo->ver >= 12) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else + abort(); + + case BRW_OPCODE_DPAS: { + unsigned ld; + + switch (info.rcount) { + case 1: + ld = 21; + break; + case 2: + ld = 22; + break; + case 8: + default: + ld = 32; + break; + } + + /* DPAS cannot write the accumulator or the flags, so pass UINT_MAX + * for la and lf. 
+ */ + if (devinfo->verx10 >= 125) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, ld, UINT_MAX, UINT_MAX, 0, 0); + else + abort(); + } + + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + if (devinfo->ver >= 6) { + switch (info.op) { + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4, + 0, 16, 0, 0, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2, + 0, 12, 0, 0, 0, 0); + else + return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2, + 0, 14, 0, 0, 0, 0); + + case SHADER_OPCODE_POW: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8, + 0, 24, 0, 0, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4, + 0, 20, 0, 0, 0, 0); + else + return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4, + 0, 22, 0, 0, 0, 0); + + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0, + 0, 28 /* XXX */, 0, 0, 0, 0); + + default: + abort(); + } + } else { + switch (info.op) { + case SHADER_OPCODE_RCP: + return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 8, + 0, 22, 0, 0, 0, 8); + + case SHADER_OPCODE_RSQ: + return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 16, + 0, 44, 0, 0, 0, 8); + + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_LOG2: + return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 24, + 0, 66, 0, 0, 0, 8); + + case SHADER_OPCODE_INT_REMAINDER: + case SHADER_OPCODE_EXP2: + return 
calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 32, + 0, 88, 0, 0, 0, 8); + + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 48, + 0, 132, 0, 0, 0, 8); + + case SHADER_OPCODE_POW: + return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 64, + 0, 176, 0, 0, 0, 8); + + default: + abort(); + } + } + + case BRW_OPCODE_DO: + if (devinfo->ver >= 6) + return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + else + return calculate_desc(info, EU_UNIT_NULL, 2 /* XXX */, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + + case BRW_OPCODE_IF: + case BRW_OPCODE_ELSE: + case BRW_OPCODE_ENDIF: + case BRW_OPCODE_WHILE: + case BRW_OPCODE_BREAK: + case BRW_OPCODE_CONTINUE: + case BRW_OPCODE_HALT: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_NULL, 6, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + else + return calculate_desc(info, EU_UNIT_NULL, 2, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + + case FS_OPCODE_LINTERP: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + + case BRW_OPCODE_LRP: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->ver >= 6) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + abort(); + + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 20, 
6, 0, 0, 6, + 0, 10 /* XXX */, 6 /* XXX */, + 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6, + 0, 8 /* XXX */, 4 /* XXX */, + 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6, + 0, 10 /* XXX */, 6 /* XXX */, + 16 /* XXX */, 0, 0); + else if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_FPU, 24, 6, 0, 0, 6, + 0, 12 /* XXX */, 8 /* XXX */, + 18 /* XXX */, 0, 0); + else + abort(); + + case SHADER_OPCODE_MOV_INDIRECT: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0, + 0, 10 /* XXX */, 6 /* XXX */, + 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0, + 0, 8 /* XXX */, 4 /* XXX */, + 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0, + 0, 10 /* XXX */, 6 /* XXX */, + 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0, + 0, 12 /* XXX */, 8 /* XXX */, + 18 /* XXX */, 0, 0); + + case SHADER_OPCODE_BROADCAST: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_FPU, 20, 0, 0, 4, 0, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + abort(); + + case SHADER_OPCODE_FIND_LIVE_CHANNEL: + case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 
2, 0, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 36, 0, 0, 6, 0, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_FPU, 40, 0, 0, 6, 0, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + abort(); + + case SHADER_OPCODE_RND_MODE: + case SHADER_OPCODE_FLOAT_CONTROL_MODE: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 0, 0, 0, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 0, 0, 0, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 0, 0, 0, 0, 0); + else if (devinfo->ver >= 6) + return calculate_desc(info, EU_UNIT_FPU, 28 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 0, 0, 0, 0, 0); + else + abort(); + + case SHADER_OPCODE_SHUFFLE: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0, + 44 /* XXX */, 0, + 0, 10 /* XXX */, 6 /* XXX */, + 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0, + 42 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, + 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0, 44 /* XXX */, 0, + 0, 44 /* XXX */, + 0, 10 /* XXX */, 6 /* XXX */, + 16 /* XXX */, 0, 0); + else if (devinfo->ver >= 6) + return calculate_desc(info, EU_UNIT_FPU, 0, 46 /* XXX */, 0, + 0, 46 /* XXX */, + 0, 12 /* XXX */, 8 /* XXX */, + 18 /* XXX */, 0, 0); + else + abort(); + + case SHADER_OPCODE_SEL_EXEC: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0, + 0, 4 /* XXX */, + 0, 10 /* XXX */, 6 /* XXX */, + 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0, + 0, 4 /* XXX */, + 
0, 8 /* XXX */, 4 /* XXX */, + 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0, + 0, 4 /* XXX */, + 0, 10 /* XXX */, 6 /* XXX */, + 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 4 /* XXX */, 0, + 0, 4 /* XXX */, + 0, 12 /* XXX */, 8 /* XXX */, + 18 /* XXX */, 0, 0); + + case SHADER_OPCODE_QUAD_SWIZZLE: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0, + 0, 8 /* XXX */, + 0, 10 /* XXX */, 6 /* XXX */, + 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0, + 0, 8 /* XXX */, + 0, 8 /* XXX */, 4 /* XXX */, + 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0, + 0, 8 /* XXX */, + 0, 10 /* XXX */, 6 /* XXX */, + 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0, + 0, 8 /* XXX */, + 0, 12 /* XXX */, 8 /* XXX */, + 18 /* XXX */, 0, 0); + + case FS_OPCODE_DDY_FINE: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0); + + case FS_OPCODE_LOAD_LIVE_CHANNELS: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0, + 2 /* XXX */, 0, + 0, 0, 0, 10 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0, + 0, 2 /* XXX */, + 0, 0, 0, 8 /* XXX */, 0, 0); + else + abort(); + + case VEC4_OPCODE_PACK_BYTES: + 
if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, + 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, + 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 0); + + case VS_OPCODE_UNPACK_FLAGS_SIMD4X2: + case TCS_OPCODE_GET_INSTANCE_ID: + case VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS: + case VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + case TES_OPCODE_CREATE_INPUT_READ_HEADER: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 22 /* XXX */, 0, 0, + 6 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, + 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 26 /* XXX */, 0, 0, + 6 /* XXX */, 0, + 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, + 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 30 /* XXX */, 0, 0, + 6 /* XXX */, 0, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 0); + + case GS_OPCODE_FF_SYNC_SET_PRIMITIVES: + case TCS_OPCODE_CREATE_BARRIER_HEADER: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 32 /* XXX */, 0, 0, + 8 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, + 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 38 /* XXX */, 0, 0, + 8 /* XXX */, 0, + 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, + 0, 0); + else if (devinfo->ver >= 6) + return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0, + 8 /* XXX */, 0, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 0); + else + abort(); + + case TES_OPCODE_ADD_INDIRECT_URB_OFFSET: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, + 0, 0); + else if (devinfo->verx10 >= 75) + return 
calculate_desc(info, EU_UNIT_FPU, 14 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, + 0, 0); + else if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_FPU, 16 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 0); + else + abort(); + + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_LZ: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXL_LZ: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_UMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_GET_BUFFER_SIZE: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_SAMPLEINFO: + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: + return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */, + 8 /* XXX */, 750 /* XXX */, 0, 0, + 2 /* XXX */, 0); + + case VEC4_OPCODE_URB_READ: + case VEC4_VS_OPCODE_URB_WRITE: + case VEC4_GS_OPCODE_URB_WRITE: + case VEC4_GS_OPCODE_URB_WRITE_ALLOCATE: + case GS_OPCODE_THREAD_END: + case GS_OPCODE_FF_SYNC: + case VEC4_TCS_OPCODE_URB_WRITE: + case TCS_OPCODE_RELEASE_INPUT: + case TCS_OPCODE_THREAD_END: + return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */, + 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0); + + case SHADER_OPCODE_MEMORY_FENCE: + case SHADER_OPCODE_INTERLOCK: + switch (info.sfid) { + case GFX6_SFID_DATAPORT_RENDER_CACHE: + if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0, + 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0); + else + abort(); + + case BRW_SFID_URB: + case GFX7_SFID_DATAPORT_DATA_CACHE: + case GFX12_SFID_SLM: + case GFX12_SFID_TGM: + case GFX12_SFID_UGM: + case HSW_SFID_DATAPORT_DATA_CACHE_1: + if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0, + 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); + else + abort(); 
+ + default: + abort(); + } + + case SHADER_OPCODE_GFX4_SCRATCH_READ: + case SHADER_OPCODE_GFX4_SCRATCH_WRITE: + case SHADER_OPCODE_GFX7_SCRATCH_READ: + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 0, 8 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); + + case VEC4_OPCODE_UNTYPED_ATOMIC: + if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 30 /* XXX */, 400 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 400 /* XXX */); + else + abort(); + + case VEC4_OPCODE_UNTYPED_SURFACE_READ: + case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: + if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 0, 20 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 0); + else + abort(); + + case FS_OPCODE_FB_WRITE: + case FS_OPCODE_FB_READ: + case FS_OPCODE_REP_FB_WRITE: + return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 0, 450 /* XXX */, + 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0); + + case GS_OPCODE_SVB_WRITE: + if (devinfo->ver >= 6) + return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0, + 0, 450 /* XXX */, + 10 /* XXX */, 300 /* XXX */, 0, 0, + 0, 0); + else + abort(); + + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); + + case VS_OPCODE_PULL_CONSTANT_LOAD: + case VS_OPCODE_PULL_CONSTANT_LOAD_GFX7: + return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16, + 8, 750, 0, 0, 2, 0); + + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0, + 0, 90 /* XXX */, 0, 0, 0, 0); + else + abort(); + + case SHADER_OPCODE_BARRIER: + if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0, + 0 /* XXX */, 0, + 0, 0, 0, 0, 0, 0); + else + abort(); + + case CS_OPCODE_CS_TERMINATE: + if (devinfo->ver >= 7) + return 
calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0, + 10 /* XXX */, 0, 0, 0, 0, 0); + else + abort(); + + case SHADER_OPCODE_SEND: + switch (info.sfid) { + case GFX6_SFID_DATAPORT_CONSTANT_CACHE: + if (devinfo->ver >= 7) { + /* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */ + return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); + } else { + abort(); + } + case GFX6_SFID_DATAPORT_RENDER_CACHE: + if (devinfo->ver >= 7) { + switch (brw_dp_desc_msg_type(devinfo, info.desc)) { + case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP: + return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, + 30 /* XXX */, 450 /* XXX */, + 10 /* XXX */, 100 /* XXX */, + 0, 0, 0, 400 /* XXX */); + default: + return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, + 0, 450 /* XXX */, + 10 /* XXX */, 300 /* XXX */, 0, 0, + 0, 0); + } + } else if (devinfo->ver >= 6) { + return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0, + 0, 450 /* XXX */, + 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0); + } else { + abort(); + } + case BRW_SFID_SAMPLER: { + if (devinfo->ver >= 6) + return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16, + 8, 750, 0, 0, 2, 0); + else + abort(); + } + case GFX7_SFID_DATAPORT_DATA_CACHE: + case HSW_SFID_DATAPORT_DATA_CACHE_1: + if (devinfo->verx10 >= 75) { + switch (brw_dp_desc_msg_type(devinfo, info.desc)) { + case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP: + case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2: + case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2: + case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP: + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 30 /* XXX */, 400 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 400 /* XXX */); + + default: + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 0, 20 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 0); + } + } else if (devinfo->ver >= 7) { + switch (brw_dp_desc_msg_type(devinfo, info.desc)) { + case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP: + return 
calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 30 /* XXX */, 400 /* XXX */, + 10 /* XXX */, 100 /* XXX */, + 0, 0, 0, 400 /* XXX */); + default: + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 0, 20 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 0); + } + } else { + abort(); + } + + case GFX7_SFID_PIXEL_INTERPOLATOR: + if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0, + 0, 90 /* XXX */, 0, 0, 0, 0); + else + abort(); + + case GFX12_SFID_UGM: + case GFX12_SFID_TGM: + case GFX12_SFID_SLM: + switch (lsc_msg_desc_opcode(devinfo, info.desc)) { + case LSC_OP_LOAD: + case LSC_OP_STORE: + case LSC_OP_LOAD_CMASK: + case LSC_OP_STORE_CMASK: + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 0, 20 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 0); + + case LSC_OP_FENCE: + case LSC_OP_ATOMIC_INC: + case LSC_OP_ATOMIC_DEC: + case LSC_OP_ATOMIC_LOAD: + case LSC_OP_ATOMIC_STORE: + case LSC_OP_ATOMIC_ADD: + case LSC_OP_ATOMIC_SUB: + case LSC_OP_ATOMIC_MIN: + case LSC_OP_ATOMIC_MAX: + case LSC_OP_ATOMIC_UMIN: + case LSC_OP_ATOMIC_UMAX: + case LSC_OP_ATOMIC_CMPXCHG: + case LSC_OP_ATOMIC_FADD: + case LSC_OP_ATOMIC_FSUB: + case LSC_OP_ATOMIC_FMIN: + case LSC_OP_ATOMIC_FMAX: + case LSC_OP_ATOMIC_FCMPXCHG: + case LSC_OP_ATOMIC_AND: + case LSC_OP_ATOMIC_OR: + case LSC_OP_ATOMIC_XOR: + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 30 /* XXX */, 400 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 400 /* XXX */); + default: + abort(); + } + + case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: + case GEN_RT_SFID_RAY_TRACE_ACCELERATOR: + return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0, + 10 /* XXX */, 0, 0, 0, 0, 0); + + case BRW_SFID_URB: + return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */, + 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0); + + default: + abort(); + } + + case SHADER_OPCODE_UNDEF: + case SHADER_OPCODE_HALT_TARGET: + case FS_OPCODE_SCHEDULING_FENCE: + return 
calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + + default: + abort(); + } + } + + /** + * Model the performance behavior of a stall on the specified dependency + * ID. + */ + void + stall_on_dependency(state &st, enum intel_eu_dependency_id id) + { + if (id < ARRAY_SIZE(st.dep_ready)) + st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE], + st.dep_ready[id]); + } + + /** + * Model the performance behavior of the front-end and back-end while + * executing an instruction with the specified timing information, assuming + * all dependencies are already clear. + */ + void + execute_instruction(state &st, const perf_desc &perf) + { + /* Compute the time at which the front-end will be ready to execute the + * next instruction. + */ + st.unit_ready[EU_UNIT_FE] += perf.df; + + if (perf.u < EU_NUM_UNITS) { + /* Wait for the back-end to be ready to execute this instruction. */ + st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE], + st.unit_ready[perf.u]); + + /* Compute the time at which the back-end will be ready to execute + * the next instruction, and update the back-end utilization. + */ + st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db; + st.unit_busy[perf.u] += perf.db * st.weight; + } + } + + /** + * Model the performance behavior of a read dependency provided by an + * instruction. + */ + void + mark_read_dependency(state &st, const perf_desc &perf, + enum intel_eu_dependency_id id) + { + if (id < ARRAY_SIZE(st.dep_ready)) + st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls; + } + + /** + * Model the performance behavior of a write dependency provided by an + * instruction. 
+ */ + void + mark_write_dependency(state &st, const perf_desc &perf, + enum intel_eu_dependency_id id) + { + if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0) + st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la; + else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0) + st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf; + else if (id < ARRAY_SIZE(st.dep_ready)) + st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld; + } + + /** + * Return the dependency ID of a backend_reg, offset by \p delta GRFs. + */ + enum intel_eu_dependency_id + reg_dependency_id(const intel_device_info *devinfo, const backend_reg &r, + const int delta) + { + if (r.file == VGRF) { + const unsigned i = r.nr + r.offset / REG_SIZE + delta; + assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i); + + } else if (r.file == FIXED_GRF) { + const unsigned i = r.nr + delta; + assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i); + + } else if (r.file == MRF && devinfo->ver >= 7) { + const unsigned i = GFX7_MRF_HACK_START + + r.nr + r.offset / REG_SIZE + delta; + assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i); + + } else if (r.file == MRF && devinfo->ver < 7) { + const unsigned i = (r.nr & ~BRW_MRF_COMPR4) + + r.offset / REG_SIZE + delta; + assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_MRF0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_MRF0 + i); + + } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS && + r.nr < BRW_ARF_ACCUMULATOR) { + assert(delta == 0); + return EU_DEPENDENCY_ID_ADDR0; + + } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR && + r.nr < BRW_ARF_FLAG) { + const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta; + assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 
+ i); + + } else { + return EU_NUM_DEPENDENCY_IDS; + } + } + + /** + * Return the dependency ID of flag register starting at offset \p i. + */ + enum intel_eu_dependency_id + flag_dependency_id(unsigned i) + { + assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i); + } + + /** + * Return the dependency ID corresponding to the SBID read completion + * condition of a Gfx12+ SWSB. + */ + enum intel_eu_dependency_id + tgl_swsb_rd_dependency_id(tgl_swsb swsb) + { + if (swsb.mode) { + assert(swsb.sbid < + EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid); + } else { + return EU_NUM_DEPENDENCY_IDS; + } + } + + /** + * Return the dependency ID corresponding to the SBID write completion + * condition of a Gfx12+ SWSB. + */ + enum intel_eu_dependency_id + tgl_swsb_wr_dependency_id(tgl_swsb swsb) + { + if (swsb.mode) { + assert(swsb.sbid < + EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid); + } else { + return EU_NUM_DEPENDENCY_IDS; + } + } + + /** + * Return the implicit accumulator register accessed by channel \p i of the + * instruction. + */ + unsigned + accum_reg_of_channel(const intel_device_info *devinfo, + const backend_instruction *inst, + brw_reg_type tx, unsigned i) + { + assert(inst->reads_accumulator_implicitly() || + inst->writes_accumulator_implicitly(devinfo)); + const unsigned offset = (inst->group + i) * type_sz(tx) * + (devinfo->ver < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2); + return offset / (reg_unit(devinfo) * REG_SIZE) % 2; + } + + /** + * Model the performance behavior of an FS back-end instruction. 
+ */ + void + issue_fs_inst(state &st, const struct brw_isa_info *isa, + const backend_instruction *be_inst) + { + const struct intel_device_info *devinfo = isa->devinfo; + const fs_inst *inst = static_cast(be_inst); + const instruction_info info(isa, inst); + const perf_desc perf = instruction_desc(info); + + /* Stall on any source dependencies. */ + for (unsigned i = 0; i < inst->sources; i++) { + for (unsigned j = 0; j < regs_read(inst, i); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, inst->src[i], j)); + } + + if (inst->reads_accumulator_implicitly()) { + for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); + j <= accum_reg_of_channel(devinfo, inst, info.tx, + inst->exec_size - 1); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); + } + + if (is_send(inst) && inst->base_mrf != -1) { + for (unsigned j = 0; j < inst->mlen; j++) + stall_on_dependency( + st, reg_dependency_id( + devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); + } + + if (const unsigned mask = inst->flags_read(devinfo)) { + for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) { + if (mask & (1 << i)) + stall_on_dependency(st, flag_dependency_id(i)); + } + } + + /* Stall on any write dependencies. */ + if (!inst->no_dd_check) { + if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { + for (unsigned j = 0; j < regs_written(inst); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, inst->dst, j)); + } + + if (inst->writes_accumulator_implicitly(devinfo)) { + for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); + j <= accum_reg_of_channel(devinfo, inst, info.tx, + inst->exec_size - 1); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); + } + + if (const unsigned mask = inst->flags_written(devinfo)) { + for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) { + if (mask & (1 << i)) + stall_on_dependency(st, flag_dependency_id(i)); + } + } + } + + /* Stall on any SBID dependencies. 
*/ + if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST)) + stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched)); + else if (inst->sched.mode & TGL_SBID_SRC) + stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched)); + + /* Execute the instruction. */ + execute_instruction(st, perf); + + /* Mark any source dependencies. */ + if (inst->is_send_from_grf()) { + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->is_payload(i)) { + for (unsigned j = 0; j < regs_read(inst, i); j++) + mark_read_dependency( + st, perf, reg_dependency_id(devinfo, inst->src[i], j)); + } + } + } + + if (is_send(inst) && inst->base_mrf != -1) { + for (unsigned j = 0; j < inst->mlen; j++) + mark_read_dependency(st, perf, + reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); + } + + /* Mark any destination dependencies. */ + if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { + for (unsigned j = 0; j < regs_written(inst); j++) { + mark_write_dependency(st, perf, + reg_dependency_id(devinfo, inst->dst, j)); + } + } + + if (inst->writes_accumulator_implicitly(devinfo)) { + for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); + j <= accum_reg_of_channel(devinfo, inst, info.tx, + inst->exec_size - 1); j++) + mark_write_dependency(st, perf, + reg_dependency_id(devinfo, brw_acc_reg(8), j)); + } + + if (const unsigned mask = inst->flags_written(devinfo)) { + for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) { + if (mask & (1 << i)) + mark_write_dependency(st, perf, flag_dependency_id(i)); + } + } + + /* Mark any SBID dependencies. */ + if (inst->sched.mode & TGL_SBID_SET) { + mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched)); + mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched)); + } + } + + /** + * Model the performance behavior of a VEC4 back-end instruction. 
+ */ + void + issue_vec4_instruction(state &st, const struct brw_isa_info *isa, + const backend_instruction *be_inst) + { + const struct intel_device_info *devinfo = isa->devinfo; + const vec4_instruction *inst = + static_cast(be_inst); + const instruction_info info(isa, inst); + const perf_desc perf = instruction_desc(info); + + /* Stall on any source dependencies. */ + for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) { + for (unsigned j = 0; j < regs_read(inst, i); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, inst->src[i], j)); + } + + if (inst->reads_accumulator_implicitly()) { + for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); + j <= accum_reg_of_channel(devinfo, inst, info.tx, + inst->exec_size - 1); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); + } + + if (inst->base_mrf != -1) { + for (unsigned j = 0; j < inst->mlen; j++) + stall_on_dependency( + st, reg_dependency_id( + devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); + } + + if (inst->reads_flag()) + stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0); + + /* Stall on any write dependencies. */ + if (!inst->no_dd_check) { + if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { + for (unsigned j = 0; j < regs_written(inst); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, inst->dst, j)); + } + + if (inst->writes_accumulator_implicitly(devinfo)) { + for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); + j <= accum_reg_of_channel(devinfo, inst, info.tx, + inst->exec_size - 1); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); + } + + if (inst->writes_flag(devinfo)) + stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0); + } + + /* Execute the instruction. */ + execute_instruction(st, perf); + + /* Mark any source dependencies. 
*/ + if (inst->is_send_from_grf()) { + for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) { + for (unsigned j = 0; j < regs_read(inst, i); j++) + mark_read_dependency( + st, perf, reg_dependency_id(devinfo, inst->src[i], j)); + } + } + + if (inst->base_mrf != -1) { + for (unsigned j = 0; j < inst->mlen; j++) + mark_read_dependency(st, perf, + reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); + } + + /* Mark any destination dependencies. */ + if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { + for (unsigned j = 0; j < regs_written(inst); j++) { + mark_write_dependency(st, perf, + reg_dependency_id(devinfo, inst->dst, j)); + } + } + + if (inst->writes_accumulator_implicitly(devinfo)) { + for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); + j <= accum_reg_of_channel(devinfo, inst, info.tx, + inst->exec_size - 1); j++) + mark_write_dependency(st, perf, + reg_dependency_id(devinfo, brw_acc_reg(8), j)); + } + + if (inst->writes_flag(devinfo)) + mark_write_dependency(st, perf, EU_DEPENDENCY_ID_FLAG0); + } + + /** + * Calculate the maximum possible throughput of the program compatible with + * the cycle-count utilization estimated for each asynchronous unit, in + * threads-per-cycle units. + */ + float + calculate_thread_throughput(const state &st, float busy) + { + for (unsigned i = 0; i < EU_NUM_UNITS; i++) + busy = MAX2(busy, st.unit_busy[i]); + + return 1.0 / busy; + } + + /** + * Estimate the performance of the specified shader. + */ + void + calculate_performance(performance &p, const backend_shader *s, + void (*issue_instruction)( + state &, const struct brw_isa_info *, + const backend_instruction *), + unsigned dispatch_width) + { + /* XXX - Note that the previous version of this code used worst-case + * scenario estimation of branching divergence for SIMD32 shaders, + * but this heuristic was removed to improve performance in common + * scenarios. Wider shader variants are less optimal when divergence + * is high, e.g. 
when application renders complex scene on a small + * surface. It is assumed that such renders are short, so their + * time doesn't matter and when it comes to the overall performance, + * they are dominated by more optimal larger renders. + * + * It's possible that we could do better with divergence analysis + * by isolating branches which are 100% uniform. + * + * Plumbing the trip counts from NIR loop analysis would allow us + * to do a better job regarding the loop weights. + * + * In the meantime use values that roughly match the control flow + * weights used elsewhere in the compiler back-end. + * + * Note that we provide slightly more pessimistic weights on + * Gfx12+ for SIMD32, since the effective warp size on that + * platform is 2x the SIMD width due to EU fusion, which increases + * the likelihood of divergent control flow in comparison to + * previous generations, giving narrower SIMD modes a performance + * advantage in several test-cases with non-uniform discard jumps. + */ + const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ? 
+ 1.0 : 0.5); + const float loop_weight = 10; + unsigned halt_count = 0; + unsigned elapsed = 0; + state st; + + foreach_block(block, s->cfg) { + const unsigned elapsed0 = elapsed; + + foreach_inst_in_block(backend_instruction, inst, block) { + const unsigned clock0 = st.unit_ready[EU_UNIT_FE]; + + issue_instruction(st, &s->compiler->isa, inst); + + if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count) + st.weight /= discard_weight; + + elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight; + + if (inst->opcode == BRW_OPCODE_DO) + st.weight *= loop_weight; + else if (inst->opcode == BRW_OPCODE_WHILE) + st.weight /= loop_weight; + else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++) + st.weight *= discard_weight; + } + + p.block_latency[block->num] = elapsed - elapsed0; + } + + p.latency = elapsed; + p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed); + } +} + +brw::performance::performance(const fs_visitor *v) : + block_latency(new unsigned[v->cfg->num_blocks]) +{ + calculate_performance(*this, v, issue_fs_inst, v->dispatch_width); +} + +brw::performance::performance(const vec4_visitor *v) : + block_latency(new unsigned[v->cfg->num_blocks]) +{ + calculate_performance(*this, v, issue_vec4_instruction, 8); +} + +brw::performance::~performance() +{ + delete[] block_latency; +} diff --git a/src/intel/compiler/elk/brw_ir_performance.h b/src/intel/compiler/elk/brw_ir_performance.h new file mode 100644 index 00000000000..c3cefe838aa --- /dev/null +++ b/src/intel/compiler/elk/brw_ir_performance.h @@ -0,0 +1,86 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to 
permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_IR_PERFORMANCE_H +#define BRW_IR_PERFORMANCE_H + +class fs_visitor; + +namespace brw { + class vec4_visitor; + + /** + * Various estimates of the performance of a shader based on static + * analysis. + */ + struct performance { + performance(const fs_visitor *v); + performance(const vec4_visitor *v); + ~performance(); + + analysis_dependency_class + dependency_class() const + { + return (DEPENDENCY_INSTRUCTIONS | + DEPENDENCY_BLOCKS); + } + + bool + validate(const backend_shader *) const + { + return true; + } + + /** + * Array containing estimates of the runtime of each basic block of the + * program in cycle units. + */ + unsigned *block_latency; + + /** + * Estimate of the runtime of the whole program in cycle units assuming + * uncontended execution. + */ + unsigned latency; + + /** + * Estimate of the throughput of the whole program in + * invocations-per-cycle units. + * + * Note that this might be lower than the ratio between the dispatch + * width of the program and its latency estimate in cases where + * performance doesn't scale without limits as a function of its thread + * parallelism, e.g. due to the existence of a bottleneck in a shared + * function. 
+ */ + float throughput; + + private: + performance(const performance &perf); + performance & + operator=(performance u); + }; +} + +#endif diff --git a/src/intel/compiler/elk/brw_ir_vec4.h b/src/intel/compiler/elk/brw_ir_vec4.h new file mode 100644 index 00000000000..78d34729c0b --- /dev/null +++ b/src/intel/compiler/elk/brw_ir_vec4.h @@ -0,0 +1,475 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2011-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef BRW_IR_VEC4_H +#define BRW_IR_VEC4_H + +#include "brw_shader.h" + +namespace brw { + +class dst_reg; + +class src_reg : public backend_reg +{ +public: + DECLARE_RALLOC_CXX_OPERATORS(src_reg) + + void init(); + + src_reg(enum brw_reg_file file, int nr, const glsl_type *type); + src_reg(); + src_reg(struct ::brw_reg reg); + + bool equals(const src_reg &r) const; + bool negative_equals(const src_reg &r) const; + + src_reg(class vec4_visitor *v, const struct glsl_type *type); + src_reg(class vec4_visitor *v, const struct glsl_type *type, int size); + + explicit src_reg(const dst_reg ®); + + src_reg *reladdr; +}; + +static inline src_reg +retype(src_reg reg, enum brw_reg_type type) +{ + reg.type = type; + return reg; +} + +namespace detail { + +static inline void +add_byte_offset(backend_reg *reg, unsigned bytes) +{ + switch (reg->file) { + case BAD_FILE: + break; + case VGRF: + case ATTR: + case UNIFORM: + reg->offset += bytes; + assert(reg->offset % 16 == 0); + break; + case MRF: { + const unsigned suboffset = reg->offset + bytes; + reg->nr += suboffset / REG_SIZE; + reg->offset = suboffset % REG_SIZE; + assert(reg->offset % 16 == 0); + break; + } + case ARF: + case FIXED_GRF: { + const unsigned suboffset = reg->subnr + bytes; + reg->nr += suboffset / REG_SIZE; + reg->subnr = suboffset % REG_SIZE; + assert(reg->subnr % 16 == 0); + break; + } + default: + assert(bytes == 0); + } +} + +} /* namespace detail */ + +static inline src_reg +byte_offset(src_reg reg, unsigned bytes) +{ + detail::add_byte_offset(®, bytes); + return reg; +} + +static inline src_reg +offset(src_reg reg, unsigned width, unsigned delta) +{ + const unsigned stride = (reg.file == UNIFORM ? 
0 : 4); + const unsigned num_components = MAX2(width / 4 * stride, 4); + return byte_offset(reg, num_components * type_sz(reg.type) * delta); +} + +static inline src_reg +horiz_offset(src_reg reg, unsigned delta) +{ + return byte_offset(reg, delta * type_sz(reg.type)); +} + +/** + * Reswizzle a given source register. + * \sa brw_swizzle(). + */ +static inline src_reg +swizzle(src_reg reg, unsigned swizzle) +{ + if (reg.file == IMM) + reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swizzle); + else + reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle); + + return reg; +} + +static inline src_reg +negate(src_reg reg) +{ + assert(reg.file != IMM); + reg.negate = !reg.negate; + return reg; +} + +static inline bool +is_uniform(const src_reg ®) +{ + return (reg.file == IMM || reg.file == UNIFORM || reg.is_null()) && + (!reg.reladdr || is_uniform(*reg.reladdr)); +} + +class dst_reg : public backend_reg +{ +public: + DECLARE_RALLOC_CXX_OPERATORS(dst_reg) + + void init(); + + dst_reg(); + dst_reg(enum brw_reg_file file, int nr); + dst_reg(enum brw_reg_file file, int nr, const glsl_type *type, + unsigned writemask); + dst_reg(enum brw_reg_file file, int nr, brw_reg_type type, + unsigned writemask); + dst_reg(struct ::brw_reg reg); + dst_reg(class vec4_visitor *v, const struct glsl_type *type); + + explicit dst_reg(const src_reg ®); + + bool equals(const dst_reg &r) const; + + src_reg *reladdr; +}; + +static inline dst_reg +retype(dst_reg reg, enum brw_reg_type type) +{ + reg.type = type; + return reg; +} + +static inline dst_reg +byte_offset(dst_reg reg, unsigned bytes) +{ + detail::add_byte_offset(®, bytes); + return reg; +} + +static inline dst_reg +offset(dst_reg reg, unsigned width, unsigned delta) +{ + const unsigned stride = (reg.file == UNIFORM ? 
0 : 4); + const unsigned num_components = MAX2(width / 4 * stride, 4); + return byte_offset(reg, num_components * type_sz(reg.type) * delta); +} + +static inline dst_reg +horiz_offset(const dst_reg ®, unsigned delta) +{ + if (is_uniform(src_reg(reg))) + return reg; + else + return byte_offset(reg, delta * type_sz(reg.type)); +} + +static inline dst_reg +writemask(dst_reg reg, unsigned mask) +{ + assert(reg.file != IMM); + assert((reg.writemask & mask) != 0); + reg.writemask &= mask; + return reg; +} + +/** + * Return an integer identifying the discrete address space a register is + * contained in. A register is by definition fully contained in the single + * reg_space it belongs to, so two registers with different reg_space ids are + * guaranteed not to overlap. Most register files are a single reg_space of + * its own, only the VGRF file is composed of multiple discrete address + * spaces, one for each VGRF allocation. + */ +static inline uint32_t +reg_space(const backend_reg &r) +{ + return r.file << 16 | (r.file == VGRF ? r.nr : 0); +} + +/** + * Return the base offset in bytes of a register relative to the start of its + * reg_space(). + */ +static inline unsigned +reg_offset(const backend_reg &r) +{ + return (r.file == VGRF || r.file == IMM ? 0 : r.nr) * + (r.file == UNIFORM ? 16 : REG_SIZE) + r.offset + + (r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0); +} + +/** + * Return whether the register region starting at \p r and spanning \p dr + * bytes could potentially overlap the register region starting at \p s and + * spanning \p ds bytes. + */ +static inline bool +regions_overlap(const backend_reg &r, unsigned dr, + const backend_reg &s, unsigned ds) +{ + if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) { + /* COMPR4 regions are translated by the hardware during decompression + * into two separate half-regions 4 MRFs apart from each other. 
+ */ + backend_reg t0 = r; + t0.nr &= ~BRW_MRF_COMPR4; + backend_reg t1 = t0; + t1.offset += 4 * REG_SIZE; + return regions_overlap(t0, dr / 2, s, ds) || + regions_overlap(t1, dr / 2, s, ds); + + } else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) { + return regions_overlap(s, ds, r, dr); + + } else { + return reg_space(r) == reg_space(s) && + !(reg_offset(r) + dr <= reg_offset(s) || + reg_offset(s) + ds <= reg_offset(r)); + } +} + +class vec4_instruction : public backend_instruction { +public: + DECLARE_RALLOC_CXX_OPERATORS(vec4_instruction) + + vec4_instruction(enum opcode opcode, + const dst_reg &dst = dst_reg(), + const src_reg &src0 = src_reg(), + const src_reg &src1 = src_reg(), + const src_reg &src2 = src_reg()); + + dst_reg dst; + src_reg src[3]; + + enum brw_urb_write_flags urb_write_flags; + + unsigned sol_binding; /**< gfx6: SOL binding table index */ + bool sol_final_write; /**< gfx6: send commit message */ + unsigned sol_vertex; /**< gfx6: used for setting dst index in SVB header */ + + bool is_send_from_grf() const; + unsigned size_read(unsigned arg) const; + bool can_reswizzle(const struct intel_device_info *devinfo, + int dst_writemask, + int swizzle, int swizzle_mask); + void reswizzle(int dst_writemask, int swizzle); + bool can_do_source_mods(const struct intel_device_info *devinfo); + bool can_do_cmod(); + bool can_do_writemask(const struct intel_device_info *devinfo); + bool can_change_types() const; + bool has_source_and_destination_hazard() const; + unsigned implied_mrf_writes() const; + + bool is_align1_partial_write() + { + return opcode == VEC4_OPCODE_SET_LOW_32BIT || + opcode == VEC4_OPCODE_SET_HIGH_32BIT; + } + + bool reads_flag() const + { + return predicate || opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2; + } + + bool reads_flag(unsigned c) + { + if (opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2) + return true; + + switch (predicate) { + case BRW_PREDICATE_NONE: + return false; + case BRW_PREDICATE_ALIGN16_REPLICATE_X: + return c == 0; + case 
BRW_PREDICATE_ALIGN16_REPLICATE_Y: + return c == 1; + case BRW_PREDICATE_ALIGN16_REPLICATE_Z: + return c == 2; + case BRW_PREDICATE_ALIGN16_REPLICATE_W: + return c == 3; + default: + return true; + } + } + + bool writes_flag(const intel_device_info *devinfo) const + { + return (conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) && + opcode != BRW_OPCODE_CSEL && + opcode != BRW_OPCODE_IF && + opcode != BRW_OPCODE_WHILE)); + } + + bool reads_g0_implicitly() const + { + switch (opcode) { + case SHADER_OPCODE_TEX: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_SAMPLEINFO: + case VS_OPCODE_PULL_CONSTANT_LOAD: + case GS_OPCODE_SET_PRIMITIVE_ID: + case GS_OPCODE_GET_INSTANCE_ID: + case SHADER_OPCODE_GFX4_SCRATCH_READ: + case SHADER_OPCODE_GFX4_SCRATCH_WRITE: + return true; + default: + return false; + } + } +}; + +/** + * Make the execution of \p inst dependent on the evaluation of a possibly + * inverted predicate. + */ +inline vec4_instruction * +set_predicate_inv(enum brw_predicate pred, bool inverse, + vec4_instruction *inst) +{ + inst->predicate = pred; + inst->predicate_inverse = inverse; + return inst; +} + +/** + * Make the execution of \p inst dependent on the evaluation of a predicate. + */ +inline vec4_instruction * +set_predicate(enum brw_predicate pred, vec4_instruction *inst) +{ + return set_predicate_inv(pred, false, inst); +} + +/** + * Write the result of evaluating the condition given by \p mod to a flag + * register. + */ +inline vec4_instruction * +set_condmod(enum brw_conditional_mod mod, vec4_instruction *inst) +{ + inst->conditional_mod = mod; + return inst; +} + +/** + * Clamp the result of \p inst to the saturation range of its destination + * datatype. 
+ */ +inline vec4_instruction * +set_saturate(bool saturate, vec4_instruction *inst) +{ + inst->saturate = saturate; + return inst; +} + +/** + * Return the number of dataflow registers written by the instruction (either + * fully or partially) counted from 'floor(reg_offset(inst->dst) / + * register_size)'. The somewhat arbitrary register size unit is 16B for the + * UNIFORM and IMM files and 32B for all other files. + */ +inline unsigned +regs_written(const vec4_instruction *inst) +{ + assert(inst->dst.file != UNIFORM && inst->dst.file != IMM); + return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE + inst->size_written, + REG_SIZE); +} + +/** + * Return the number of dataflow registers read by the instruction (either + * fully or partially) counted from 'floor(reg_offset(inst->src[i]) / + * register_size)'. The somewhat arbitrary register size unit is 16B for the + * UNIFORM and IMM files and 32B for all other files. + */ +inline unsigned +regs_read(const vec4_instruction *inst, unsigned i) +{ + const unsigned reg_size = + inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 16 : REG_SIZE; + return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + inst->size_read(i), + reg_size); +} + +static inline enum brw_reg_type +get_exec_type(const vec4_instruction *inst) +{ + enum brw_reg_type exec_type = BRW_REGISTER_TYPE_B; + + for (int i = 0; i < 3; i++) { + if (inst->src[i].file != BAD_FILE) { + const brw_reg_type t = get_exec_type(brw_reg_type(inst->src[i].type)); + if (type_sz(t) > type_sz(exec_type)) + exec_type = t; + else if (type_sz(t) == type_sz(exec_type) && + brw_reg_type_is_floating_point(t)) + exec_type = t; + } + } + + if (exec_type == BRW_REGISTER_TYPE_B) + exec_type = inst->dst.type; + + /* TODO: We need to handle half-float conversions. 
*/ + assert(exec_type != BRW_REGISTER_TYPE_HF || + inst->dst.type == BRW_REGISTER_TYPE_HF); + assert(exec_type != BRW_REGISTER_TYPE_B); + + return exec_type; +} + +static inline unsigned +get_exec_type_size(const vec4_instruction *inst) +{ + return type_sz(get_exec_type(inst)); +} + +} /* namespace brw */ + +#endif diff --git a/src/intel/compiler/elk/brw_isa_info.h b/src/intel/compiler/elk/brw_isa_info.h new file mode 100644 index 00000000000..ae0ad3e2c2d --- /dev/null +++ b/src/intel/compiler/elk/brw_isa_info.h @@ -0,0 +1,86 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#ifndef BRW_ISA_ENCODING_H +#define BRW_ISA_ENCODING_H + +#include "dev/intel_device_info.h" +#include "brw_eu_defines.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct opcode_desc; + +struct brw_isa_info { + const struct intel_device_info *devinfo; + + /* A mapping from enum opcode to the corresponding opcode_desc */ + const struct opcode_desc *ir_to_descs[NUM_BRW_OPCODES]; + + /** A mapping from a HW opcode encoding to the corresponding opcode_desc */ + const struct opcode_desc *hw_to_descs[128]; +}; + +void brw_init_isa_info(struct brw_isa_info *isa, + const struct intel_device_info *devinfo); + +struct opcode_desc { + unsigned ir; + unsigned hw; + const char *name; + int nsrc; + int ndst; + int gfx_vers; +}; + +const struct opcode_desc * +brw_opcode_desc(const struct brw_isa_info *isa, enum opcode opcode); + +const struct opcode_desc * +brw_opcode_desc_from_hw(const struct brw_isa_info *isa, unsigned hw); + +static inline unsigned +brw_opcode_encode(const struct brw_isa_info *isa, enum opcode opcode) +{ + return brw_opcode_desc(isa, opcode)->hw; +} + +static inline enum opcode +brw_opcode_decode(const struct brw_isa_info *isa, unsigned hw) +{ + const struct opcode_desc *desc = brw_opcode_desc_from_hw(isa, hw); + return desc ? 
(enum opcode)desc->ir : BRW_OPCODE_ILLEGAL; +} + +static inline bool +is_3src(const struct brw_isa_info *isa, enum opcode opcode) +{ + const struct opcode_desc *desc = brw_opcode_desc(isa, opcode); + return desc && desc->nsrc == 3; +} + +#ifdef __cplusplus +} +#endif +#endif diff --git a/src/intel/compiler/elk/brw_kernel.c b/src/intel/compiler/elk/brw_kernel.c new file mode 100644 index 00000000000..a85dc583a58 --- /dev/null +++ b/src/intel/compiler/elk/brw_kernel.c @@ -0,0 +1,790 @@ +/* + * Copyright © 2020 Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_kernel.h" +#include "brw_nir.h" +#include "intel_nir.h" + +#include "intel_nir.h" +#include "nir_clc_helpers.h" +#include "compiler/nir/nir_builder.h" +#include "compiler/spirv/nir_spirv.h" +#include "dev/intel_debug.h" +#include "util/u_atomic.h" +#include "util/u_dynarray.h" + +static const nir_shader * +load_clc_shader(struct brw_compiler *compiler, struct disk_cache *disk_cache, + const nir_shader_compiler_options *nir_options, + const struct spirv_to_nir_options *spirv_options) +{ + if (compiler->clc_shader) + return compiler->clc_shader; + + nir_shader *nir = nir_load_libclc_shader(64, disk_cache, + spirv_options, nir_options, + disk_cache != NULL); + if (nir == NULL) + return NULL; + + const nir_shader *old_nir = + p_atomic_cmpxchg(&compiler->clc_shader, NULL, nir); + if (old_nir == NULL) { + /* We won the race */ + ralloc_steal(compiler, nir); + return nir; + } else { + /* Someone else built the shader first */ + ralloc_free(nir); + return old_nir; + } +} + +static nir_builder +builder_init_new_impl(nir_function *func) +{ + nir_function_impl *impl = nir_function_impl_create(func); + return nir_builder_at(nir_before_impl(impl)); +} + +static void +implement_atomic_builtin(nir_function *func, nir_atomic_op atomic_op, + enum glsl_base_type data_base_type, + nir_variable_mode mode) +{ + nir_builder b = builder_init_new_impl(func); + const struct glsl_type *data_type = glsl_scalar_type(data_base_type); + + unsigned p = 0; + + nir_deref_instr *ret = NULL; + ret = nir_build_deref_cast(&b, nir_load_param(&b, p++), + nir_var_function_temp, data_type, 0); + + nir_intrinsic_op op = nir_intrinsic_deref_atomic; + nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b.shader, op); + nir_intrinsic_set_atomic_op(atomic, atomic_op); + + for (unsigned i = 0; i < nir_intrinsic_infos[op].num_srcs; i++) { + nir_def *src = nir_load_param(&b, p++); + if (i == 0) { + /* The first source is our deref */ + assert(nir_intrinsic_infos[op].src_components[i] == 
-1); + src = &nir_build_deref_cast(&b, src, mode, data_type, 0)->def; + } + atomic->src[i] = nir_src_for_ssa(src); + } + + nir_def_init_for_type(&atomic->instr, &atomic->def, data_type); + + nir_builder_instr_insert(&b, &atomic->instr); + nir_store_deref(&b, ret, &atomic->def, ~0); +} + +static void +implement_sub_group_ballot_builtin(nir_function *func) +{ + nir_builder b = builder_init_new_impl(func); + nir_deref_instr *ret = + nir_build_deref_cast(&b, nir_load_param(&b, 0), + nir_var_function_temp, glsl_uint_type(), 0); + nir_def *cond = nir_load_param(&b, 1); + + nir_intrinsic_instr *ballot = + nir_intrinsic_instr_create(b.shader, nir_intrinsic_ballot); + ballot->src[0] = nir_src_for_ssa(cond); + ballot->num_components = 1; + nir_def_init(&ballot->instr, &ballot->def, 1, 32); + nir_builder_instr_insert(&b, &ballot->instr); + + nir_store_deref(&b, ret, &ballot->def, ~0); +} + +static bool +implement_intel_builtins(nir_shader *nir) +{ + bool progress = false; + + nir_foreach_function(func, nir) { + if (strcmp(func->name, "_Z10atomic_minPU3AS1Vff") == 0) { + /* float atom_min(__global float volatile *p, float val) */ + implement_atomic_builtin(func, nir_atomic_op_fmin, + GLSL_TYPE_FLOAT, nir_var_mem_global); + progress = true; + } else if (strcmp(func->name, "_Z10atomic_maxPU3AS1Vff") == 0) { + /* float atom_max(__global float volatile *p, float val) */ + implement_atomic_builtin(func, nir_atomic_op_fmax, + GLSL_TYPE_FLOAT, nir_var_mem_global); + progress = true; + } else if (strcmp(func->name, "_Z10atomic_minPU3AS3Vff") == 0) { + /* float atomic_min(__shared float volatile *, float) */ + implement_atomic_builtin(func, nir_atomic_op_fmin, + GLSL_TYPE_FLOAT, nir_var_mem_shared); + progress = true; + } else if (strcmp(func->name, "_Z10atomic_maxPU3AS3Vff") == 0) { + /* float atomic_max(__shared float volatile *, float) */ + implement_atomic_builtin(func, nir_atomic_op_fmax, + GLSL_TYPE_FLOAT, nir_var_mem_shared); + progress = true; + } else if (strcmp(func->name, 
"intel_sub_group_ballot") == 0) { + implement_sub_group_ballot_builtin(func); + progress = true; + } + } + + nir_shader_preserve_all_metadata(nir); + + return progress; +} + +static bool +lower_kernel_intrinsics(nir_shader *nir) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + bool progress = false; + + unsigned kernel_sysvals_start = 0; + unsigned kernel_arg_start = sizeof(struct brw_kernel_sysvals); + nir->num_uniforms += kernel_arg_start; + + nir_builder b = nir_builder_create(impl); + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_kernel_input: { + b.cursor = nir_instr_remove(&intrin->instr); + + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform); + load->num_components = intrin->num_components; + load->src[0] = nir_src_for_ssa(nir_u2u32(&b, intrin->src[0].ssa)); + nir_intrinsic_set_base(load, kernel_arg_start); + nir_intrinsic_set_range(load, nir->num_uniforms); + nir_def_init(&load->instr, &load->def, + intrin->def.num_components, + intrin->def.bit_size); + nir_builder_instr_insert(&b, &load->instr); + + nir_def_rewrite_uses(&intrin->def, &load->def); + progress = true; + break; + } + + case nir_intrinsic_load_constant_base_ptr: { + b.cursor = nir_instr_remove(&intrin->instr); + nir_def *const_data_base_addr = nir_pack_64_2x32_split(&b, + nir_load_reloc_const_intel(&b, BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW), + nir_load_reloc_const_intel(&b, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH)); + nir_def_rewrite_uses(&intrin->def, const_data_base_addr); + progress = true; + break; + } + + case nir_intrinsic_load_num_workgroups: { + b.cursor = nir_instr_remove(&intrin->instr); + + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform); + load->num_components = 3; + load->src[0] 
= nir_src_for_ssa(nir_imm_int(&b, 0)); + nir_intrinsic_set_base(load, kernel_sysvals_start + + offsetof(struct brw_kernel_sysvals, num_work_groups)); + nir_intrinsic_set_range(load, 3 * 4); + nir_def_init(&load->instr, &load->def, 3, 32); + nir_builder_instr_insert(&b, &load->instr); + nir_def_rewrite_uses(&intrin->def, &load->def); + progress = true; + break; + } + + default: + break; + } + } + } + + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + } else { + nir_metadata_preserve(impl, nir_metadata_all); + } + + return progress; +} + +bool +brw_kernel_from_spirv(struct brw_compiler *compiler, + struct disk_cache *disk_cache, + struct brw_kernel *kernel, + void *log_data, void *mem_ctx, + const uint32_t *spirv, size_t spirv_size, + const char *entrypoint_name, + char **error_str) +{ + const struct intel_device_info *devinfo = compiler->devinfo; + const nir_shader_compiler_options *nir_options = + compiler->nir_options[MESA_SHADER_KERNEL]; + + struct spirv_to_nir_options spirv_options = { + .environment = NIR_SPIRV_OPENCL, + .caps = { + .address = true, + .float16 = devinfo->ver >= 8, + .float64 = devinfo->ver >= 8, + .groups = true, + .image_write_without_format = true, + .int8 = devinfo->ver >= 8, + .int16 = devinfo->ver >= 8, + .int64 = devinfo->ver >= 8, + .int64_atomics = devinfo->ver >= 9, + .kernel = true, + .linkage = true, /* We receive linked kernel from clc */ + .float_controls = devinfo->ver >= 8, + .generic_pointers = true, + .storage_8bit = devinfo->ver >= 8, + .storage_16bit = devinfo->ver >= 8, + .subgroup_arithmetic = true, + .subgroup_basic = true, + .subgroup_ballot = true, + .subgroup_dispatch = true, + .subgroup_quad = true, + .subgroup_shuffle = true, + .subgroup_vote = true, + + .intel_subgroup_shuffle = true, + .intel_subgroup_buffer_block_io = true, + }, + .shared_addr_format = nir_address_format_62bit_generic, + .global_addr_format = nir_address_format_62bit_generic, + .temp_addr_format 
= nir_address_format_62bit_generic, + .constant_addr_format = nir_address_format_64bit_global, + }; + + spirv_options.clc_shader = load_clc_shader(compiler, disk_cache, + nir_options, &spirv_options); + if (spirv_options.clc_shader == NULL) { + fprintf(stderr, "ERROR: libclc shader missing." + " Consider installing the libclc package\n"); + abort(); + } + + assert(spirv_size % 4 == 0); + nir_shader *nir = + spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL, + entrypoint_name, &spirv_options, nir_options); + nir_validate_shader(nir, "after spirv_to_nir"); + nir_validate_ssa_dominance(nir, "after spirv_to_nir"); + ralloc_steal(mem_ctx, nir); + nir->info.name = ralloc_strdup(nir, entrypoint_name); + + if (INTEL_DEBUG(DEBUG_CS)) { + /* Re-index SSA defs so we print more sensible numbers. */ + nir_foreach_function_impl(impl, nir) { + nir_index_ssa_defs(impl); + } + + fprintf(stderr, "NIR (from SPIR-V) for kernel\n"); + nir_print_shader(nir, stderr); + } + + NIR_PASS_V(nir, implement_intel_builtins); + NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader); + + /* We have to lower away local constant initializers right before we + * inline functions. That way they get properly initialized at the top + * of the function and not at the top of its caller. + */ + NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp); + NIR_PASS_V(nir, nir_lower_returns); + NIR_PASS_V(nir, nir_inline_functions); + NIR_PASS_V(nir, nir_copy_prop); + NIR_PASS_V(nir, nir_opt_deref); + + /* Pick off the single entrypoint that we want */ + nir_remove_non_entrypoints(nir); + + /* Now that we've deleted all but the main function, we can go ahead and + * lower the rest of the constant initializers. We do this here so that + * nir_remove_dead_variables and split_per_member_structs below see the + * corresponding stores. 
+ */ + NIR_PASS_V(nir, nir_lower_variable_initializers, ~0); + + /* LLVM loves take advantage of the fact that vec3s in OpenCL are 16B + * aligned and so it can just read/write them as vec4s. This results in a + * LOT of vec4->vec3 casts on loads and stores. One solution to this + * problem is to get rid of all vec3 variables. + */ + NIR_PASS_V(nir, nir_lower_vec3_to_vec4, + nir_var_shader_temp | nir_var_function_temp | + nir_var_mem_shared | nir_var_mem_global| + nir_var_mem_constant); + + /* We assign explicit types early so that the optimizer can take advantage + * of that information and hopefully get rid of some of our memcpys. + */ + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, + nir_var_uniform | + nir_var_shader_temp | nir_var_function_temp | + nir_var_mem_shared | nir_var_mem_global, + glsl_get_cl_type_size_align); + + struct brw_nir_compiler_opts opts = {}; + brw_preprocess_nir(compiler, nir, &opts); + + int max_arg_idx = -1; + nir_foreach_uniform_variable(var, nir) { + assert(var->data.location < 256); + max_arg_idx = MAX2(max_arg_idx, var->data.location); + } + + kernel->args_size = nir->num_uniforms; + kernel->arg_count = max_arg_idx + 1; + + /* No bindings */ + struct brw_kernel_arg_desc *args = + rzalloc_array(mem_ctx, struct brw_kernel_arg_desc, kernel->arg_count); + kernel->args = args; + + nir_foreach_uniform_variable(var, nir) { + struct brw_kernel_arg_desc arg_desc = { + .offset = var->data.driver_location, + .size = glsl_get_explicit_size(var->type, false), + }; + assert(arg_desc.offset + arg_desc.size <= nir->num_uniforms); + + assert(var->data.location >= 0); + args[var->data.location] = arg_desc; + } + + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_all, NULL); + + /* Lower again, this time after dead-variables to get more compact variable + * layouts. 
+ */ + nir->global_mem_size = 0; + nir->scratch_size = 0; + nir->info.shared_size = 0; + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, + nir_var_shader_temp | nir_var_function_temp | + nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant, + glsl_get_cl_type_size_align); + if (nir->constant_data_size > 0) { + assert(nir->constant_data == NULL); + nir->constant_data = rzalloc_size(nir, nir->constant_data_size); + nir_gather_explicit_io_initializers(nir, nir->constant_data, + nir->constant_data_size, + nir_var_mem_constant); + } + + if (INTEL_DEBUG(DEBUG_CS)) { + /* Re-index SSA defs so we print more sensible numbers. */ + nir_foreach_function_impl(impl, nir) { + nir_index_ssa_defs(impl); + } + + fprintf(stderr, "NIR (before I/O lowering) for kernel\n"); + nir_print_shader(nir, stderr); + } + + NIR_PASS_V(nir, nir_lower_memcpy); + + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant, + nir_address_format_64bit_global); + + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform, + nir_address_format_32bit_offset_as_64bit); + + NIR_PASS_V(nir, nir_lower_explicit_io, + nir_var_shader_temp | nir_var_function_temp | + nir_var_mem_shared | nir_var_mem_global, + nir_address_format_62bit_generic); + + NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL); + + NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, NULL); + NIR_PASS_V(nir, lower_kernel_intrinsics); + + struct brw_cs_prog_key key = { }; + + memset(&kernel->prog_data, 0, sizeof(kernel->prog_data)); + kernel->prog_data.base.nr_params = DIV_ROUND_UP(nir->num_uniforms, 4); + + struct brw_compile_cs_params params = { + .base = { + .nir = nir, + .stats = kernel->stats, + .log_data = log_data, + .mem_ctx = mem_ctx, + }, + .key = &key, + .prog_data = &kernel->prog_data, + }; + + kernel->code = brw_compile_cs(compiler, ¶ms); + + if (error_str) + *error_str = params.base.error_str; + + return kernel->code != NULL; +} + +static nir_def * +rebuild_value_from_store(struct util_dynarray *stores, + nir_def 
*value, unsigned read_offset) +{ + unsigned read_size = value->num_components * value->bit_size / 8; + + util_dynarray_foreach(stores, nir_intrinsic_instr *, _store) { + nir_intrinsic_instr *store = *_store; + + unsigned write_offset = nir_src_as_uint(store->src[1]); + unsigned write_size = nir_src_num_components(store->src[0]) * + nir_src_bit_size(store->src[0]) / 8; + if (write_offset <= read_offset && + (write_offset + write_size) >= (read_offset + read_size)) { + assert(nir_block_dominates(store->instr.block, value->parent_instr->block)); + assert(write_size == read_size); + return store->src[0].ssa; + } + } + unreachable("Matching scratch store not found"); +} + +/** + * Remove temporary variables stored to scratch to be then reloaded + * immediately. Remap the load to the store SSA value. + * + * This workaround is only meant to be applied to shaders in src/intel/shaders + * were we know there should be no issue. More complex cases might not work + * with this approach. + */ +static bool +nir_remove_llvm17_scratch(nir_shader *nir) +{ + struct util_dynarray scratch_stores; + void *mem_ctx = ralloc_context(NULL); + + util_dynarray_init(&scratch_stores, mem_ctx); + + nir_foreach_function_impl(func, nir) { + nir_foreach_block(block, func) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + if (intrin->intrinsic != nir_intrinsic_store_scratch) + continue; + + nir_const_value *offset = nir_src_as_const_value(intrin->src[1]); + if (offset != NULL) { + util_dynarray_append(&scratch_stores, nir_intrinsic_instr *, intrin); + } + } + } + } + + bool progress = false; + if (util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) > 0) { + nir_foreach_function_impl(func, nir) { + nir_foreach_block(block, func) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = 
nir_instr_as_intrinsic(instr); + + if (intrin->intrinsic != nir_intrinsic_load_scratch) + continue; + + nir_const_value *offset = nir_src_as_const_value(intrin->src[0]); + if (offset == NULL) + continue; + + nir_def_rewrite_uses(&intrin->def, + rebuild_value_from_store( + &scratch_stores, &intrin->def, + nir_src_as_uint(intrin->src[0]))); + nir_instr_remove(instr); + + progress = true; + } + } + } + } + + util_dynarray_foreach(&scratch_stores, nir_intrinsic_instr *, _store) { + nir_intrinsic_instr *store = *_store; + nir_instr_remove(&store->instr); + } + + /* Quick sanity check */ + assert(util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) == 0 || + progress); + + ralloc_free(mem_ctx); + + return progress; +} + +static void +cleanup_llvm17_scratch(nir_shader *nir) +{ + { + bool progress; + do { + progress = false; + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_algebraic); + } while (progress); + } + + nir_remove_llvm17_scratch(nir); + + { + bool progress; + do { + progress = false; + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_algebraic); + } while (progress); + } +} + +nir_shader * +brw_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size, + bool llvm17_wa) +{ + struct spirv_to_nir_options spirv_options = { + .environment = NIR_SPIRV_OPENCL, + .caps = { + .address = true, + .groups = true, + .image_write_without_format = true, + .int8 = true, + .int16 = true, + .int64 = true, + .int64_atomics = true, + .kernel = true, + .linkage = true, /* We receive linked kernel from clc */ + .float_controls = true, + .generic_pointers = true, + .storage_8bit = true, + .storage_16bit = true, + .subgroup_arithmetic = true, + 
.subgroup_basic = true, + .subgroup_ballot = true, + .subgroup_dispatch = true, + .subgroup_quad = true, + .subgroup_shuffle = true, + .subgroup_vote = true, + + .intel_subgroup_shuffle = true, + .intel_subgroup_buffer_block_io = true, + }, + .shared_addr_format = nir_address_format_62bit_generic, + .global_addr_format = nir_address_format_62bit_generic, + .temp_addr_format = nir_address_format_62bit_generic, + .constant_addr_format = nir_address_format_64bit_global, + .create_library = true, + }; + + assert(spirv_size % 4 == 0); + nir_shader *nir = + spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL, + "library", &spirv_options, &brw_scalar_nir_options); + nir_validate_shader(nir, "after spirv_to_nir"); + nir_validate_ssa_dominance(nir, "after spirv_to_nir"); + ralloc_steal(mem_ctx, nir); + nir->info.name = ralloc_strdup(nir, "library"); + + if (INTEL_DEBUG(DEBUG_CS)) { + /* Re-index SSA defs so we print more sensible numbers. */ + nir_foreach_function_impl(impl, nir) { + nir_index_ssa_defs(impl); + } + + fprintf(stderr, "NIR (from SPIR-V) for kernel\n"); + nir_print_shader(nir, stderr); + } + + NIR_PASS_V(nir, implement_intel_builtins); + NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader); + + /* We have to lower away local constant initializers right before we + * inline functions. That way they get properly initialized at the top + * of the function and not at the top of its caller. 
+ */ + NIR_PASS_V(nir, nir_lower_variable_initializers, ~(nir_var_shader_temp | + nir_var_function_temp)); + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo | + nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL); + { + bool progress; + do + { + progress = false; + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_copy_prop_vars); + NIR_PASS(progress, nir, nir_opt_deref); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_undef); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_opt_algebraic); + } while (progress); + } + + NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp); + NIR_PASS_V(nir, nir_lower_returns); + NIR_PASS_V(nir, nir_inline_functions); + + assert(nir->scratch_size == 0); + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_cl_type_size_align); + + { + bool progress; + do + { + progress = false; + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_copy_prop_vars); + NIR_PASS(progress, nir, nir_opt_deref); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_undef); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_split_var_copies); + NIR_PASS(progress, nir, nir_lower_var_copies); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_if, nir_opt_if_optimize_phi_true_false); + NIR_PASS(progress, nir, nir_opt_dead_cf); + NIR_PASS(progress, nir, nir_opt_remove_phis); + NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, nir, nir_lower_vec3_to_vec4, nir_var_mem_generic | nir_var_uniform); + NIR_PASS(progress, nir, nir_opt_memcpy); + } while (progress); + } + + 
NIR_PASS_V(nir, nir_scale_fdiv); + + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo | + nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL); + + + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_mem_shared | nir_var_function_temp, NULL); + + nir->scratch_size = 0; + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, + nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp | + nir_var_mem_global | nir_var_mem_constant, + glsl_get_cl_type_size_align); + + // Lower memcpy - needs to wait until types are sized + { + bool progress; + do { + progress = false; + NIR_PASS(progress, nir, nir_opt_memcpy); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_copy_prop_vars); + NIR_PASS(progress, nir, nir_opt_deref); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_split_var_copies); + NIR_PASS(progress, nir, nir_lower_var_copies); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_cse); + } while (progress); + } + NIR_PASS_V(nir, nir_lower_memcpy); + + NIR_PASS_V(nir, nir_lower_explicit_io, + nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp | nir_var_uniform, + nir_address_format_32bit_offset_as_64bit); + + NIR_PASS_V(nir, nir_lower_system_values); + + /* Hopefully we can drop this once lower_vars_to_ssa has improved to not + * lower everything to scratch. + */ + if (llvm17_wa) + cleanup_llvm17_scratch(nir); + + /* Lower again, this time after dead-variables to get more compact variable + * layouts. 
+ */ + nir->global_mem_size = 0; + nir->scratch_size = 0; + nir->info.shared_size = 0; + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, + nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant, + glsl_get_cl_type_size_align); + if (nir->constant_data_size > 0) { + assert(nir->constant_data == NULL); + nir->constant_data = rzalloc_size(nir, nir->constant_data_size); + nir_gather_explicit_io_initializers(nir, nir->constant_data, + nir->constant_data_size, + nir_var_mem_constant); + } + + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant, + nir_address_format_64bit_global); + + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform, + nir_address_format_32bit_offset_as_64bit); + + NIR_PASS_V(nir, nir_lower_explicit_io, + nir_var_shader_temp | nir_var_function_temp | + nir_var_mem_shared | nir_var_mem_global, + nir_address_format_62bit_generic); + + if (INTEL_DEBUG(DEBUG_CS)) { + /* Re-index SSA defs so we print more sensible numbers. */ + nir_foreach_function_impl(impl, nir) { + nir_index_ssa_defs(impl); + } + + fprintf(stderr, "NIR (before I/O lowering) for kernel\n"); + nir_print_shader(nir, stderr); + } + + return nir; +} diff --git a/src/intel/compiler/elk/brw_kernel.h b/src/intel/compiler/elk/brw_kernel.h new file mode 100644 index 00000000000..fb1289872d5 --- /dev/null +++ b/src/intel/compiler/elk/brw_kernel.h @@ -0,0 +1,78 @@ +/* + * Copyright © 2020 Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies 
or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_KERNEL_H +#define BRW_KERNEL_H + +#include "brw_compiler.h" + +struct disk_cache; + +#ifdef __cplusplus +extern "C" { +#endif + +/** Software interface for system values in kernels + * + * These are intended to go at the start of the kernel argument buffer. + */ +struct brw_kernel_sysvals { + uint32_t num_work_groups[3]; + uint32_t pad[5]; +}; + +struct brw_kernel_arg_desc { + uint16_t offset; + uint16_t size; +}; + +struct brw_kernel { + struct brw_cs_prog_data prog_data; + + struct brw_compile_stats stats[3]; + + uint16_t args_size; + uint16_t arg_count; + const struct brw_kernel_arg_desc *args; + + const void *code; +}; + +bool +brw_kernel_from_spirv(struct brw_compiler *compiler, + struct disk_cache *disk_cache, + struct brw_kernel *kernel, + void *log_data, void *mem_ctx, + const uint32_t *spirv, size_t spirv_size, + const char *entrypoint_name, + char **error_str); + +nir_shader * +brw_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size, + bool llvm17_wa); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* BRW_KERNEL_H */ diff --git a/src/intel/compiler/elk/brw_lex.l b/src/intel/compiler/elk/brw_lex.l new file mode 100644 index 00000000000..d230d997358 --- /dev/null +++ b/src/intel/compiler/elk/brw_lex.l @@ -0,0 +1,465 @@ +%option yylineno +%option nounput +%{ +#include +#include "brw_asm.h" +#undef ALIGN16 +#include "brw_gram.tab.h" + +/* Locations */ +int yycolumn = 1; + +int 
saved_state = 0; +extern char *input_filename; + +#define YY_NO_INPUT +#define YY_USER_ACTION \ + yylloc.first_line = yylloc.last_line = yylineno; \ + yylloc.first_column = yycolumn; \ + yylloc.last_column = yycolumn + yyleng - 1; \ + yycolumn += yyleng; +%} + +%x BLOCK_COMMENT +%x FILENAME +%x CHANNEL +%x REG +%x DOTSEL +%x LABEL +%x MSGDESC +%% + + /* eat up single line comment */ +\/\/.*[\r\n] { yycolumn = 1; } + + /* eat up multiline comment */ +\/\* { saved_state = YYSTATE; BEGIN(BLOCK_COMMENT); } + +\*\/ { BEGIN(saved_state); } + +. { } +[\r\n] { } + +\"[^\"]+\" { + char *name = malloc(yyleng - 1); + memmove(name, yytext + 1, yyleng - 2); + name[yyleng-1] = '\0'; + input_filename = name; + } + + /* null register */ +null { BEGIN(REG); return NULL_TOKEN; } + + /* Opcodes */ +add { yylval.integer = BRW_OPCODE_ADD; return ADD; } +add3 { yylval.integer = BRW_OPCODE_ADD3; return ADD3; } +addc { yylval.integer = BRW_OPCODE_ADDC; return ADDC; } +and { yylval.integer = BRW_OPCODE_AND; return AND; } +asr { yylval.integer = BRW_OPCODE_ASR; return ASR; } +avg { yylval.integer = BRW_OPCODE_AVG; return AVG; } +bfe { yylval.integer = BRW_OPCODE_BFE; return BFE; } +bfi1 { yylval.integer = BRW_OPCODE_BFI1; return BFI1; } +bfi2 { yylval.integer = BRW_OPCODE_BFI2; return BFI2; } +bfrev { yylval.integer = BRW_OPCODE_BFREV; return BFREV; } +brc { yylval.integer = BRW_OPCODE_BRC; return BRC; } +brd { yylval.integer = BRW_OPCODE_BRD; return BRD; } +break { yylval.integer = BRW_OPCODE_BREAK; return BREAK; } +call { yylval.integer = BRW_OPCODE_CALL; return CALL; } +calla { yylval.integer = BRW_OPCODE_CALLA; return CALLA; } +case { yylval.integer = BRW_OPCODE_CASE; return CASE; } +cbit { yylval.integer = BRW_OPCODE_CBIT; return CBIT; } +cmp { yylval.integer = BRW_OPCODE_CMP; return CMP; } +cmpn { yylval.integer = BRW_OPCODE_CMPN; return CMPN; } +cont { yylval.integer = BRW_OPCODE_CONTINUE; return CONT; } +csel { yylval.integer = BRW_OPCODE_CSEL; return CSEL; } +dim { yylval.integer = 
BRW_OPCODE_DIM; return DIM; } +do { yylval.integer = BRW_OPCODE_DO; return DO; } +dp2 { yylval.integer = BRW_OPCODE_DP2; return DP2; } +dp3 { yylval.integer = BRW_OPCODE_DP3; return DP3; } +dp4 { yylval.integer = BRW_OPCODE_DP4; return DP4; } +dp4a { yylval.integer = BRW_OPCODE_DP4A; return DP4A; } +dph { yylval.integer = BRW_OPCODE_DPH; return DPH; } +else { yylval.integer = BRW_OPCODE_ELSE; return ELSE; } +endif { yylval.integer = BRW_OPCODE_ENDIF; return ENDIF; } +f16to32 { yylval.integer = BRW_OPCODE_F16TO32; return F16TO32; } +f32to16 { yylval.integer = BRW_OPCODE_F32TO16; return F32TO16; } +fbh { yylval.integer = BRW_OPCODE_FBH; return FBH; } +fbl { yylval.integer = BRW_OPCODE_FBL; return FBL; } +fork { yylval.integer = BRW_OPCODE_FORK; return FORK; } +frc { yylval.integer = BRW_OPCODE_FRC; return FRC; } +goto { yylval.integer = BRW_OPCODE_GOTO; return GOTO; } +halt { yylval.integer = BRW_OPCODE_HALT; return HALT; } +if { yylval.integer = BRW_OPCODE_IF; return IF; } +iff { yylval.integer = BRW_OPCODE_IFF; return IFF; } +illegal { yylval.integer = BRW_OPCODE_ILLEGAL; return ILLEGAL; } +jmpi { yylval.integer = BRW_OPCODE_JMPI; return JMPI; } +line { yylval.integer = BRW_OPCODE_LINE; return LINE; } +lrp { yylval.integer = BRW_OPCODE_LRP; return LRP; } +lzd { yylval.integer = BRW_OPCODE_LZD; return LZD; } +mac { yylval.integer = BRW_OPCODE_MAC; return MAC; } +mach { yylval.integer = BRW_OPCODE_MACH; return MACH; } +mad { yylval.integer = BRW_OPCODE_MAD; return MAD; } +madm { yylval.integer = BRW_OPCODE_MADM; return MADM; } +mov { yylval.integer = BRW_OPCODE_MOV; return MOV; } +movi { yylval.integer = BRW_OPCODE_MOVI; return MOVI; } +mul { yylval.integer = BRW_OPCODE_MUL; return MUL; } +mrest { yylval.integer = BRW_OPCODE_MREST; return MREST; } +msave { yylval.integer = BRW_OPCODE_MSAVE; return MSAVE; } +nenop { yylval.integer = BRW_OPCODE_NENOP; return NENOP; } +nop { yylval.integer = BRW_OPCODE_NOP; return NOP; } +not { yylval.integer = BRW_OPCODE_NOT; return 
NOT; } +or { yylval.integer = BRW_OPCODE_OR; return OR; } +pln { yylval.integer = BRW_OPCODE_PLN; return PLN; } +pop { yylval.integer = BRW_OPCODE_POP; return POP; } +push { yylval.integer = BRW_OPCODE_PUSH; return PUSH; } +ret { yylval.integer = BRW_OPCODE_RET; return RET; } +rndd { yylval.integer = BRW_OPCODE_RNDD; return RNDD; } +rnde { yylval.integer = BRW_OPCODE_RNDE; return RNDE; } +rndu { yylval.integer = BRW_OPCODE_RNDU; return RNDU; } +rndz { yylval.integer = BRW_OPCODE_RNDZ; return RNDZ; } +rol { yylval.integer = BRW_OPCODE_ROL; return ROL; } +ror { yylval.integer = BRW_OPCODE_ROR; return ROR; } +sad2 { yylval.integer = BRW_OPCODE_SAD2; return SAD2; } +sada2 { yylval.integer = BRW_OPCODE_SADA2; return SADA2; } +sel { yylval.integer = BRW_OPCODE_SEL; return SEL; } +send { + yylval.integer = BRW_OPCODE_SEND; + return p->devinfo->ver < 12 ? SEND_GFX4 : SEND_GFX12; + } +sendc { + yylval.integer = BRW_OPCODE_SENDC; + return p->devinfo->ver < 12 ? SENDC_GFX4 : SENDC_GFX12; + } +sends { yylval.integer = BRW_OPCODE_SENDS; return SENDS; } +sendsc { yylval.integer = BRW_OPCODE_SENDSC; return SENDSC; } +shl { yylval.integer = BRW_OPCODE_SHL; return SHL; } +shr { yylval.integer = BRW_OPCODE_SHR; return SHR; } +smov { yylval.integer = BRW_OPCODE_SMOV; return SMOV; } +subb { yylval.integer = BRW_OPCODE_SUBB; return SUBB; } +wait { yylval.integer = BRW_OPCODE_WAIT; return WAIT; } +while { yylval.integer = BRW_OPCODE_WHILE; return WHILE; } +xor { yylval.integer = BRW_OPCODE_XOR; return XOR; } +sync { yylval.integer = BRW_OPCODE_SYNC; return SYNC; } + + /* extended math functions */ +cos { yylval.integer = BRW_MATH_FUNCTION_COS; return COS; } +exp { yylval.integer = BRW_MATH_FUNCTION_EXP; return EXP; } +fdiv { yylval.integer = BRW_MATH_FUNCTION_FDIV; return FDIV; } +inv { yylval.integer = BRW_MATH_FUNCTION_INV; return INV; } +invm { yylval.integer = GFX8_MATH_FUNCTION_INVM; return INVM; } +intdiv { + yylval.integer = BRW_MATH_FUNCTION_INT_DIV_QUOTIENT; + return INTDIV; + 
} +intdivmod { + yylval.integer = + BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER; + return INTDIVMOD; + } +intmod { + yylval.integer = BRW_MATH_FUNCTION_INT_DIV_REMAINDER; + return INTMOD; + } +log { yylval.integer = BRW_MATH_FUNCTION_LOG; return LOG; } +pow { yylval.integer = BRW_MATH_FUNCTION_POW; return POW; } +rsq { yylval.integer = BRW_MATH_FUNCTION_RSQ; return RSQ; } +rsqrtm { yylval.integer = GFX8_MATH_FUNCTION_RSQRTM; return RSQRTM; } +sin { yylval.integer = BRW_MATH_FUNCTION_SIN; return SIN; } +sqrt { yylval.integer = BRW_MATH_FUNCTION_SQRT; return SQRT; } +sincos { yylval.integer = BRW_MATH_FUNCTION_SINCOS; return SINCOS; } + + /* sync instruction */ +allrd { yylval.integer = TGL_SYNC_ALLRD; return ALLRD; } +allwr { yylval.integer = TGL_SYNC_ALLWR; return ALLWR; } +fence { yylval.integer = TGL_SYNC_FENCE; return FENCE; } +bar { yylval.integer = TGL_SYNC_BAR; return BAR; } +host { yylval.integer = TGL_SYNC_HOST; return HOST; } + + /* shared functions for send instruction */ +sampler { return SAMPLER; } +dp_sampler { return DP_SAMPLER; } +gateway { return GATEWAY; } +urb { return URB; } +thread_spawner { return THREAD_SPAWNER; } +render { return RENDER; } +const { return CONST; } +data { return DATA; } +cre { return CRE; } +math { return MATH; } +read { return READ; } +write { return WRITE; } +vme { return VME; } +"pixel interp" { return PIXEL_INTERP; } +"dp data 1" { return DP_DATA_1; } +"rt accel" { return RT_ACCEL; } +slm { return SLM; } +tgm { return TGM; } +ugm { return UGM; } + +";" { return SEMICOLON; } +":" { return COLON; } +"(" { return LPAREN; } +")" { return RPAREN; } +"{" { return LCURLY; } +"}" { return RCURLY; } +"[" { return LSQUARE; } +"]" { return RSQUARE; } +"<" { return LANGLE; } +">" { return RANGLE; } +"," { return COMMA; } +"." 
{ return DOT; } +"+" { return PLUS; } +"-" { return MINUS; } +"~" { return MINUS; } +"(abs)" { return ABS; } + + +"VxH" { return VxH; } +"<" { return LANGLE; } +[0-9][0-9]* { + yylval.integer = strtoul(yytext, NULL, 10); + return INTEGER; + } +">" { return RANGLE; } +"," { return COMMA; } +"." { BEGIN(DOTSEL); return DOT; } +";" { return SEMICOLON; } + +"x" { yylval.integer = BRW_CHANNEL_X; return X; } +"y" { yylval.integer = BRW_CHANNEL_Y; return Y; } +"z" { yylval.integer = BRW_CHANNEL_Z; return Z; } +"w" { yylval.integer = BRW_CHANNEL_W; return W; } +[0-9][0-9]* { + yylval.integer = strtoul(yytext, NULL, 10); + BEGIN(REG); + return INTEGER; + } +. { yyless(0); BEGIN(INITIAL); } +. { yyless(0); BEGIN(INITIAL); } + + /* Access mode */ +"align1" { return ALIGN1; } +"align16" { return ALIGN16; } + + /* Accumulator write control */ +AccWrEnable { return ACCWREN; } + + /* Mask control (formerly WECtrl/Write Enable Control) */ +"WE_all" { return WECTRL; } + + /* Compaction control */ +compacted { return CMPTCTRL; } + + /* Debug control */ +breakpoint { return BREAKPOINT; } + + /* Dependency control */ +NoDDClr { return NODDCLR; } +NoDDChk { return NODDCHK; } + + /* End of thread */ +EOT { return EOT; } + + /* Mask control */ +nomask { return MASK_DISABLE; } + + /* Channel */ +"x" { yylval.integer = BRW_CHANNEL_X; return X; } +"y" { yylval.integer = BRW_CHANNEL_Y; return Y; } +"z" { yylval.integer = BRW_CHANNEL_Z; return Z; } +"w" { yylval.integer = BRW_CHANNEL_W; return W; } +[0-9][0-9]* { + yylval.integer = strtoul(yytext, NULL, 10); + return INTEGER; + } +"." { return DOT; } +. 
{ yyless(0); BEGIN(INITIAL); } + + + /* Predicate Control */ +".anyv" { yylval.integer = BRW_PREDICATE_ALIGN1_ANYV; return ANYV; } +".allv" { yylval.integer = BRW_PREDICATE_ALIGN1_ALLV; return ALLV; } +".any2h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY2H; return ANY2H; } +".all2h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL2H; return ALL2H; } +".any4h" { yylval.integer = BRW_PREDICATE_ALIGN16_ANY4H; return ANY4H; } +".all4h" { yylval.integer = BRW_PREDICATE_ALIGN16_ALL4H; return ALL4H; } +".any8h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY8H; return ANY8H; } +".all8h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL8H; return ALL8H; } +".any16h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY16H; return ANY16H; } +".all16h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL16H; return ALL16H; } +".any32h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY32H; return ANY32H; } +".all32h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL32H; return ALL32H; } + + /* Saturation */ +".sat" { return SATURATE; } + + /* Thread control */ +atomic { return ATOMIC; } +switch { return SWITCH; } + + /* compression control */ +compr { return COMPR; } +compr4 { return COMPR4; } +sechalf { return SECHALF; } + + /* Quarter Control */ +1[HNQ] { } +"2Q" { return QTR_2Q; } +"3Q" { return QTR_3Q; } +"4Q" { return QTR_4Q; } +"2H" { return QTR_2H; } +"2N" { return QTR_2N; } +"3N" { return QTR_3N; } +"4N" { return QTR_4N; } +"5N" { return QTR_5N; } +"6N" { return QTR_6N; } +"7N" { return QTR_7N; } +"8N" { return QTR_8N; } + + /* data types */ +:?B { return TYPE_B; } +:?D { return TYPE_D; } +:?DF { return TYPE_DF; } +:?F { return TYPE_F; } +:?HF { return TYPE_HF; } +:?NF { return TYPE_NF; } +:?Q { return TYPE_Q; } +:?UB { return TYPE_UB; } +:?UD { return TYPE_UD; } +:?UW { return TYPE_UW; } +:?UQ { return TYPE_UQ; } +:?UV { return TYPE_UV; } +:?V { return TYPE_V; } +:?VF { return TYPE_VF; } +:?W { return TYPE_W; } + + /* Address registers */ +"a0" { return ADDRREG; } + + /* accumulator registers */ +"acc"[0-9]+ { 
yylval.integer = atoi(yytext + 3); return ACCREG; } + + /* channel enable registers */ +"ce0" { return CHANNELENABLEREG; } + + /* control registers */ +"cr0" { return CONTROLREG; } + + /* flag registers f0/f1; [01] not [0|1] — the latter also matched a + * literal '|', letting the bogus token "f|" lex as flag register 0. + */ +"f"[01] { BEGIN(CHANNEL); yylval.integer = atoi(yytext + 1); return FLAGREG; } + + /* message control registers */ +"m" { return MSGREGFILE; } +m[0-9]+ { yylval.integer = atoi(yytext + 1); BEGIN(REG); return MSGREG; } + + /* state register */ +sr[0-9]+ { yylval.integer = atoi(yytext + 2); return STATEREG; } + + /* notification registers */ +"n0" { BEGIN(REG); return NOTIFYREG; } + + /* IP register */ +"ip" { return IPREG; } + + /* Thread control register */ +"tdr0" { return THREADREG; } + + /* performance register */ +"tm0" { BEGIN(REG); return PERFORMANCEREG; } + +[gr][0-9]+ { + yylval.integer = atoi(yytext + 1); + BEGIN(REG); return GENREG; + } +[gr] { return GENREGFILE; } +"mask"[0-9]+ { yylval.integer = atoi(yytext + 4); return MASKREG; } + + /* Conditional modifiers */ +".e" { yylval.integer = BRW_CONDITIONAL_Z; return EQUAL; } +".g" { yylval.integer = BRW_CONDITIONAL_G; return GREATER; } +".ge" { yylval.integer = BRW_CONDITIONAL_GE; return GREATER_EQUAL; } +".l" { yylval.integer = BRW_CONDITIONAL_L; return LESS; } +".le" { yylval.integer = BRW_CONDITIONAL_LE; return LESS_EQUAL; } +".ne" { yylval.integer = BRW_CONDITIONAL_NZ; return NOT_EQUAL; } +".nz" { yylval.integer = BRW_CONDITIONAL_NZ; return NOT_ZERO; } +".o" { yylval.integer = BRW_CONDITIONAL_O; return OVERFLOW; } +".r" { yylval.integer = BRW_CONDITIONAL_R; return ROUND_INCREMENT; } +".u" { yylval.integer = BRW_CONDITIONAL_U; return UNORDERED; } +".z" { yylval.integer = BRW_CONDITIONAL_Z; return ZERO; } + + /* Eat up JIP and UIP token, their values will be parsed + * in numeric section + */ +"JIP: " { BEGIN(LABEL); } +"UIP: " { BEGIN(LABEL); } +"Jump: " { } +"Pop: " { } +[ \t]+ { } + +"MsgDesc: " { BEGIN(MSGDESC); return MSGDESC_BEGIN; } +ex_bso { return EX_BSO; } +src1_len { return SRC1_LEN; } +"="
{ return ASSIGN; } +[0-9][0-9]* { + yylval.integer = strtoul(yytext, NULL, 10); + return INTEGER; + } +"{" { yyless(0); BEGIN(INITIAL); return MSGDESC_END; } +. { } + +"0x"[0-9a-f][0-9a-f]* { + yylval.llint = strtoull(yytext + 2, NULL, 16); + return LONG; + } +[0-9][0-9]* { + yylval.llint = strtoll(yytext, NULL, 10); + return LONG; + } + + /* jump label target */ +[a-zA-Z_][0-9a-zA-Z_]*":" { + yylval.string = ralloc_strdup(p->mem_ctx, yytext); + /* Stomp the trailing ':' */ + yylval.string[yyleng - 1] = '\0'; + return JUMP_LABEL_TARGET; +} + + /* jump label */ +