mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-15 18:38:05 +02:00
intel/elk: Fork Gfx8- compiler by copying existing code
Based on code from commit c3ceec6cd8.
Acked-by: Ian Romanick <ian.d.romanick@intel.com>
Acked-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27563>
This commit is contained in:
parent
a9214460ee
commit
d44462c08d
777 changed files with 151345 additions and 0 deletions
122
src/intel/compiler/elk/brw_asm.h
Normal file
122
src/intel/compiler/elk/brw_asm.h
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

/* Shared declarations for the standalone EU assembler: state exchanged
 * between the flex/bison-generated parser and the assembler driver tool.
 */

#ifndef BRW_ASM_H
#define BRW_ASM_H

#include <inttypes.h>
#include <stdbool.h>
#include <assert.h>

#include "compiler/brw_reg.h"
#include "compiler/brw_reg_type.h"
#include "compiler/brw_eu_defines.h"
#include "compiler/brw_inst.h"
#include "compiler/brw_eu.h"
#include "dev/intel_device_info.h"
#include "util/list.h"

/* glibc < 2.27 defines OVERFLOW in /usr/include/math.h. */
#undef OVERFLOW

/* Entry points generated by the flex/bison assembler grammar. */
int yyparse(void);
int yylex(void);
char *lex_text(void);

/* Codegen state shared between the parser actions and the driver. */
extern struct brw_codegen *p;
/* Count of parse errors; nonzero means assembly failed. */
extern int errors;
/* Name of the file being assembled (for diagnostics). */
extern char *input_filename;

/* Label uses (JIP/UIP operands) and label definitions collected during
 * parsing; resolved afterwards by the driver (i965_postprocess_labels).
 */
extern struct list_head instr_labels;
extern struct list_head target_labels;

/* Conditional-modifier spec parsed from an instruction suffix. */
struct condition {
   unsigned cond_modifier:4;
   unsigned flag_reg_nr:1;
   unsigned flag_subreg_nr:1;
};

/* Predication spec parsed from an instruction prefix. */
struct predicate {
   unsigned pred_control:4;
   unsigned pred_inv:1;
   unsigned flag_reg_nr:1;
   unsigned flag_subreg_nr:1;
};

/* Discriminates the union in struct instoption. */
enum instoption_type {
   INSTOPTION_FLAG,
   INSTOPTION_DEP_INFO,
};

/* One parsed instruction option: either a simple flag bit or SWSB
 * software-scoreboard dependency info.
 */
struct instoption {
   enum instoption_type type;
   union {
      unsigned uint_value;
      struct tgl_swsb depinfo_value;
   };
};

/* Accumulated per-instruction option bits applied after parsing. */
struct options {
   unsigned access_mode:1;
   unsigned compression_control:2;
   unsigned thread_control:2;
   unsigned no_dd_check:1; // Dependency control
   unsigned no_dd_clear:1; // Dependency control
   unsigned mask_control:1;
   unsigned debug_control:1;
   unsigned acc_wr_control:1;
   unsigned end_of_thread:1;
   unsigned compaction:1;
   unsigned qtr_ctrl:2;
   unsigned nib_ctrl:1;
   unsigned is_compr:1;
   struct tgl_swsb depinfo;
};

/* Extended SEND message-descriptor fields parsed from the source. */
struct msgdesc {
   unsigned ex_bso:1;
   unsigned src1_len:5;
};

/* Which jump-target field of an instruction a label use refers to. */
enum instr_label_type {
   INSTR_LABEL_JIP,
   INSTR_LABEL_UIP,
};

/* A use of a label by an instruction; lives on instr_labels. */
struct instr_label {
   struct list_head link;

   char *name;     /* label identifier, owned by this node */
   int offset;     /* byte offset of the referencing instruction */
   enum instr_label_type type;
};

/* A label definition; lives on target_labels. */
struct target_label {
   struct list_head link;

   char *name;     /* label identifier, owned by this node */
   int offset;     /* byte offset the label resolves to */
};

#endif /* BRW_ASM_H */
|
||||
385
src/intel/compiler/elk/brw_asm_tool.c
Normal file
385
src/intel/compiler/elk/brw_asm_tool.c
Normal file
|
|
@ -0,0 +1,385 @@
|
|||
/*
|
||||
* Copyright © 2018 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <getopt.h>
|
||||
#include "brw_asm.h"
|
||||
#include "intel/compiler/brw_disasm_info.h"
|
||||
|
||||
enum opt_output_type {
|
||||
OPT_OUTPUT_HEX,
|
||||
OPT_OUTPUT_C_LITERAL,
|
||||
OPT_OUTPUT_BIN,
|
||||
};
|
||||
|
||||
extern FILE *yyin;
|
||||
struct brw_codegen *p;
|
||||
static enum opt_output_type output_type = OPT_OUTPUT_BIN;
|
||||
char *input_filename = NULL;
|
||||
int errors;
|
||||
|
||||
struct list_head instr_labels;
|
||||
struct list_head target_labels;
|
||||
|
||||
/* Write the usage/help text for the assembler tool to `file`.
 * `progname` is substituted into the usage line (typically argv[0]).
 */
static void
print_help(const char *progname, FILE *file)
{
   fprintf(file, "Usage: %s [OPTION] inputfile\n", progname);
   fputs("Assemble i965 instructions from input file.\n\n"
         " -h, --help display this help and exit\n"
         " -t, --type=OUTPUT_TYPE OUTPUT_TYPE can be 'bin' (default if omitted),\n"
         " 'c_literal', or 'hex'\n"
         " -o, --output specify output file\n"
         " --compact print compacted instructions\n"
         " -g, --gen=platform assemble instructions for given \n"
         " platform (3 letter platform name)\n"
         "Example:\n"
         " i965_asm -g kbl input.asm -t hex -o output\n",
         file);
}
|
||||
|
||||
static uint32_t
|
||||
get_dword(const brw_inst *inst, int idx)
|
||||
{
|
||||
uint32_t dword;
|
||||
memcpy(&dword, (char *)inst + 4 * idx, sizeof(dword));
|
||||
return dword;
|
||||
}
|
||||
|
||||
static void
|
||||
print_instruction(FILE *output, bool compact, const brw_inst *instruction)
|
||||
{
|
||||
int byte_limit;
|
||||
|
||||
byte_limit = (compact == true) ? 8 : 16;
|
||||
|
||||
switch (output_type) {
|
||||
case OPT_OUTPUT_HEX: {
|
||||
fprintf(output, "%02x", ((unsigned char *)instruction)[0]);
|
||||
|
||||
for (unsigned i = 1; i < byte_limit; i++) {
|
||||
fprintf(output, " %02x", ((unsigned char *)instruction)[i]);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case OPT_OUTPUT_C_LITERAL: {
|
||||
fprintf(output, "\t0x%08x,", get_dword(instruction, 0));
|
||||
|
||||
for (unsigned i = 1; i < byte_limit / 4; i++)
|
||||
fprintf(output, " 0x%08x,", get_dword(instruction, i));
|
||||
|
||||
break;
|
||||
}
|
||||
case OPT_OUTPUT_BIN:
|
||||
fwrite(instruction, 1, byte_limit, output);
|
||||
break;
|
||||
}
|
||||
|
||||
if (output_type != OPT_OUTPUT_BIN) {
|
||||
fprintf(output, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
static struct intel_device_info *
|
||||
i965_disasm_init(uint16_t pci_id)
|
||||
{
|
||||
struct intel_device_info *devinfo;
|
||||
|
||||
devinfo = malloc(sizeof *devinfo);
|
||||
if (devinfo == NULL)
|
||||
return NULL;
|
||||
|
||||
if (!intel_get_device_info_from_pci_id(pci_id, devinfo)) {
|
||||
fprintf(stderr, "can't find device information: pci_id=0x%x\n",
|
||||
pci_id);
|
||||
free(devinfo);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return devinfo;
|
||||
}
|
||||
|
||||
static bool
|
||||
i965_postprocess_labels()
|
||||
{
|
||||
if (p->devinfo->ver < 6) {
|
||||
return true;
|
||||
}
|
||||
|
||||
void *store = p->store;
|
||||
|
||||
struct target_label *tlabel;
|
||||
struct instr_label *ilabel, *s;
|
||||
|
||||
const unsigned to_bytes_scale = brw_jump_scale(p->devinfo);
|
||||
|
||||
LIST_FOR_EACH_ENTRY(tlabel, &target_labels, link) {
|
||||
LIST_FOR_EACH_ENTRY_SAFE(ilabel, s, &instr_labels, link) {
|
||||
if (!strcmp(tlabel->name, ilabel->name)) {
|
||||
brw_inst *inst = store + ilabel->offset;
|
||||
|
||||
int relative_offset = (tlabel->offset - ilabel->offset) / sizeof(brw_inst);
|
||||
relative_offset *= to_bytes_scale;
|
||||
|
||||
unsigned opcode = brw_inst_opcode(p->isa, inst);
|
||||
|
||||
if (ilabel->type == INSTR_LABEL_JIP) {
|
||||
switch (opcode) {
|
||||
case BRW_OPCODE_IF:
|
||||
case BRW_OPCODE_ELSE:
|
||||
case BRW_OPCODE_ENDIF:
|
||||
case BRW_OPCODE_WHILE:
|
||||
if (p->devinfo->ver >= 7) {
|
||||
brw_inst_set_jip(p->devinfo, inst, relative_offset);
|
||||
} else if (p->devinfo->ver == 6) {
|
||||
brw_inst_set_gfx6_jump_count(p->devinfo, inst, relative_offset);
|
||||
}
|
||||
break;
|
||||
case BRW_OPCODE_BREAK:
|
||||
case BRW_OPCODE_HALT:
|
||||
case BRW_OPCODE_CONTINUE:
|
||||
brw_inst_set_jip(p->devinfo, inst, relative_offset);
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "Unknown opcode %d with JIP label\n", opcode);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
switch (opcode) {
|
||||
case BRW_OPCODE_IF:
|
||||
case BRW_OPCODE_ELSE:
|
||||
if (p->devinfo->ver > 7) {
|
||||
brw_inst_set_uip(p->devinfo, inst, relative_offset);
|
||||
} else if (p->devinfo->ver == 7) {
|
||||
brw_inst_set_uip(p->devinfo, inst, relative_offset);
|
||||
} else if (p->devinfo->ver == 6) {
|
||||
// Nothing
|
||||
}
|
||||
break;
|
||||
case BRW_OPCODE_WHILE:
|
||||
case BRW_OPCODE_ENDIF:
|
||||
fprintf(stderr, "WHILE/ENDIF cannot have UIP offset\n");
|
||||
return false;
|
||||
case BRW_OPCODE_BREAK:
|
||||
case BRW_OPCODE_CONTINUE:
|
||||
case BRW_OPCODE_HALT:
|
||||
brw_inst_set_uip(p->devinfo, inst, relative_offset);
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "Unknown opcode %d with UIP label\n", opcode);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
list_del(&ilabel->link);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
LIST_FOR_EACH_ENTRY(ilabel, &instr_labels, link) {
|
||||
fprintf(stderr, "Unknown label '%s'\n", ilabel->name);
|
||||
}
|
||||
|
||||
return list_is_empty(&instr_labels);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
char *output_file = NULL;
|
||||
char c;
|
||||
FILE *output = stdout;
|
||||
bool help = false, compact = false;
|
||||
void *store;
|
||||
uint64_t pci_id = 0;
|
||||
int offset = 0, err;
|
||||
int start_offset = 0;
|
||||
struct disasm_info *disasm_info;
|
||||
struct intel_device_info *devinfo = NULL;
|
||||
int result = EXIT_FAILURE;
|
||||
list_inithead(&instr_labels);
|
||||
list_inithead(&target_labels);
|
||||
|
||||
const struct option i965_asm_opts[] = {
|
||||
{ "help", no_argument, (int *) &help, true },
|
||||
{ "type", required_argument, NULL, 't' },
|
||||
{ "gen", required_argument, NULL, 'g' },
|
||||
{ "output", required_argument, NULL, 'o' },
|
||||
{ "compact", no_argument, (int *) &compact, true },
|
||||
{ NULL, 0, NULL, 0 }
|
||||
};
|
||||
|
||||
while ((c = getopt_long(argc, argv, ":t:g:o:h", i965_asm_opts, NULL)) != -1) {
|
||||
switch (c) {
|
||||
case 'g': {
|
||||
const int id = intel_device_name_to_pci_device_id(optarg);
|
||||
if (id < 0) {
|
||||
fprintf(stderr, "can't parse gen: '%s', expected 3 letter "
|
||||
"platform name\n", optarg);
|
||||
goto end;
|
||||
} else {
|
||||
pci_id = id;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 'h':
|
||||
help = true;
|
||||
print_help(argv[0], stderr);
|
||||
goto end;
|
||||
case 't': {
|
||||
if (strcmp(optarg, "hex") == 0) {
|
||||
output_type = OPT_OUTPUT_HEX;
|
||||
} else if (strcmp(optarg, "c_literal") == 0) {
|
||||
output_type = OPT_OUTPUT_C_LITERAL;
|
||||
} else if (strcmp(optarg, "bin") == 0) {
|
||||
output_type = OPT_OUTPUT_BIN;
|
||||
} else {
|
||||
fprintf(stderr, "invalid value for --type: %s\n", optarg);
|
||||
goto end;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 'o':
|
||||
output_file = strdup(optarg);
|
||||
break;
|
||||
case 0:
|
||||
break;
|
||||
case ':':
|
||||
fprintf(stderr, "%s: option `-%c' requires an argument\n",
|
||||
argv[0], optopt);
|
||||
goto end;
|
||||
case '?':
|
||||
default:
|
||||
fprintf(stderr, "%s: option `-%c' is invalid: ignored\n",
|
||||
argv[0], optopt);
|
||||
goto end;
|
||||
}
|
||||
}
|
||||
|
||||
if (help || !pci_id) {
|
||||
print_help(argv[0], stderr);
|
||||
goto end;
|
||||
}
|
||||
|
||||
if (!argv[optind]) {
|
||||
fprintf(stderr, "Please specify input file\n");
|
||||
goto end;
|
||||
}
|
||||
|
||||
input_filename = strdup(argv[optind]);
|
||||
yyin = fopen(input_filename, "r");
|
||||
if (!yyin) {
|
||||
fprintf(stderr, "Unable to read input file : %s\n",
|
||||
input_filename);
|
||||
goto end;
|
||||
}
|
||||
|
||||
if (output_file) {
|
||||
output = fopen(output_file, "w");
|
||||
if (!output) {
|
||||
fprintf(stderr, "Couldn't open output file\n");
|
||||
goto end;
|
||||
}
|
||||
}
|
||||
|
||||
devinfo = i965_disasm_init(pci_id);
|
||||
if (!devinfo) {
|
||||
fprintf(stderr, "Unable to allocate memory for "
|
||||
"intel_device_info struct instance.\n");
|
||||
goto end;
|
||||
}
|
||||
|
||||
struct brw_isa_info isa;
|
||||
brw_init_isa_info(&isa, devinfo);
|
||||
|
||||
p = rzalloc(NULL, struct brw_codegen);
|
||||
brw_init_codegen(&isa, p, p);
|
||||
p->automatic_exec_sizes = false;
|
||||
|
||||
err = yyparse();
|
||||
if (err || errors)
|
||||
goto end;
|
||||
|
||||
if (!i965_postprocess_labels())
|
||||
goto end;
|
||||
|
||||
store = p->store;
|
||||
|
||||
disasm_info = disasm_initialize(p->isa, NULL);
|
||||
if (!disasm_info) {
|
||||
fprintf(stderr, "Unable to initialize disasm_info struct instance\n");
|
||||
goto end;
|
||||
}
|
||||
|
||||
if (output_type == OPT_OUTPUT_C_LITERAL)
|
||||
fprintf(output, "{\n");
|
||||
|
||||
brw_validate_instructions(p->isa, p->store, 0,
|
||||
p->next_insn_offset, disasm_info);
|
||||
|
||||
const int nr_insn = (p->next_insn_offset - start_offset) / 16;
|
||||
|
||||
if (compact)
|
||||
brw_compact_instructions(p, start_offset, disasm_info);
|
||||
|
||||
for (int i = 0; i < nr_insn; i++) {
|
||||
const brw_inst *insn = store + offset;
|
||||
bool compacted = false;
|
||||
|
||||
if (compact && brw_inst_cmpt_control(p->devinfo, insn)) {
|
||||
offset += 8;
|
||||
compacted = true;
|
||||
} else {
|
||||
offset += 16;
|
||||
}
|
||||
|
||||
print_instruction(output, compacted, insn);
|
||||
}
|
||||
|
||||
ralloc_free(disasm_info);
|
||||
|
||||
if (output_type == OPT_OUTPUT_C_LITERAL)
|
||||
fprintf(output, "}");
|
||||
|
||||
result = EXIT_SUCCESS;
|
||||
goto end;
|
||||
|
||||
end:
|
||||
free(input_filename);
|
||||
free(output_file);
|
||||
|
||||
if (yyin)
|
||||
fclose(yyin);
|
||||
|
||||
if (output)
|
||||
fclose(output);
|
||||
|
||||
if (p)
|
||||
ralloc_free(p);
|
||||
|
||||
if (devinfo)
|
||||
free(devinfo);
|
||||
|
||||
exit(result);
|
||||
}
|
||||
833
src/intel/compiler/elk/brw_cfg.cpp
Normal file
833
src/intel/compiler/elk/brw_cfg.cpp
Normal file
|
|
@ -0,0 +1,833 @@
|
|||
/*
|
||||
* Copyright © 2012 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* Authors:
|
||||
* Eric Anholt <eric@anholt.net>
|
||||
*
|
||||
*/
|
||||
|
||||
#include "brw_cfg.h"
|
||||
#include "util/u_dynarray.h"
|
||||
#include "brw_shader.h"
|
||||
|
||||
/** @file brw_cfg.cpp
|
||||
*
|
||||
* Walks the shader instructions generated and creates a set of basic
|
||||
* blocks with successor/predecessor edges connecting them.
|
||||
*/
|
||||
|
||||
using namespace brw;
|
||||
|
||||
/* Pop the block stored on top of a stack implemented as an exec_list
 * (the tail is the top) and return it.
 */
static bblock_t *
pop_stack(exec_list *list)
{
   bblock_link *tail = (bblock_link *)list->get_tail();
   bblock_t *top = tail->block;
   tail->link.remove();
   return top;
}
|
||||
|
||||
/* Allocate a bblock_link edge to `block` of the given kind out of
 * `mem_ctx` and return its embedded exec_node for list insertion.
 */
static exec_node *
link(void *mem_ctx, bblock_t *block, enum bblock_link_kind kind)
{
   bblock_link *edge = new(mem_ctx) bblock_link(block, kind);
   return &edge->link;
}
|
||||
|
||||
void
|
||||
push_stack(exec_list *list, void *mem_ctx, bblock_t *block)
|
||||
{
|
||||
/* The kind of the link is immaterial, but we need to provide one since
|
||||
* this is (ab)using the edge data structure in order to implement a stack.
|
||||
*/
|
||||
list->push_tail(link(mem_ctx, block, bblock_link_logical));
|
||||
}
|
||||
|
||||
/* Construct an empty basic block belonging to `cfg`, with empty
 * instruction, parent-edge and child-edge lists.
 */
bblock_t::bblock_t(cfg_t *cfg) :
   cfg(cfg), start_ip(0), end_ip(0), end_ip_delta(0), num(0)
{
   instructions.make_empty();
   parents.make_empty();
   children.make_empty();
}
|
||||
|
||||
void
|
||||
bblock_t::add_successor(void *mem_ctx, bblock_t *successor,
|
||||
enum bblock_link_kind kind)
|
||||
{
|
||||
successor->parents.push_tail(::link(mem_ctx, this, kind));
|
||||
children.push_tail(::link(mem_ctx, successor, kind));
|
||||
}
|
||||
|
||||
bool
|
||||
bblock_t::is_predecessor_of(const bblock_t *block,
|
||||
enum bblock_link_kind kind) const
|
||||
{
|
||||
foreach_list_typed_safe (bblock_link, parent, link, &block->parents) {
|
||||
if (parent->block == this && parent->kind <= kind) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool
|
||||
bblock_t::is_successor_of(const bblock_t *block,
|
||||
enum bblock_link_kind kind) const
|
||||
{
|
||||
foreach_list_typed_safe (bblock_link, child, link, &block->children) {
|
||||
if (child->block == this && child->kind <= kind) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool
|
||||
ends_block(const backend_instruction *inst)
|
||||
{
|
||||
enum opcode op = inst->opcode;
|
||||
|
||||
return op == BRW_OPCODE_IF ||
|
||||
op == BRW_OPCODE_ELSE ||
|
||||
op == BRW_OPCODE_CONTINUE ||
|
||||
op == BRW_OPCODE_BREAK ||
|
||||
op == BRW_OPCODE_DO ||
|
||||
op == BRW_OPCODE_WHILE;
|
||||
}
|
||||
|
||||
static bool
|
||||
starts_block(const backend_instruction *inst)
|
||||
{
|
||||
enum opcode op = inst->opcode;
|
||||
|
||||
return op == BRW_OPCODE_DO ||
|
||||
op == BRW_OPCODE_ENDIF;
|
||||
}
|
||||
|
||||
bool
|
||||
bblock_t::can_combine_with(const bblock_t *that) const
|
||||
{
|
||||
if ((const bblock_t *)this->link.next != that)
|
||||
return false;
|
||||
|
||||
if (ends_block(this->end()) ||
|
||||
starts_block(that->start()))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
bblock_t::combine_with(bblock_t *that)
|
||||
{
|
||||
assert(this->can_combine_with(that));
|
||||
foreach_list_typed (bblock_link, link, link, &that->parents) {
|
||||
assert(link->block == this);
|
||||
}
|
||||
|
||||
this->end_ip = that->end_ip;
|
||||
this->instructions.append_list(&that->instructions);
|
||||
|
||||
this->cfg->remove_block(that);
|
||||
}
|
||||
|
||||
void
|
||||
bblock_t::dump(FILE *file) const
|
||||
{
|
||||
const backend_shader *s = this->cfg->s;
|
||||
|
||||
int ip = this->start_ip;
|
||||
foreach_inst_in_block(backend_instruction, inst, this) {
|
||||
fprintf(file, "%5d: ", ip);
|
||||
s->dump_instruction(inst, file);
|
||||
ip++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Disconnect this block from one side of the CFG: `list` must be either
 * this block's parents or children list. Every edge on `list` is freed,
 * and for each linked block the corresponding back-edge (pointing at this
 * block) on its opposite-side list is removed and freed as well.
 */
void
bblock_t::unlink_list(exec_list *list)
{
   assert(list == &parents || list == &children);
   /* If we are walking our children, the back-edges live in the other
    * block's parents list, and vice versa.
    */
   const bool remove_parent = list == &children;

   foreach_list_typed_safe(bblock_link, link, link, list) {
      /* Also break the links from the other block back to this block. */
      exec_list *sub_list = remove_parent ? &link->block->parents : &link->block->children;

      foreach_list_typed_safe(bblock_link, sub_link, link, sub_list) {
         if (sub_link->block == this) {
            sub_link->link.remove();
            ralloc_free(sub_link);
         }
      }

      link->link.remove();
      ralloc_free(link);
   }
}
|
||||
|
||||
/* Build the control-flow graph for `instructions`: walk the (structured)
 * instruction stream once, moving each instruction into a basic block and
 * creating successor/predecessor edges at IF/ELSE/ENDIF/DO/WHILE/BREAK/
 * CONTINUE boundaries. Nesting is handled with explicit stacks of the
 * current if/else and do/while context blocks.
 */
cfg_t::cfg_t(const backend_shader *s, exec_list *instructions) :
   s(s)
{
   mem_ctx = ralloc_context(NULL);
   block_list.make_empty();
   blocks = NULL;
   num_blocks = 0;

   bblock_t *cur = NULL;
   int ip = 0;

   bblock_t *entry = new_block();
   bblock_t *cur_if = NULL;    /**< BB ending with IF. */
   bblock_t *cur_else = NULL;  /**< BB ending with ELSE. */
   bblock_t *cur_do = NULL;    /**< BB starting with DO. */
   bblock_t *cur_while = NULL; /**< BB immediately following WHILE. */
   exec_list if_stack, else_stack, do_stack, while_stack;
   bblock_t *next;

   set_next_block(&cur, entry, ip);

   foreach_in_list_safe(backend_instruction, inst, instructions) {
      /* set_next_block wants the post-incremented ip */
      ip++;

      /* Instructions are moved out of `instructions` into the blocks. */
      inst->exec_node::remove();

      switch (inst->opcode) {
      case BRW_OPCODE_IF:
         cur->instructions.push_tail(inst);

         /* Push our information onto a stack so we can recover from
          * nested ifs.
          */
         push_stack(&if_stack, mem_ctx, cur_if);
         push_stack(&else_stack, mem_ctx, cur_else);

         cur_if = cur;
         cur_else = NULL;

         /* Set up our immediately following block, full of "then"
          * instructions.
          */
         next = new_block();
         cur_if->add_successor(mem_ctx, next, bblock_link_logical);

         set_next_block(&cur, next, ip);
         break;

      case BRW_OPCODE_ELSE:
         cur->instructions.push_tail(inst);

         cur_else = cur;

         next = new_block();
         assert(cur_if != NULL);
         cur_if->add_successor(mem_ctx, next, bblock_link_logical);
         cur_else->add_successor(mem_ctx, next, bblock_link_physical);

         set_next_block(&cur, next, ip);
         break;

      case BRW_OPCODE_ENDIF: {
         bblock_t *cur_endif;

         if (cur->instructions.is_empty()) {
            /* New block was just created; use it. */
            cur_endif = cur;
         } else {
            cur_endif = new_block();

            cur->add_successor(mem_ctx, cur_endif, bblock_link_logical);

            set_next_block(&cur, cur_endif, ip - 1);
         }

         cur->instructions.push_tail(inst);

         /* The ENDIF block is reached from either the "then" path (no
          * ELSE seen) or the "else" path.
          */
         if (cur_else) {
            cur_else->add_successor(mem_ctx, cur_endif, bblock_link_logical);
         } else {
            assert(cur_if != NULL);
            cur_if->add_successor(mem_ctx, cur_endif, bblock_link_logical);
         }

         assert(cur_if->end()->opcode == BRW_OPCODE_IF);
         assert(!cur_else || cur_else->end()->opcode == BRW_OPCODE_ELSE);

         /* Pop the stack so we're in the previous if/else/endif */
         cur_if = pop_stack(&if_stack);
         cur_else = pop_stack(&else_stack);
         break;
      }
      case BRW_OPCODE_DO:
         /* Push our information onto a stack so we can recover from
          * nested loops.
          */
         push_stack(&do_stack, mem_ctx, cur_do);
         push_stack(&while_stack, mem_ctx, cur_while);

         /* Set up the block just after the while. Don't know when exactly
          * it will start, yet.
          */
         cur_while = new_block();

         if (cur->instructions.is_empty()) {
            /* New block was just created; use it. */
            cur_do = cur;
         } else {
            cur_do = new_block();

            cur->add_successor(mem_ctx, cur_do, bblock_link_logical);

            set_next_block(&cur, cur_do, ip - 1);
         }

         cur->instructions.push_tail(inst);

         /* Represent divergent execution of the loop as a pair of alternative
          * edges coming out of the DO instruction: For any physical iteration
          * of the loop a given logical thread can either start off enabled
          * (which is represented as the "next" successor), or disabled (if it
          * has reached a non-uniform exit of the loop during a previous
          * iteration, which is represented as the "cur_while" successor).
          *
          * The disabled edge will be taken by the logical thread anytime we
          * arrive at the DO instruction through a back-edge coming from a
          * conditional exit of the loop where divergent control flow started.
          *
          * This guarantees that there is a control-flow path from any
          * divergence point of the loop into the convergence point
          * (immediately past the WHILE instruction) such that it overlaps the
          * whole IP region of divergent control flow (potentially the whole
          * loop) *and* doesn't imply the execution of any instructions part
          * of the loop (since the corresponding execution mask bit will be
          * disabled for a diverging thread).
          *
          * This way we make sure that any variables that are live throughout
          * the region of divergence for an inactive logical thread are also
          * considered to interfere with any other variables assigned by
          * active logical threads within the same physical region of the
          * program, since otherwise we would risk cross-channel data
          * corruption.
          */
         next = new_block();
         cur->add_successor(mem_ctx, next, bblock_link_logical);
         cur->add_successor(mem_ctx, cur_while, bblock_link_physical);
         set_next_block(&cur, next, ip);
         break;

      case BRW_OPCODE_CONTINUE:
         cur->instructions.push_tail(inst);

         /* A conditional CONTINUE may start a region of divergent control
          * flow until the start of the next loop iteration (*not* until the
          * end of the loop which is why the successor is not the top-level
          * divergence point at cur_do). The live interval of any variable
          * extending through a CONTINUE edge is guaranteed to overlap the
          * whole region of divergent execution, because any variable live-out
          * at the CONTINUE instruction will also be live-in at the top of the
          * loop, and therefore also live-out at the bottom-most point of the
          * loop which is reachable from the top (since a control flow path
          * exists from a definition of the variable through this CONTINUE
          * instruction, the top of the loop, the (reachable) bottom of the
          * loop, the top of the loop again, into a use of the variable).
          */
         assert(cur_do != NULL);
         cur->add_successor(mem_ctx, cur_do->next(), bblock_link_logical);

         next = new_block();
         if (inst->predicate)
            cur->add_successor(mem_ctx, next, bblock_link_logical);
         else
            cur->add_successor(mem_ctx, next, bblock_link_physical);

         set_next_block(&cur, next, ip);
         break;

      case BRW_OPCODE_BREAK:
         cur->instructions.push_tail(inst);

         /* A conditional BREAK instruction may start a region of divergent
          * control flow until the end of the loop if the condition is
          * non-uniform, in which case the loop will execute additional
          * iterations with the present channel disabled. We model this as a
          * control flow path from the divergence point to the convergence
          * point that overlaps the whole IP range of the loop and skips over
          * the execution of any other instructions part of the loop.
          *
          * See the DO case for additional explanation.
          */
         assert(cur_do != NULL);
         cur->add_successor(mem_ctx, cur_do, bblock_link_physical);
         cur->add_successor(mem_ctx, cur_while, bblock_link_logical);

         next = new_block();
         if (inst->predicate)
            cur->add_successor(mem_ctx, next, bblock_link_logical);
         else
            cur->add_successor(mem_ctx, next, bblock_link_physical);

         set_next_block(&cur, next, ip);
         break;

      case BRW_OPCODE_WHILE:
         cur->instructions.push_tail(inst);

         assert(cur_do != NULL && cur_while != NULL);

         /* A conditional WHILE instruction may start a region of divergent
          * control flow until the end of the loop, just like the BREAK
          * instruction. See the BREAK case for more details. OTOH an
          * unconditional WHILE instruction is non-divergent (just like an
          * unconditional CONTINUE), and will necessarily lead to the
          * execution of an additional iteration of the loop for all enabled
          * channels, so we may skip over the divergence point at the top of
          * the loop to keep the CFG as unambiguous as possible.
          */
         if (inst->predicate) {
            cur->add_successor(mem_ctx, cur_do, bblock_link_logical);
         } else {
            cur->add_successor(mem_ctx, cur_do->next(), bblock_link_logical);
         }

         set_next_block(&cur, cur_while, ip);

         /* Pop the stack so we're in the previous loop */
         cur_do = pop_stack(&do_stack);
         cur_while = pop_stack(&while_stack);
         break;

      default:
         cur->instructions.push_tail(inst);
         break;
      }
   }

   cur->end_ip = ip - 1;

   make_block_array();
}
|
||||
|
||||
/* All CFG data (blocks, edges, block array) is allocated out of mem_ctx,
 * so freeing the context releases everything at once.
 */
cfg_t::~cfg_t()
{
   ralloc_free(mem_ctx);
}
|
||||
|
||||
void
|
||||
cfg_t::remove_block(bblock_t *block)
|
||||
{
|
||||
foreach_list_typed_safe (bblock_link, predecessor, link, &block->parents) {
|
||||
/* cfg_t::validate checks that predecessor and successor lists are well
|
||||
* formed, so it is known that the loop here would find exactly one
|
||||
* block. Set old_link_kind to silence "variable used but not set"
|
||||
* warnings.
|
||||
*/
|
||||
bblock_link_kind old_link_kind = bblock_link_logical;
|
||||
|
||||
/* Remove block from all of its predecessors' successor lists. */
|
||||
foreach_list_typed_safe (bblock_link, successor, link,
|
||||
&predecessor->block->children) {
|
||||
if (block == successor->block) {
|
||||
old_link_kind = successor->kind;
|
||||
successor->link.remove();
|
||||
ralloc_free(successor);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Add removed-block's successors to its predecessors' successor lists. */
|
||||
foreach_list_typed (bblock_link, successor, link, &block->children) {
|
||||
bool need_to_link = true;
|
||||
bblock_link_kind new_link_kind = MAX2(old_link_kind, successor->kind);
|
||||
|
||||
foreach_list_typed_safe (bblock_link, child, link, &predecessor->block->children) {
|
||||
/* There is already a link between the two blocks. If the links
|
||||
* are the same kind or the link is logical, do nothing. If the
|
||||
* existing link is physical and the proposed new link is logical,
|
||||
* promote the existing link to logical.
|
||||
*
|
||||
* This is accomplished by taking the minimum of the existing link
|
||||
* kind and the proposed link kind.
|
||||
*/
|
||||
if (child->block == successor->block) {
|
||||
child->kind = MIN2(child->kind, new_link_kind);
|
||||
need_to_link = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (need_to_link) {
|
||||
predecessor->block->children.push_tail(link(mem_ctx,
|
||||
successor->block,
|
||||
new_link_kind));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
foreach_list_typed_safe (bblock_link, successor, link, &block->children) {
|
||||
/* cfg_t::validate checks that predecessor and successor lists are well
|
||||
* formed, so it is known that the loop here would find exactly one
|
||||
* block. Set old_link_kind to silence "variable used but not set"
|
||||
* warnings.
|
||||
*/
|
||||
bblock_link_kind old_link_kind = bblock_link_logical;
|
||||
|
||||
/* Remove block from all of its childrens' parents lists. */
|
||||
foreach_list_typed_safe (bblock_link, predecessor, link,
|
||||
&successor->block->parents) {
|
||||
if (block == predecessor->block) {
|
||||
old_link_kind = predecessor->kind;
|
||||
predecessor->link.remove();
|
||||
ralloc_free(predecessor);
|
||||
}
|
||||
}
|
||||
|
||||
/* Add removed-block's predecessors to its successors' predecessor lists. */
|
||||
foreach_list_typed (bblock_link, predecessor, link, &block->parents) {
|
||||
bool need_to_link = true;
|
||||
bblock_link_kind new_link_kind = MAX2(old_link_kind, predecessor->kind);
|
||||
|
||||
foreach_list_typed_safe (bblock_link, parent, link, &successor->block->parents) {
|
||||
/* There is already a link between the two blocks. If the links
|
||||
* are the same kind or the link is logical, do nothing. If the
|
||||
* existing link is physical and the proposed new link is logical,
|
||||
* promote the existing link to logical.
|
||||
*
|
||||
* This is accomplished by taking the minimum of the existing link
|
||||
* kind and the proposed link kind.
|
||||
*/
|
||||
if (parent->block == predecessor->block) {
|
||||
parent->kind = MIN2(parent->kind, new_link_kind);
|
||||
need_to_link = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (need_to_link) {
|
||||
successor->block->parents.push_tail(link(mem_ctx,
|
||||
predecessor->block,
|
||||
new_link_kind));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
block->link.remove();
|
||||
|
||||
for (int b = block->num; b < this->num_blocks - 1; b++) {
|
||||
this->blocks[b] = this->blocks[b + 1];
|
||||
this->blocks[b]->num = b;
|
||||
}
|
||||
|
||||
this->blocks[this->num_blocks - 1]->num = this->num_blocks - 2;
|
||||
this->num_blocks--;
|
||||
}
|
||||
|
||||
bblock_t *
|
||||
cfg_t::new_block()
|
||||
{
|
||||
bblock_t *block = new(mem_ctx) bblock_t(this);
|
||||
|
||||
return block;
|
||||
}
|
||||
|
||||
void
|
||||
cfg_t::set_next_block(bblock_t **cur, bblock_t *block, int ip)
|
||||
{
|
||||
if (*cur) {
|
||||
(*cur)->end_ip = ip - 1;
|
||||
}
|
||||
|
||||
block->start_ip = ip;
|
||||
block->num = num_blocks++;
|
||||
block_list.push_tail(&block->link);
|
||||
*cur = block;
|
||||
}
|
||||
|
||||
void
|
||||
cfg_t::make_block_array()
|
||||
{
|
||||
blocks = ralloc_array(mem_ctx, bblock_t *, num_blocks);
|
||||
|
||||
int i = 0;
|
||||
foreach_block (block, this) {
|
||||
blocks[i++] = block;
|
||||
}
|
||||
assert(i == num_blocks);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
struct link_desc {
|
||||
char kind;
|
||||
int num;
|
||||
};
|
||||
|
||||
int
|
||||
compare_link_desc(const void *a, const void *b)
|
||||
{
|
||||
const link_desc *la = (const link_desc *)a;
|
||||
const link_desc *lb = (const link_desc *)b;
|
||||
|
||||
return la->num < lb->num ? -1 :
|
||||
la->num > lb->num ? +1 :
|
||||
la->kind < lb->kind ? -1 :
|
||||
la->kind > lb->kind ? +1 :
|
||||
0;
|
||||
}
|
||||
|
||||
void
|
||||
sort_links(util_dynarray *scratch, exec_list *list)
|
||||
{
|
||||
util_dynarray_clear(scratch);
|
||||
foreach_list_typed(bblock_link, link, link, list) {
|
||||
link_desc l;
|
||||
l.kind = link->kind == bblock_link_logical ? '-' : '~';
|
||||
l.num = link->block->num;
|
||||
util_dynarray_append(scratch, link_desc, l);
|
||||
}
|
||||
qsort(scratch->data, util_dynarray_num_elements(scratch, link_desc),
|
||||
sizeof(link_desc), compare_link_desc);
|
||||
}
|
||||
|
||||
} /* namespace */
|
||||
|
||||
void
|
||||
cfg_t::dump(FILE *file)
|
||||
{
|
||||
const idom_tree *idom = (s ? &s->idom_analysis.require() : NULL);
|
||||
|
||||
/* Temporary storage to sort the lists of blocks. This normalizes the
|
||||
* output, making it possible to use it for certain tests.
|
||||
*/
|
||||
util_dynarray scratch;
|
||||
util_dynarray_init(&scratch, NULL);
|
||||
|
||||
foreach_block (block, this) {
|
||||
if (idom && idom->parent(block))
|
||||
fprintf(file, "START B%d IDOM(B%d)", block->num,
|
||||
idom->parent(block)->num);
|
||||
else
|
||||
fprintf(file, "START B%d IDOM(none)", block->num);
|
||||
|
||||
sort_links(&scratch, &block->parents);
|
||||
util_dynarray_foreach(&scratch, link_desc, l)
|
||||
fprintf(file, " <%cB%d", l->kind, l->num);
|
||||
fprintf(file, "\n");
|
||||
|
||||
if (s != NULL)
|
||||
block->dump(file);
|
||||
fprintf(file, "END B%d", block->num);
|
||||
|
||||
sort_links(&scratch, &block->children);
|
||||
util_dynarray_foreach(&scratch, link_desc, l)
|
||||
fprintf(file, " %c>B%d", l->kind, l->num);
|
||||
fprintf(file, "\n");
|
||||
}
|
||||
|
||||
util_dynarray_fini(&scratch);
|
||||
}
|
||||
|
||||
/* Calculates the immediate dominator of each block, according to "A Simple,
|
||||
* Fast Dominance Algorithm" by Keith D. Cooper, Timothy J. Harvey, and Ken
|
||||
* Kennedy.
|
||||
*
|
||||
* The authors claim that for control flow graphs of sizes normally encountered
|
||||
* (less than 1000 nodes) that this algorithm is significantly faster than
|
||||
* others like Lengauer-Tarjan.
|
||||
*/
|
||||
idom_tree::idom_tree(const backend_shader *s) :
|
||||
num_parents(s->cfg->num_blocks),
|
||||
parents(new bblock_t *[num_parents]())
|
||||
{
|
||||
bool changed;
|
||||
|
||||
parents[0] = s->cfg->blocks[0];
|
||||
|
||||
do {
|
||||
changed = false;
|
||||
|
||||
foreach_block(block, s->cfg) {
|
||||
if (block->num == 0)
|
||||
continue;
|
||||
|
||||
bblock_t *new_idom = NULL;
|
||||
foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
|
||||
if (parent(parent_link->block)) {
|
||||
new_idom = (new_idom ? intersect(new_idom, parent_link->block) :
|
||||
parent_link->block);
|
||||
}
|
||||
}
|
||||
|
||||
if (parent(block) != new_idom) {
|
||||
parents[block->num] = new_idom;
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
} while (changed);
|
||||
}
|
||||
|
||||
idom_tree::~idom_tree()
|
||||
{
|
||||
delete[] parents;
|
||||
}
|
||||
|
||||
bblock_t *
|
||||
idom_tree::intersect(bblock_t *b1, bblock_t *b2) const
|
||||
{
|
||||
/* Note, the comparisons here are the opposite of what the paper says
|
||||
* because we index blocks from beginning -> end (i.e. reverse post-order)
|
||||
* instead of post-order like they assume.
|
||||
*/
|
||||
while (b1->num != b2->num) {
|
||||
while (b1->num > b2->num)
|
||||
b1 = parent(b1);
|
||||
while (b2->num > b1->num)
|
||||
b2 = parent(b2);
|
||||
}
|
||||
assert(b1);
|
||||
return b1;
|
||||
}
|
||||
|
||||
void
|
||||
idom_tree::dump() const
|
||||
{
|
||||
printf("digraph DominanceTree {\n");
|
||||
for (unsigned i = 0; i < num_parents; i++)
|
||||
printf("\t%d -> %d\n", parents[i]->num, i);
|
||||
printf("}\n");
|
||||
}
|
||||
|
||||
void
|
||||
cfg_t::dump_cfg()
|
||||
{
|
||||
printf("digraph CFG {\n");
|
||||
for (int b = 0; b < num_blocks; b++) {
|
||||
bblock_t *block = this->blocks[b];
|
||||
|
||||
foreach_list_typed_safe (bblock_link, child, link, &block->children) {
|
||||
printf("\t%d -> %d\n", b, child->block->num);
|
||||
}
|
||||
}
|
||||
printf("}\n");
|
||||
}
|
||||
|
||||
#define cfgv_assert(assertion) \
|
||||
{ \
|
||||
if (!(assertion)) { \
|
||||
fprintf(stderr, "ASSERT: CFG validation in %s failed!\n", stage_abbrev); \
|
||||
fprintf(stderr, "%s:%d: '%s' failed\n", __FILE__, __LINE__, #assertion); \
|
||||
abort(); \
|
||||
} \
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
void
|
||||
cfg_t::validate(const char *stage_abbrev)
|
||||
{
|
||||
foreach_block(block, this) {
|
||||
foreach_list_typed(bblock_link, successor, link, &block->children) {
|
||||
/* Each successor of a block must have one predecessor link back to
|
||||
* the block.
|
||||
*/
|
||||
bool successor_links_back_to_predecessor = false;
|
||||
bblock_t *succ_block = successor->block;
|
||||
|
||||
foreach_list_typed(bblock_link, predecessor, link, &succ_block->parents) {
|
||||
if (predecessor->block == block) {
|
||||
cfgv_assert(!successor_links_back_to_predecessor);
|
||||
cfgv_assert(successor->kind == predecessor->kind);
|
||||
successor_links_back_to_predecessor = true;
|
||||
}
|
||||
}
|
||||
|
||||
cfgv_assert(successor_links_back_to_predecessor);
|
||||
|
||||
/* Each successor block must appear only once in the list of
|
||||
* successors.
|
||||
*/
|
||||
foreach_list_typed_from(bblock_link, later_successor, link,
|
||||
&block->children, successor->link.next) {
|
||||
cfgv_assert(successor->block != later_successor->block);
|
||||
}
|
||||
}
|
||||
|
||||
foreach_list_typed(bblock_link, predecessor, link, &block->parents) {
|
||||
/* Each predecessor of a block must have one successor link back to
|
||||
* the block.
|
||||
*/
|
||||
bool predecessor_links_back_to_successor = false;
|
||||
bblock_t *pred_block = predecessor->block;
|
||||
|
||||
foreach_list_typed(bblock_link, successor, link, &pred_block->children) {
|
||||
if (successor->block == block) {
|
||||
cfgv_assert(!predecessor_links_back_to_successor);
|
||||
cfgv_assert(successor->kind == predecessor->kind);
|
||||
predecessor_links_back_to_successor = true;
|
||||
}
|
||||
}
|
||||
|
||||
cfgv_assert(predecessor_links_back_to_successor);
|
||||
|
||||
/* Each precessor block must appear only once in the list of
|
||||
* precessors.
|
||||
*/
|
||||
foreach_list_typed_from(bblock_link, later_precessor, link,
|
||||
&block->parents, predecessor->link.next) {
|
||||
cfgv_assert(predecessor->block != later_precessor->block);
|
||||
}
|
||||
}
|
||||
|
||||
backend_instruction *first_inst = block->start();
|
||||
if (first_inst->opcode == BRW_OPCODE_DO) {
|
||||
/* DO instructions both begin and end a block, so the DO instruction
|
||||
* must be the only instruction in the block.
|
||||
*/
|
||||
cfgv_assert(exec_list_is_singular(&block->instructions));
|
||||
|
||||
/* A block starting with DO should have exactly two successors. One
|
||||
* is a physical link to the block starting after the WHILE
|
||||
* instruction. The other is a logical link to the block starting the
|
||||
* body of the loop.
|
||||
*/
|
||||
bblock_t *physical_block = nullptr;
|
||||
bblock_t *logical_block = nullptr;
|
||||
|
||||
foreach_list_typed(bblock_link, child, link, &block->children) {
|
||||
if (child->kind == bblock_link_physical) {
|
||||
cfgv_assert(physical_block == nullptr);
|
||||
physical_block = child->block;
|
||||
} else {
|
||||
cfgv_assert(logical_block == nullptr);
|
||||
logical_block = child->block;
|
||||
}
|
||||
}
|
||||
|
||||
cfgv_assert(logical_block != nullptr);
|
||||
cfgv_assert(physical_block != nullptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
532
src/intel/compiler/elk/brw_cfg.h
Normal file
532
src/intel/compiler/elk/brw_cfg.h
Normal file
|
|
@ -0,0 +1,532 @@
|
|||
/*
|
||||
* Copyright © 2012 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* Authors:
|
||||
* Eric Anholt <eric@anholt.net>
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef BRW_CFG_H
|
||||
#define BRW_CFG_H
|
||||
|
||||
#include "brw_ir.h"
|
||||
#ifdef __cplusplus
|
||||
#include "brw_ir_analysis.h"
|
||||
#endif
|
||||
|
||||
struct bblock_t;
|
||||
|
||||
/**
|
||||
* CFG edge types.
|
||||
*
|
||||
* A logical edge represents a potential control flow path of the original
|
||||
* scalar program, while a physical edge represents a control flow path that
|
||||
* may not have existed in the original program but was introduced during
|
||||
* vectorization in order to implement divergent control flow of different
|
||||
* shader invocations within the same SIMD thread.
|
||||
*
|
||||
* All logical edges in the CFG are considered to be physical edges but not
|
||||
* the other way around -- I.e. the logical CFG is a subset of the physical
|
||||
* one.
|
||||
*/
|
||||
enum bblock_link_kind {
|
||||
bblock_link_logical = 0,
|
||||
bblock_link_physical
|
||||
};
|
||||
|
||||
struct bblock_link {
|
||||
#ifdef __cplusplus
|
||||
DECLARE_RALLOC_CXX_OPERATORS(bblock_link)
|
||||
|
||||
bblock_link(bblock_t *block, enum bblock_link_kind kind)
|
||||
: block(block), kind(kind)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
struct exec_node link;
|
||||
struct bblock_t *block;
|
||||
|
||||
/* Type of this CFG edge. Because bblock_link_logical also implies
|
||||
* bblock_link_physical, the proper way to test for membership of edge 'l'
|
||||
* in CFG kind 'k' is 'l.kind <= k'.
|
||||
*/
|
||||
enum bblock_link_kind kind;
|
||||
};
|
||||
|
||||
struct backend_shader;
|
||||
struct cfg_t;
|
||||
|
||||
struct bblock_t {
|
||||
#ifdef __cplusplus
|
||||
DECLARE_RALLOC_CXX_OPERATORS(bblock_t)
|
||||
|
||||
explicit bblock_t(cfg_t *cfg);
|
||||
|
||||
void add_successor(void *mem_ctx, bblock_t *successor,
|
||||
enum bblock_link_kind kind);
|
||||
bool is_predecessor_of(const bblock_t *block,
|
||||
enum bblock_link_kind kind) const;
|
||||
bool is_successor_of(const bblock_t *block,
|
||||
enum bblock_link_kind kind) const;
|
||||
bool can_combine_with(const bblock_t *that) const;
|
||||
void combine_with(bblock_t *that);
|
||||
void dump(FILE *file = stderr) const;
|
||||
|
||||
backend_instruction *start();
|
||||
const backend_instruction *start() const;
|
||||
backend_instruction *end();
|
||||
const backend_instruction *end() const;
|
||||
|
||||
bblock_t *next();
|
||||
const bblock_t *next() const;
|
||||
bblock_t *prev();
|
||||
const bblock_t *prev() const;
|
||||
|
||||
bool starts_with_control_flow() const;
|
||||
bool ends_with_control_flow() const;
|
||||
|
||||
backend_instruction *first_non_control_flow_inst();
|
||||
backend_instruction *last_non_control_flow_inst();
|
||||
|
||||
private:
|
||||
/**
|
||||
* \sa unlink_parents, unlink_children
|
||||
*/
|
||||
void unlink_list(exec_list *);
|
||||
|
||||
public:
|
||||
void unlink_parents()
|
||||
{
|
||||
unlink_list(&parents);
|
||||
}
|
||||
|
||||
void unlink_children()
|
||||
{
|
||||
unlink_list(&children);
|
||||
}
|
||||
#endif
|
||||
|
||||
struct exec_node link;
|
||||
struct cfg_t *cfg;
|
||||
|
||||
int start_ip;
|
||||
int end_ip;
|
||||
|
||||
/**
|
||||
* Change in end_ip since the last time IPs of later blocks were updated.
|
||||
*/
|
||||
int end_ip_delta;
|
||||
|
||||
struct exec_list instructions;
|
||||
struct exec_list parents;
|
||||
struct exec_list children;
|
||||
int num;
|
||||
};
|
||||
|
||||
static inline struct backend_instruction *
|
||||
bblock_start(struct bblock_t *block)
|
||||
{
|
||||
return (struct backend_instruction *)exec_list_get_head(&block->instructions);
|
||||
}
|
||||
|
||||
static inline const struct backend_instruction *
|
||||
bblock_start_const(const struct bblock_t *block)
|
||||
{
|
||||
return (const struct backend_instruction *)exec_list_get_head_const(&block->instructions);
|
||||
}
|
||||
|
||||
static inline struct backend_instruction *
|
||||
bblock_end(struct bblock_t *block)
|
||||
{
|
||||
return (struct backend_instruction *)exec_list_get_tail(&block->instructions);
|
||||
}
|
||||
|
||||
static inline const struct backend_instruction *
|
||||
bblock_end_const(const struct bblock_t *block)
|
||||
{
|
||||
return (const struct backend_instruction *)exec_list_get_tail_const(&block->instructions);
|
||||
}
|
||||
|
||||
static inline struct bblock_t *
|
||||
bblock_next(struct bblock_t *block)
|
||||
{
|
||||
if (exec_node_is_tail_sentinel(block->link.next))
|
||||
return NULL;
|
||||
|
||||
return (struct bblock_t *)block->link.next;
|
||||
}
|
||||
|
||||
static inline const struct bblock_t *
|
||||
bblock_next_const(const struct bblock_t *block)
|
||||
{
|
||||
if (exec_node_is_tail_sentinel(block->link.next))
|
||||
return NULL;
|
||||
|
||||
return (const struct bblock_t *)block->link.next;
|
||||
}
|
||||
|
||||
static inline struct bblock_t *
|
||||
bblock_prev(struct bblock_t *block)
|
||||
{
|
||||
if (exec_node_is_head_sentinel(block->link.prev))
|
||||
return NULL;
|
||||
|
||||
return (struct bblock_t *)block->link.prev;
|
||||
}
|
||||
|
||||
static inline const struct bblock_t *
|
||||
bblock_prev_const(const struct bblock_t *block)
|
||||
{
|
||||
if (exec_node_is_head_sentinel(block->link.prev))
|
||||
return NULL;
|
||||
|
||||
return (const struct bblock_t *)block->link.prev;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
bblock_starts_with_control_flow(const struct bblock_t *block)
|
||||
{
|
||||
enum opcode op = bblock_start_const(block)->opcode;
|
||||
return op == BRW_OPCODE_DO || op == BRW_OPCODE_ENDIF;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
bblock_ends_with_control_flow(const struct bblock_t *block)
|
||||
{
|
||||
enum opcode op = bblock_end_const(block)->opcode;
|
||||
return op == BRW_OPCODE_IF ||
|
||||
op == BRW_OPCODE_ELSE ||
|
||||
op == BRW_OPCODE_WHILE ||
|
||||
op == BRW_OPCODE_BREAK ||
|
||||
op == BRW_OPCODE_CONTINUE;
|
||||
}
|
||||
|
||||
static inline struct backend_instruction *
|
||||
bblock_first_non_control_flow_inst(struct bblock_t *block)
|
||||
{
|
||||
struct backend_instruction *inst = bblock_start(block);
|
||||
if (bblock_starts_with_control_flow(block))
|
||||
#ifdef __cplusplus
|
||||
inst = (struct backend_instruction *)inst->next;
|
||||
#else
|
||||
inst = (struct backend_instruction *)inst->link.next;
|
||||
#endif
|
||||
return inst;
|
||||
}
|
||||
|
||||
static inline struct backend_instruction *
|
||||
bblock_last_non_control_flow_inst(struct bblock_t *block)
|
||||
{
|
||||
struct backend_instruction *inst = bblock_end(block);
|
||||
if (bblock_ends_with_control_flow(block))
|
||||
#ifdef __cplusplus
|
||||
inst = (struct backend_instruction *)inst->prev;
|
||||
#else
|
||||
inst = (struct backend_instruction *)inst->link.prev;
|
||||
#endif
|
||||
return inst;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
inline backend_instruction *
|
||||
bblock_t::start()
|
||||
{
|
||||
return bblock_start(this);
|
||||
}
|
||||
|
||||
inline const backend_instruction *
|
||||
bblock_t::start() const
|
||||
{
|
||||
return bblock_start_const(this);
|
||||
}
|
||||
|
||||
inline backend_instruction *
|
||||
bblock_t::end()
|
||||
{
|
||||
return bblock_end(this);
|
||||
}
|
||||
|
||||
inline const backend_instruction *
|
||||
bblock_t::end() const
|
||||
{
|
||||
return bblock_end_const(this);
|
||||
}
|
||||
|
||||
inline bblock_t *
|
||||
bblock_t::next()
|
||||
{
|
||||
return bblock_next(this);
|
||||
}
|
||||
|
||||
inline const bblock_t *
|
||||
bblock_t::next() const
|
||||
{
|
||||
return bblock_next_const(this);
|
||||
}
|
||||
|
||||
inline bblock_t *
|
||||
bblock_t::prev()
|
||||
{
|
||||
return bblock_prev(this);
|
||||
}
|
||||
|
||||
inline const bblock_t *
|
||||
bblock_t::prev() const
|
||||
{
|
||||
return bblock_prev_const(this);
|
||||
}
|
||||
|
||||
inline bool
|
||||
bblock_t::starts_with_control_flow() const
|
||||
{
|
||||
return bblock_starts_with_control_flow(this);
|
||||
}
|
||||
|
||||
inline bool
|
||||
bblock_t::ends_with_control_flow() const
|
||||
{
|
||||
return bblock_ends_with_control_flow(this);
|
||||
}
|
||||
|
||||
inline backend_instruction *
|
||||
bblock_t::first_non_control_flow_inst()
|
||||
{
|
||||
return bblock_first_non_control_flow_inst(this);
|
||||
}
|
||||
|
||||
inline backend_instruction *
|
||||
bblock_t::last_non_control_flow_inst()
|
||||
{
|
||||
return bblock_last_non_control_flow_inst(this);
|
||||
}
|
||||
#endif
|
||||
|
||||
struct cfg_t {
|
||||
#ifdef __cplusplus
|
||||
DECLARE_RALLOC_CXX_OPERATORS(cfg_t)
|
||||
|
||||
cfg_t(const backend_shader *s, exec_list *instructions);
|
||||
~cfg_t();
|
||||
|
||||
void remove_block(bblock_t *block);
|
||||
|
||||
bblock_t *first_block();
|
||||
const bblock_t *first_block() const;
|
||||
bblock_t *last_block();
|
||||
const bblock_t *last_block() const;
|
||||
|
||||
bblock_t *new_block();
|
||||
void set_next_block(bblock_t **cur, bblock_t *block, int ip);
|
||||
void make_block_array();
|
||||
|
||||
void dump(FILE *file = stderr);
|
||||
void dump_cfg();
|
||||
|
||||
#ifdef NDEBUG
|
||||
void validate(UNUSED const char *stage_abbrev) { }
|
||||
#else
|
||||
void validate(const char *stage_abbrev);
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Propagate bblock_t::end_ip_delta data through the CFG.
|
||||
*/
|
||||
inline void adjust_block_ips();
|
||||
|
||||
#endif
|
||||
const struct backend_shader *s;
|
||||
void *mem_ctx;
|
||||
|
||||
/** Ordered list (by ip) of basic blocks */
|
||||
struct exec_list block_list;
|
||||
struct bblock_t **blocks;
|
||||
int num_blocks;
|
||||
};
|
||||
|
||||
static inline struct bblock_t *
|
||||
cfg_first_block(struct cfg_t *cfg)
|
||||
{
|
||||
return (struct bblock_t *)exec_list_get_head(&cfg->block_list);
|
||||
}
|
||||
|
||||
static inline const struct bblock_t *
|
||||
cfg_first_block_const(const struct cfg_t *cfg)
|
||||
{
|
||||
return (const struct bblock_t *)exec_list_get_head_const(&cfg->block_list);
|
||||
}
|
||||
|
||||
static inline struct bblock_t *
|
||||
cfg_last_block(struct cfg_t *cfg)
|
||||
{
|
||||
return (struct bblock_t *)exec_list_get_tail(&cfg->block_list);
|
||||
}
|
||||
|
||||
static inline const struct bblock_t *
|
||||
cfg_last_block_const(const struct cfg_t *cfg)
|
||||
{
|
||||
return (const struct bblock_t *)exec_list_get_tail_const(&cfg->block_list);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
inline bblock_t *
|
||||
cfg_t::first_block()
|
||||
{
|
||||
return cfg_first_block(this);
|
||||
}
|
||||
|
||||
const inline bblock_t *
|
||||
cfg_t::first_block() const
|
||||
{
|
||||
return cfg_first_block_const(this);
|
||||
}
|
||||
|
||||
inline bblock_t *
|
||||
cfg_t::last_block()
|
||||
{
|
||||
return cfg_last_block(this);
|
||||
}
|
||||
|
||||
const inline bblock_t *
|
||||
cfg_t::last_block() const
|
||||
{
|
||||
return cfg_last_block_const(this);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Note that this is implemented with a double for loop -- break will
|
||||
* break from the inner loop only!
|
||||
*/
|
||||
#define foreach_block_and_inst(__block, __type, __inst, __cfg) \
|
||||
foreach_block (__block, __cfg) \
|
||||
foreach_inst_in_block (__type, __inst, __block)
|
||||
|
||||
/* Note that this is implemented with a double for loop -- break will
|
||||
* break from the inner loop only!
|
||||
*/
|
||||
#define foreach_block_and_inst_safe(__block, __type, __inst, __cfg) \
|
||||
foreach_block_safe (__block, __cfg) \
|
||||
foreach_inst_in_block_safe (__type, __inst, __block)
|
||||
|
||||
#define foreach_block(__block, __cfg) \
|
||||
foreach_list_typed (bblock_t, __block, link, &(__cfg)->block_list)
|
||||
|
||||
#define foreach_block_reverse(__block, __cfg) \
|
||||
foreach_list_typed_reverse (bblock_t, __block, link, &(__cfg)->block_list)
|
||||
|
||||
#define foreach_block_safe(__block, __cfg) \
|
||||
foreach_list_typed_safe (bblock_t, __block, link, &(__cfg)->block_list)
|
||||
|
||||
#define foreach_block_reverse_safe(__block, __cfg) \
|
||||
foreach_list_typed_reverse_safe (bblock_t, __block, link, &(__cfg)->block_list)
|
||||
|
||||
#define foreach_inst_in_block(__type, __inst, __block) \
|
||||
foreach_in_list(__type, __inst, &(__block)->instructions)
|
||||
|
||||
#define foreach_inst_in_block_safe(__type, __inst, __block) \
|
||||
for (__type *__inst = (__type *)__block->instructions.head_sentinel.next, \
|
||||
*__next = (__type *)__inst->next; \
|
||||
__next != NULL; \
|
||||
__inst = __next, \
|
||||
__next = (__type *)__next->next)
|
||||
|
||||
#define foreach_inst_in_block_reverse(__type, __inst, __block) \
|
||||
foreach_in_list_reverse(__type, __inst, &(__block)->instructions)
|
||||
|
||||
#define foreach_inst_in_block_reverse_safe(__type, __inst, __block) \
|
||||
foreach_in_list_reverse_safe(__type, __inst, &(__block)->instructions)
|
||||
|
||||
#define foreach_inst_in_block_starting_from(__type, __scan_inst, __inst) \
|
||||
for (__type *__scan_inst = (__type *)__inst->next; \
|
||||
!__scan_inst->is_tail_sentinel(); \
|
||||
__scan_inst = (__type *)__scan_inst->next)
|
||||
|
||||
#define foreach_inst_in_block_reverse_starting_from(__type, __scan_inst, __inst) \
|
||||
for (__type *__scan_inst = (__type *)__inst->prev; \
|
||||
!__scan_inst->is_head_sentinel(); \
|
||||
__scan_inst = (__type *)__scan_inst->prev)
|
||||
|
||||
#ifdef __cplusplus
|
||||
inline void
|
||||
cfg_t::adjust_block_ips()
|
||||
{
|
||||
int delta = 0;
|
||||
|
||||
foreach_block(block, this) {
|
||||
block->start_ip += delta;
|
||||
block->end_ip += delta;
|
||||
|
||||
delta += block->end_ip_delta;
|
||||
|
||||
block->end_ip_delta = 0;
|
||||
}
|
||||
}
|
||||
|
||||
namespace brw {
|
||||
/**
|
||||
* Immediate dominator tree analysis of a shader.
|
||||
*/
|
||||
struct idom_tree {
|
||||
idom_tree(const backend_shader *s);
|
||||
~idom_tree();
|
||||
|
||||
bool
|
||||
validate(const backend_shader *) const
|
||||
{
|
||||
/* FINISHME */
|
||||
return true;
|
||||
}
|
||||
|
||||
analysis_dependency_class
|
||||
dependency_class() const
|
||||
{
|
||||
return DEPENDENCY_BLOCKS;
|
||||
}
|
||||
|
||||
const bblock_t *
|
||||
parent(const bblock_t *b) const
|
||||
{
|
||||
assert(unsigned(b->num) < num_parents);
|
||||
return parents[b->num];
|
||||
}
|
||||
|
||||
bblock_t *
|
||||
parent(bblock_t *b) const
|
||||
{
|
||||
assert(unsigned(b->num) < num_parents);
|
||||
return parents[b->num];
|
||||
}
|
||||
|
||||
bblock_t *
|
||||
intersect(bblock_t *b1, bblock_t *b2) const;
|
||||
|
||||
void
|
||||
dump() const;
|
||||
|
||||
private:
|
||||
unsigned num_parents;
|
||||
bblock_t **parents;
|
||||
};
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* BRW_CFG_H */
|
||||
163
src/intel/compiler/elk/brw_clip.h
Normal file
163
src/intel/compiler/elk/brw_clip.h
Normal file
|
|
@ -0,0 +1,163 @@
|
|||
/*
|
||||
Copyright (C) Intel Corp. 2006. All Rights Reserved.
|
||||
Intel funded Tungsten Graphics to
|
||||
develop this 3D driver.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice (including the
|
||||
next paragraph) shall be included in all copies or substantial
|
||||
portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
**********************************************************************/
|
||||
/*
|
||||
* Authors:
|
||||
* Keith Whitwell <keithw@vmware.com>
|
||||
*/
|
||||
|
||||
#ifndef BRW_CLIP_H
|
||||
#define BRW_CLIP_H
|
||||
|
||||
#include "brw_compiler.h"
|
||||
#include "brw_eu.h"
|
||||
|
||||
/* Initial 3 verts, plus at most 6 additional verts from intersections
|
||||
* with fixed planes, plus at most 8 additional verts from intersections
|
||||
* with user clip planes
|
||||
*/
|
||||
#define MAX_VERTS (3+6+8)
|
||||
|
||||
#define PRIM_MASK (0x1f)
|
||||
|
||||
struct brw_clip_compile {
|
||||
struct brw_codegen func;
|
||||
struct brw_clip_prog_key key;
|
||||
struct brw_clip_prog_data prog_data;
|
||||
|
||||
struct {
|
||||
struct brw_reg R0;
|
||||
struct brw_reg vertex[MAX_VERTS];
|
||||
|
||||
struct brw_reg t;
|
||||
struct brw_reg t0, t1;
|
||||
struct brw_reg dp0, dp1;
|
||||
|
||||
struct brw_reg dpPrev;
|
||||
struct brw_reg dp;
|
||||
struct brw_reg loopcount;
|
||||
struct brw_reg nr_verts;
|
||||
struct brw_reg planemask;
|
||||
|
||||
struct brw_reg inlist;
|
||||
struct brw_reg outlist;
|
||||
struct brw_reg freelist;
|
||||
|
||||
struct brw_reg dir;
|
||||
struct brw_reg tmp0, tmp1;
|
||||
struct brw_reg offset;
|
||||
|
||||
struct brw_reg fixed_planes;
|
||||
struct brw_reg plane_equation;
|
||||
|
||||
struct brw_reg ff_sync;
|
||||
|
||||
/* Bitmask indicating which coordinate attribute should be used for
|
||||
* comparison to each clipping plane. A 0 indicates that VARYING_SLOT_POS
|
||||
* should be used, because it's one of the fixed +/- x/y/z planes that
|
||||
* constitute the bounds of the view volume. A 1 indicates that
|
||||
* VARYING_SLOT_CLIP_VERTEX should be used (if available) since it's a user-
|
||||
* defined clipping plane.
|
||||
*/
|
||||
struct brw_reg vertex_src_mask;
|
||||
|
||||
/* Offset into the vertex of the current plane's clipdistance value */
|
||||
struct brw_reg clipdistance_offset;
|
||||
} reg;
|
||||
|
||||
/* Number of registers storing VUE data */
|
||||
GLuint nr_regs;
|
||||
|
||||
GLuint first_tmp;
|
||||
GLuint last_tmp;
|
||||
|
||||
bool need_direction;
|
||||
|
||||
struct intel_vue_map vue_map;
|
||||
};
|
||||
|
||||
/**
|
||||
* True if the given varying is one of the outputs of the vertex shader.
|
||||
*/
|
||||
static inline bool brw_clip_have_varying(struct brw_clip_compile *c,
|
||||
GLuint varying)
|
||||
{
|
||||
return (c->key.attrs & BITFIELD64_BIT(varying)) ? 1 : 0;
|
||||
}
|
||||
|
||||
/* Points are only culled, so no need for a clip routine, however it
 * works out easier to have a dummy one.
 */

/* Top-level entry points, one per primitive class.  Each one emits a
 * complete clip-thread program into c->func. */
void brw_emit_unfilled_clip( struct brw_clip_compile *c );
void brw_emit_tri_clip( struct brw_clip_compile *c );
void brw_emit_line_clip( struct brw_clip_compile *c );
void brw_emit_point_clip( struct brw_clip_compile *c );

/* brw_clip_tri.c, for use by the unfilled clip routine:
 */
void brw_clip_tri_init_vertices( struct brw_clip_compile *c );
void brw_clip_tri_flat_shade( struct brw_clip_compile *c );
void brw_clip_tri( struct brw_clip_compile *c );
void brw_clip_tri_emit_polygon( struct brw_clip_compile *c );
void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
                              GLuint nr_verts );


/* Utils:
 */

/* Emit code interpolating a new vertex at parameter t0 between the
 * vertices pointed at by v0_ptr (from) and v1_ptr (to), storing the
 * result through dest_ptr. */
void brw_clip_interp_vertex( struct brw_clip_compile *c,
                             struct brw_indirect dest_ptr,
                             struct brw_indirect v0_ptr, /* from */
                             struct brw_indirect v1_ptr, /* to */
                             struct brw_reg t0,
                             bool force_edgeflag );

void brw_clip_init_planes( struct brw_clip_compile *c );

/* Emit a URB write for the vertex pointed at by vert; flags/header
 * select allocate/EOT behavior and the primitive start/end bits. */
void brw_clip_emit_vue(struct brw_clip_compile *c,
                       struct brw_indirect vert,
                       enum brw_urb_write_flags flags,
                       GLuint header);

void brw_clip_kill_thread(struct brw_clip_compile *c);

/* Address/stride of the clip-plane array walked by the clip loops. */
struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c );
struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c );

/* Copy flat-shaded attributes from vertex index `from` to `to`. */
void brw_clip_copy_flatshaded_attributes( struct brw_clip_compile *c,
                                          GLuint to, GLuint from );

void brw_clip_init_clipmask( struct brw_clip_compile *c );

struct brw_reg get_tmp( struct brw_clip_compile *c );

void brw_clip_project_position(struct brw_clip_compile *c,
                               struct brw_reg pos );
/* ff_sync handling (register only allocated when devinfo->ver == 5). */
void brw_clip_ff_sync(struct brw_clip_compile *c);
void brw_clip_init_ff_sync(struct brw_clip_compile *c);

#endif
|
||||
303
src/intel/compiler/elk/brw_clip_line.c
Normal file
303
src/intel/compiler/elk/brw_clip_line.c
Normal file
|
|
@ -0,0 +1,303 @@
|
|||
/*
|
||||
Copyright (C) Intel Corp. 2006. All Rights Reserved.
|
||||
Intel funded Tungsten Graphics to
|
||||
develop this 3D driver.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice (including the
|
||||
next paragraph) shall be included in all copies or substantial
|
||||
portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
**********************************************************************/
|
||||
/*
|
||||
* Authors:
|
||||
* Keith Whitwell <keithw@vmware.com>
|
||||
*/
|
||||
|
||||
#include "brw_clip.h"
|
||||
#include "brw_prim.h"
|
||||
|
||||
/* Statically assign GRF registers for the line clipper.  `i` walks the
 * register file; each assignment below claims one (or more) GRFs in
 * order, and the final value of `i` becomes prog_data.total_grf.
 */
static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
{
   const struct intel_device_info *devinfo = c->func.devinfo;
   GLuint i = 0,j;

   /* Register usage is static, precompute here:
    */
   c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;

   if (c->key.nr_userclip) {
      /* Clip planes are pushed through the CURB: 6 fixed planes plus
       * the user planes, two vec4 planes per GRF (rounded up). */
      c->reg.fixed_planes = brw_vec4_grf(i, 0);
      i += (6 + c->key.nr_userclip + 1) / 2;

      c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2;
   }
   else
      c->prog_data.curb_read_length = 0;


   /* Payload vertices plus space for more generated vertices:
    */
   for (j = 0; j < 4; j++) {
      c->reg.vertex[j] = brw_vec4_grf(i, 0);
      i += c->nr_regs;
   }

   /* Scalar interpolation parameters and per-plane state packed into
    * one GRF. */
   c->reg.t = brw_vec1_grf(i, 0);
   c->reg.t0 = brw_vec1_grf(i, 1);
   c->reg.t1 = brw_vec1_grf(i, 2);
   c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD);
   c->reg.plane_equation = brw_vec4_grf(i, 4);
   i++;

   c->reg.dp0 = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */
   c->reg.dp1 = brw_vec1_grf(i, 4);
   i++;

   if (!c->key.nr_userclip) {
      /* No CURB planes: the fixed view-volume planes live in a GRF
       * instead. */
      c->reg.fixed_planes = brw_vec8_grf(i, 0);
      i++;
   }

   c->reg.vertex_src_mask = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
   c->reg.clipdistance_offset = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_W);
   i++;

   if (devinfo->ver == 5) {
      c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
      i++;
   }

   /* Everything past here is temporary space. */
   c->first_tmp = i;
   c->last_tmp = i;

   c->prog_data.urb_read_length = c->nr_regs; /* ? */
   c->prog_data.total_grf = i;
}
|
||||
|
||||
|
||||
/* Line clipping, more or less following the following algorithm:
 *
 *  for (p=0;p<MAX_PLANES;p++) {
 *     if (clipmask & (1 << p)) {
 *        GLfloat dp0 = DOTPROD( vtx0, plane[p] );
 *        GLfloat dp1 = DOTPROD( vtx1, plane[p] );
 *
 *        if (dp1 < 0.0f) {
 *           GLfloat t = dp1 / (dp1 - dp0);
 *           if (t > t1) t1 = t;
 *        } else {
 *           GLfloat t = dp0 / (dp0 - dp1);
 *           if (t > t0) t0 = t;
 *        }
 *
 *        if (t0 + t1 >= 1.0)
 *           return;
 *     }
 *  }
 *
 *  interp( ctx, newvtx0, vtx0, vtx1, t0 );
 *  interp( ctx, newvtx1, vtx1, vtx0, t1 );
 *
 */
/* Emits the clip-thread program realizing the algorithm above: walk the
 * clip planes, accumulate the clipped parameter interval [t0, 1-t1],
 * then either emit the two interpolated end vertices or kill the thread.
 */
static void clip_and_emit_line( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;
   /* Address registers pointing at the two input vertices, the two
    * output (interpolated) vertices, and the current clip plane. */
   struct brw_indirect vtx0 = brw_indirect(0, 0);
   struct brw_indirect vtx1 = brw_indirect(1, 0);
   struct brw_indirect newvtx0 = brw_indirect(2, 0);
   struct brw_indirect newvtx1 = brw_indirect(3, 0);
   struct brw_indirect plane_ptr = brw_indirect(4, 0);
   struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD);
   GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
   GLint clipdist0_offset = c->key.nr_userclip
      ? brw_varying_to_offset(&c->vue_map, VARYING_SLOT_CLIP_DIST0)
      : 0;

   brw_MOV(p, get_addr_reg(vtx0), brw_address(c->reg.vertex[0]));
   brw_MOV(p, get_addr_reg(vtx1), brw_address(c->reg.vertex[1]));
   brw_MOV(p, get_addr_reg(newvtx0), brw_address(c->reg.vertex[2]));
   brw_MOV(p, get_addr_reg(newvtx1), brw_address(c->reg.vertex[3]));
   brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c));

   /* Note: init t0, t1 together:
    */
   brw_MOV(p, vec2(c->reg.t0), brw_imm_f(0));

   brw_clip_init_planes(c);
   brw_clip_init_clipmask(c);

   /* -ve rhw workaround */
   if (p->devinfo->has_negative_rhw_bug) {
      /* If the payload flags bit 20 is set, force clipping against all
       * six view-volume planes by OR-ing them into planemask. */
      brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
              brw_imm_ud(1<<20));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
      brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f));
      brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
   }

   /* Set the initial vertex source mask: The first 6 planes are the bounds
    * of the view volume; the next 8 planes are the user clipping planes.
    */
   brw_MOV(p, c->reg.vertex_src_mask, brw_imm_ud(0x3fc0));

   /* Set the initial clipdistance offset to be 6 floats before gl_ClipDistance[0].
    * We'll increment 6 times before we start hitting actual user clipping. */
   brw_MOV(p, c->reg.clipdistance_offset, brw_imm_d(clipdist0_offset - 6*sizeof(float)));

   /* Loop over all candidate planes (planemask is shifted right once
    * per iteration; the loop ends when it reaches zero). */
   brw_DO(p, BRW_EXECUTE_1);
   {
      /* if (planemask & 1)
       */
      brw_AND(p, v1_null_ud, c->reg.planemask, brw_imm_ud(1));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);

      brw_IF(p, BRW_EXECUTE_1);
      {
         /* vertex_src_mask bit 0 selects where the clip distance comes
          * from: the vertex's gl_ClipDistance slot, or a dot product
          * against the plane equation. */
         brw_AND(p, v1_null_ud, c->reg.vertex_src_mask, brw_imm_ud(1));
         brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
         brw_IF(p, BRW_EXECUTE_1);
         {
            /* user clip distance: just fetch the correct float from each vertex */
            struct brw_indirect temp_ptr = brw_indirect(7, 0);
            brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx0), c->reg.clipdistance_offset);
            brw_MOV(p, c->reg.dp0, deref_1f(temp_ptr, 0));
            brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx1), c->reg.clipdistance_offset);
            brw_MOV(p, c->reg.dp1, deref_1f(temp_ptr, 0));
         }
         brw_ELSE(p);
         {
            /* fixed plane: fetch the hpos, dp4 against the plane. */
            if (c->key.nr_userclip)
               brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
            else
               brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));

            brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, hpos_offset), c->reg.plane_equation);
            brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, hpos_offset), c->reg.plane_equation);
         }
         brw_ENDIF(p);

         /* dp1 < 0: the second endpoint is outside this plane. */
         brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, vec1(c->reg.dp1), brw_imm_f(0.0f));

         brw_IF(p, BRW_EXECUTE_1);
         {
            /*
             * Both can be negative on GM965/G965 due to RHW workaround
             * if so, this object should be rejected.
             */
            if (p->devinfo->has_negative_rhw_bug) {
               brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0));
               brw_IF(p, BRW_EXECUTE_1);
               {
                  brw_clip_kill_thread(c);
               }
               brw_ENDIF(p);
            }

            /* t = dp1 / (dp1 - dp0); keep the max in t1. */
            brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0));
            brw_math_invert(p, c->reg.t, c->reg.t);
            brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1);

            brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 );
            brw_MOV(p, c->reg.t1, c->reg.t);
            brw_inst_set_pred_control(p->devinfo, brw_last_inst,
                                      BRW_PREDICATE_NORMAL);
         }
         brw_ELSE(p);
         {
            /* Coming back in. We know that both cannot be negative
             * because the line would have been culled in that case.
             */

            /* If both are positive, do nothing */
            /* Only on GM965/G965 */
            if (p->devinfo->has_negative_rhw_bug) {
               brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
               brw_IF(p, BRW_EXECUTE_1);
            }

            {
               /* t = dp0 / (dp0 - dp1); keep the max in t0. */
               brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1));
               brw_math_invert(p, c->reg.t, c->reg.t);
               brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0);

               brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 );
               brw_MOV(p, c->reg.t0, c->reg.t);
               brw_inst_set_pred_control(p->devinfo, brw_last_inst,
                                         BRW_PREDICATE_NORMAL);
            }

            if (p->devinfo->has_negative_rhw_bug) {
               brw_ENDIF(p);
            }
         }
         brw_ENDIF(p);
      }
      brw_ENDIF(p);

      /* plane_ptr++;
       */
      brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));

      /* while (planemask>>=1) != 0
       */
      brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
      /* Advance the per-plane source mask and clip-distance offset in
       * lockstep with planemask (predicated so they stop together). */
      brw_SHR(p, c->reg.vertex_src_mask, c->reg.vertex_src_mask, brw_imm_ud(1));
      brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
      brw_ADD(p, c->reg.clipdistance_offset, c->reg.clipdistance_offset, brw_imm_w(sizeof(float)));
      brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
   }
   brw_WHILE(p);
   brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);

   /* If t0 + t1 < 1 the clipped segment is non-empty: emit both
    * interpolated endpoints as a linestrip; otherwise fall through and
    * just terminate the thread. */
   brw_ADD(p, c->reg.t, c->reg.t0, c->reg.t1);
   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.t, brw_imm_f(1.0));
   brw_IF(p, BRW_EXECUTE_1);
   {
      brw_clip_interp_vertex(c, newvtx0, vtx0, vtx1, c->reg.t0, false);
      brw_clip_interp_vertex(c, newvtx1, vtx1, vtx0, c->reg.t1, false);

      brw_clip_emit_vue(c, newvtx0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
                        (_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
                        | URB_WRITE_PRIM_START);
      brw_clip_emit_vue(c, newvtx1, BRW_URB_WRITE_EOT_COMPLETE,
                        (_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
                        | URB_WRITE_PRIM_END);
   }
   brw_ENDIF(p);
   brw_clip_kill_thread(c);
}
|
||||
|
||||
|
||||
|
||||
void brw_emit_line_clip( struct brw_clip_compile *c )
|
||||
{
|
||||
brw_clip_line_alloc_regs(c);
|
||||
brw_clip_init_ff_sync(c);
|
||||
|
||||
if (c->key.contains_flat_varying) {
|
||||
if (c->key.pv_first)
|
||||
brw_clip_copy_flatshaded_attributes(c, 1, 0);
|
||||
else
|
||||
brw_clip_copy_flatshaded_attributes(c, 0, 1);
|
||||
}
|
||||
|
||||
clip_and_emit_line(c);
|
||||
}
|
||||
45
src/intel/compiler/elk/brw_clip_point.c
Normal file
45
src/intel/compiler/elk/brw_clip_point.c
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
/*
|
||||
Copyright (C) Intel Corp. 2006. All Rights Reserved.
|
||||
Intel funded Tungsten Graphics to
|
||||
develop this 3D driver.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice (including the
|
||||
next paragraph) shall be included in all copies or substantial
|
||||
portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
**********************************************************************/
|
||||
/*
|
||||
* Authors:
|
||||
* Keith Whitwell <keithw@vmware.com>
|
||||
*/
|
||||
|
||||
#include "brw_clip.h"
|
||||
|
||||
|
||||
/* Point clipping, nothing to do?
 */
/* Dummy clip program for points: points are only culled upstream, so
 * this thread just sets up the minimal register state and terminates.
 */
void brw_emit_point_clip( struct brw_clip_compile *c )
{
   /* Send an empty message to kill the thread:
    */
   brw_clip_tri_alloc_regs(c, 0);
   brw_clip_init_ff_sync(c);

   brw_clip_kill_thread(c);
}
|
||||
659
src/intel/compiler/elk/brw_clip_tri.c
Normal file
659
src/intel/compiler/elk/brw_clip_tri.c
Normal file
|
|
@ -0,0 +1,659 @@
|
|||
/*
|
||||
Copyright (C) Intel Corp. 2006. All Rights Reserved.
|
||||
Intel funded Tungsten Graphics to
|
||||
develop this 3D driver.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice (including the
|
||||
next paragraph) shall be included in all copies or substantial
|
||||
portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
**********************************************************************/
|
||||
/*
|
||||
* Authors:
|
||||
* Keith Whitwell <keithw@vmware.com>
|
||||
*/
|
||||
|
||||
#include "brw_clip.h"
|
||||
#include "brw_prim.h"
|
||||
|
||||
/* Release all temporaries handed out since allocation, by resetting the
 * temporary-register watermark back to first_tmp (pairs with get_tmp()).
 */
static void release_tmps( struct brw_clip_compile *c )
{
   c->last_tmp = c->first_tmp;
}
|
||||
|
||||
|
||||
/* Statically assign GRF registers for the triangle clipper.  `i` walks
 * the register file; the final value becomes prog_data.total_grf.
 * nr_verts is the number of payload vertices to reserve space for.
 */
void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
                              GLuint nr_verts )
{
   const struct intel_device_info *devinfo = c->func.devinfo;
   GLuint i = 0,j;

   /* Register usage is static, precompute here:
    */
   c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;

   if (c->key.nr_userclip) {
      /* Clip planes pushed through the CURB: 6 fixed planes plus the
       * user planes, two vec4 planes per GRF (rounded up). */
      c->reg.fixed_planes = brw_vec4_grf(i, 0);
      i += (6 + c->key.nr_userclip + 1) / 2;

      c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2;
   }
   else
      c->prog_data.curb_read_length = 0;


   /* Payload vertices plus space for more generated vertices:
    */
   for (j = 0; j < nr_verts; j++) {
      c->reg.vertex[j] = brw_vec4_grf(i, 0);
      i += c->nr_regs;
   }

   if (c->vue_map.num_slots % 2 && nr_verts > 0) {
      /* The VUE has an odd number of slots so the last register is only half
       * used. Fill the second half with zero.
       */
      for (j = 0; j < 3; j++) {
         GLuint delta = brw_vue_slot_to_offset(c->vue_map.num_slots);

         brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0));
      }
   }

   /* Scalar loop state and per-plane state packed into one GRF. */
   c->reg.t = brw_vec1_grf(i, 0);
   c->reg.loopcount = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_D);
   c->reg.nr_verts = retype(brw_vec1_grf(i, 2), BRW_REGISTER_TYPE_UD);
   c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD);
   c->reg.plane_equation = brw_vec4_grf(i, 4);
   i++;

   c->reg.dpPrev = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */
   c->reg.dp = brw_vec1_grf(i, 4);
   i++;

   /* Vertex-pointer lists used by the Sutherland-Hodgman loop. */
   c->reg.inlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
   i++;

   c->reg.outlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
   i++;

   c->reg.freelist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
   i++;

   if (!c->key.nr_userclip) {
      /* No CURB planes: fixed view-volume planes live in a GRF. */
      c->reg.fixed_planes = brw_vec8_grf(i, 0);
      i++;
   }

   if (c->key.do_unfilled) {
      /* Extra registers for the unfilled (line/point mode) path. */
      c->reg.dir = brw_vec4_grf(i, 0);
      c->reg.offset = brw_vec4_grf(i, 4);
      i++;
      c->reg.tmp0 = brw_vec4_grf(i, 0);
      c->reg.tmp1 = brw_vec4_grf(i, 4);
      i++;
   }

   c->reg.vertex_src_mask = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
   c->reg.clipdistance_offset = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_W);
   i++;

   if (devinfo->ver == 5) {
      c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
      i++;
   }

   /* Everything past here is temporary space. */
   c->first_tmp = i;
   c->last_tmp = i;

   c->prog_data.urb_read_length = c->nr_regs; /* ? */
   c->prog_data.total_grf = i;
}
|
||||
|
||||
|
||||
|
||||
/* Emit code initializing the inlist with pointers to the three payload
 * vertices (swapping the first two for reversed tristrip elements so
 * winding order stays consistent), zeroing the outlist, and setting
 * nr_verts to 3.
 */
void brw_clip_tri_init_vertices( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;
   struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */

   /* Initial list of indices for incoming vertices:
    */
   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
   brw_CMP(p,
           vec1(brw_null_reg()),
           BRW_CONDITIONAL_EQ,
           tmp0,
           brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));

   /* XXX: Is there an easier way to do this? Need to reverse every
    * second tristrip element: Can ignore sometimes?
    */
   brw_IF(p, BRW_EXECUTE_1);
   {
      /* Reversed element: swap vertices 0 and 1 in the list. */
      brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[1]) );
      brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[0]) );
      if (c->need_direction)
         brw_MOV(p, c->reg.dir, brw_imm_f(-1));
   }
   brw_ELSE(p);
   {
      brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[0]) );
      brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[1]) );
      if (c->need_direction)
         brw_MOV(p, c->reg.dir, brw_imm_f(1));
   }
   brw_ENDIF(p);

   brw_MOV(p, get_element(c->reg.inlist, 2), brw_address(c->reg.vertex[2]) );
   /* Clear the whole outlist GRF before use. */
   brw_MOV(p, brw_vec8_grf(c->reg.outlist.nr, 0), brw_imm_f(0));
   brw_MOV(p, c->reg.nr_verts, brw_imm_ud(3));
}
|
||||
|
||||
|
||||
|
||||
/* Emit code copying flat-shaded attributes from the provoking vertex to
 * the other two vertices, so interpolation after clipping stays flat.
 * The provoking vertex depends on the primitive type (polygon, trifan)
 * and the pv_first convention in the key.
 */
void brw_clip_tri_flat_shade( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;
   struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */

   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
   brw_CMP(p,
           vec1(brw_null_reg()),
           BRW_CONDITIONAL_EQ,
           tmp0,
           brw_imm_ud(_3DPRIM_POLYGON));

   brw_IF(p, BRW_EXECUTE_1);
   {
      /* Polygon: vertex 0 provokes regardless of convention. */
      brw_clip_copy_flatshaded_attributes(c, 1, 0);
      brw_clip_copy_flatshaded_attributes(c, 2, 0);
   }
   brw_ELSE(p);
   {
      if (c->key.pv_first) {
         /* First-vertex convention, but trifans still provoke from
          * vertex 1. */
         brw_CMP(p,
                 vec1(brw_null_reg()),
                 BRW_CONDITIONAL_EQ,
                 tmp0,
                 brw_imm_ud(_3DPRIM_TRIFAN));
         brw_IF(p, BRW_EXECUTE_1);
         {
            brw_clip_copy_flatshaded_attributes(c, 0, 1);
            brw_clip_copy_flatshaded_attributes(c, 2, 1);
         }
         brw_ELSE(p);
         {
            brw_clip_copy_flatshaded_attributes(c, 1, 0);
            brw_clip_copy_flatshaded_attributes(c, 2, 0);
         }
         brw_ENDIF(p);
      }
      else {
         /* Last-vertex convention: vertex 2 provokes. */
         brw_clip_copy_flatshaded_attributes(c, 0, 2);
         brw_clip_copy_flatshaded_attributes(c, 1, 2);
      }
   }
   brw_ENDIF(p);
}
|
||||
|
||||
|
||||
/**
 * Loads the clip distance for a vertex into `dst`, and ends with
 * a comparison of it to zero with the condition `cond`.
 *
 * - If using a fixed plane, the distance is dot(hpos, plane).
 * - If using a user clip plane, the distance is directly available in the vertex.
 *
 * The choice is driven at runtime by bit 0 of c->reg.vertex_src_mask,
 * which the clip loop shifts right once per plane.  The comparison
 * leaves its result in the flag register for a following brw_IF.
 */
static inline void
load_clip_distance(struct brw_clip_compile *c, struct brw_indirect vtx,
                   struct brw_reg dst, GLuint hpos_offset, int cond)
{
   struct brw_codegen *p = &c->func;

   dst = vec4(dst);
   brw_AND(p, vec1(brw_null_reg()), c->reg.vertex_src_mask, brw_imm_ud(1));
   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
   brw_IF(p, BRW_EXECUTE_1);
   {
      /* User plane: fetch the stored float at clipdistance_offset. */
      struct brw_indirect temp_ptr = brw_indirect(7, 0);
      brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx), c->reg.clipdistance_offset);
      brw_MOV(p, vec1(dst), deref_1f(temp_ptr, 0));
   }
   brw_ELSE(p);
   {
      /* Fixed plane: dot the homogeneous position with the plane. */
      brw_MOV(p, dst, deref_4f(vtx, hpos_offset));
      brw_DP4(p, dst, dst, c->reg.plane_equation);
   }
   brw_ENDIF(p);

   brw_CMP(p, brw_null_reg(), cond, vec1(dst), brw_imm_f(0.0f));
}
|
||||
|
||||
|
||||
/* Use mesa's clipping algorithms, translated to GFX4 assembly.
 */
/* Sutherland-Hodgman polygon clipping: for each active plane, walk the
 * inlist of vertex pointers, classify each edge against the plane,
 * interpolate new vertices on crossings, and build the outlist; then
 * swap the lists and move to the next plane.  New vertices are taken
 * from the freelist starting at vertex[3].
 */
void brw_clip_tri( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;
   /* Address registers: current/previous/output vertex, current plane,
    * and the three vertex-pointer lists. */
   struct brw_indirect vtx = brw_indirect(0, 0);
   struct brw_indirect vtxPrev = brw_indirect(1, 0);
   struct brw_indirect vtxOut = brw_indirect(2, 0);
   struct brw_indirect plane_ptr = brw_indirect(3, 0);
   struct brw_indirect inlist_ptr = brw_indirect(4, 0);
   struct brw_indirect outlist_ptr = brw_indirect(5, 0);
   struct brw_indirect freelist_ptr = brw_indirect(6, 0);
   GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
   GLint clipdist0_offset = c->key.nr_userclip
      ? brw_varying_to_offset(&c->vue_map, VARYING_SLOT_CLIP_DIST0)
      : 0;

   brw_MOV(p, get_addr_reg(vtxPrev), brw_address(c->reg.vertex[2]) );
   brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c));
   brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist));
   brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));

   brw_MOV(p, get_addr_reg(freelist_ptr), brw_address(c->reg.vertex[3]) );

   /* Set the initial vertex source mask: The first 6 planes are the bounds
    * of the view volume; the next 8 planes are the user clipping planes.
    */
   brw_MOV(p, c->reg.vertex_src_mask, brw_imm_ud(0x3fc0));

   /* Set the initial clipdistance offset to be 6 floats before gl_ClipDistance[0].
    * We'll increment 6 times before we start hitting actual user clipping. */
   brw_MOV(p, c->reg.clipdistance_offset, brw_imm_d(clipdist0_offset - 6*sizeof(float)));

   /* Outer loop: one iteration per clip plane. */
   brw_DO(p, BRW_EXECUTE_1);
   {
      /* if (planemask & 1)
       */
      brw_AND(p, vec1(brw_null_reg()), c->reg.planemask, brw_imm_ud(1));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);

      brw_IF(p, BRW_EXECUTE_1);
      {
         /* vtxOut = freelist_ptr++
          */
         brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(freelist_ptr) );
         brw_ADD(p, get_addr_reg(freelist_ptr), get_addr_reg(freelist_ptr), brw_imm_uw(c->nr_regs * REG_SIZE));

         if (c->key.nr_userclip)
            brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
         else
            brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));

         brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
         brw_MOV(p, c->reg.nr_verts, brw_imm_ud(0));

         /* Inner loop: one iteration per vertex on the inlist. */
         brw_DO(p, BRW_EXECUTE_1);
         {
            /* vtx = *input_ptr;
             */
            brw_MOV(p, get_addr_reg(vtx), deref_1uw(inlist_ptr, 0));

            load_clip_distance(c, vtxPrev, c->reg.dpPrev, hpos_offset, BRW_CONDITIONAL_L);
            /* (prev < 0.0f) */
            brw_IF(p, BRW_EXECUTE_1);
            {
               load_clip_distance(c, vtx, c->reg.dp, hpos_offset, BRW_CONDITIONAL_GE);
               /* IS_POSITIVE(next)
                */
               brw_IF(p, BRW_EXECUTE_1);
               {

                  /* Coming back in.
                   */
                  /* t = dpPrev / (dpPrev - dp) */
                  brw_ADD(p, c->reg.t, c->reg.dpPrev, negate(c->reg.dp));
                  brw_math_invert(p, c->reg.t, c->reg.t);
                  brw_MUL(p, c->reg.t, c->reg.t, c->reg.dpPrev);

                  /* If (vtxOut == 0) vtxOut = vtxPrev
                   */
                  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
                  brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtxPrev));
                  brw_inst_set_pred_control(p->devinfo, brw_last_inst,
                                            BRW_PREDICATE_NORMAL);

                  brw_clip_interp_vertex(c, vtxOut, vtxPrev, vtx, c->reg.t, false);

                  /* *outlist_ptr++ = vtxOut;
                   * nr_verts++;
                   * vtxOut = 0;
                   */
                  brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
                  brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
                  brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
                  brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
               }
               brw_ENDIF(p);

            }
            brw_ELSE(p);
            {
               /* *outlist_ptr++ = vtxPrev;
                * nr_verts++;
                */
               brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxPrev));
               brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
               brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));

               load_clip_distance(c, vtx, c->reg.dp, hpos_offset, BRW_CONDITIONAL_L);
               /* (next < 0.0f)
                */
               brw_IF(p, BRW_EXECUTE_1);
               {
                  /* Going out of bounds. Avoid division by zero as we
                   * know dp != dpPrev from DIFFERENT_SIGNS, above.
                   */
                  /* t = dp / (dp - dpPrev) */
                  brw_ADD(p, c->reg.t, c->reg.dp, negate(c->reg.dpPrev));
                  brw_math_invert(p, c->reg.t, c->reg.t);
                  brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp);

                  /* If (vtxOut == 0) vtxOut = vtx
                   */
                  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
                  brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtx));
                  brw_inst_set_pred_control(p->devinfo, brw_last_inst,
                                            BRW_PREDICATE_NORMAL);

                  brw_clip_interp_vertex(c, vtxOut, vtx, vtxPrev, c->reg.t, true);

                  /* *outlist_ptr++ = vtxOut;
                   * nr_verts++;
                   * vtxOut = 0;
                   */
                  brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
                  brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
                  brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
                  brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
               }
               brw_ENDIF(p);
            }
            brw_ENDIF(p);

            /* vtxPrev = vtx;
             * inlist_ptr++;
             */
            brw_MOV(p, get_addr_reg(vtxPrev), get_addr_reg(vtx));
            brw_ADD(p, get_addr_reg(inlist_ptr), get_addr_reg(inlist_ptr), brw_imm_uw(sizeof(short)));

            /* while (--loopcount != 0)
             */
            brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
            brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
         }
         brw_WHILE(p);
         brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);

         /* vtxPrev = *(outlist_ptr-1) OR: outlist[nr_verts-1]
          * inlist = outlist
          * inlist_ptr = &inlist[0]
          * outlist_ptr = &outlist[0]
          */
         brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_w(-2));
         brw_MOV(p, get_addr_reg(vtxPrev), deref_1uw(outlist_ptr, 0));
         brw_MOV(p, brw_vec8_grf(c->reg.inlist.nr, 0), brw_vec8_grf(c->reg.outlist.nr, 0));
         brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist));
         brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));
      }
      brw_ENDIF(p);

      /* plane_ptr++;
       */
      brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));

      /* nr_verts >= 3
       */
      brw_CMP(p,
              vec1(brw_null_reg()),
              BRW_CONDITIONAL_GE,
              c->reg.nr_verts,
              brw_imm_ud(3));
      brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL);

      /* && (planemask>>=1) != 0
       */
      brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
      /* Advance the per-plane source mask and clip-distance offset in
       * lockstep with planemask. */
      brw_SHR(p, c->reg.vertex_src_mask, c->reg.vertex_src_mask, brw_imm_ud(1));
      brw_ADD(p, c->reg.clipdistance_offset, c->reg.clipdistance_offset, brw_imm_w(sizeof(float)));
   }
   brw_WHILE(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
|
||||
|
||||
|
||||
|
||||
/* Emit the clipped polygon as a triangle fan of URB writes: walk the
 * inlist of vertex pointers and emit nr_verts VUEs, marking the first
 * with PRIM_START and the last with PRIM_END + EOT.  Emits nothing if
 * fewer than three vertices survived clipping.
 */
void brw_clip_tri_emit_polygon(struct brw_clip_compile *c)
{
   struct brw_codegen *p = &c->func;

   /* for (loopcount = nr_verts-2; loopcount > 0; loopcount--)
    */
   brw_ADD(p,
           c->reg.loopcount,
           c->reg.nr_verts,
           brw_imm_d(-2));
   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_G);

   brw_IF(p, BRW_EXECUTE_1);
   {
      struct brw_indirect v0 = brw_indirect(0, 0);
      struct brw_indirect vptr = brw_indirect(1, 0);

      brw_MOV(p, get_addr_reg(vptr), brw_address(c->reg.inlist));
      brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));

      /* First vertex: allocate a URB handle and mark PRIM_START. */
      brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
                        ((_3DPRIM_TRIFAN << URB_WRITE_PRIM_TYPE_SHIFT)
                         | URB_WRITE_PRIM_START));

      /* List entries are 2-byte vertex pointers. */
      brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2));
      brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));

      /* Middle vertices. */
      brw_DO(p, BRW_EXECUTE_1);
      {
         brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
                           (_3DPRIM_TRIFAN << URB_WRITE_PRIM_TYPE_SHIFT));

         brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2));
         brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));

         brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
         brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
      }
      brw_WHILE(p);
      brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);

      /* Last vertex: PRIM_END and end-of-thread. */
      brw_clip_emit_vue(c, v0, BRW_URB_WRITE_EOT_COMPLETE,
                        ((_3DPRIM_TRIFAN << URB_WRITE_PRIM_TYPE_SHIFT)
                         | URB_WRITE_PRIM_END));
   }
   brw_ENDIF(p);
}
|
||||
|
||||
/* Unconditionally clip the triangle: load the fixed plane constants,
 * then run the main per-plane clipping loop.
 */
static void do_clip_tri( struct brw_clip_compile *c )
{
   brw_clip_init_planes(c);

   brw_clip_tri(c);
}
|
||||
|
||||
|
||||
/* Clip only when at least one bit of c->reg.planemask is set; otherwise
 * the triangle is trivially accepted and no clipping code runs.
 */
static void maybe_do_clip_tri( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;

   /* if (planemask != 0) */
   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0));
   brw_IF(p, BRW_EXECUTE_1);
   {
      do_clip_tri(c);
   }
   brw_ENDIF(p);
}
|
||||
|
||||
/* Classify the triangle against the six fixed frustum planes:
 * kill the thread when all three vertices are outside any single plane,
 * and set the corresponding planemask bit for each plane that actually
 * cuts the triangle (some vertices in, some out).
 *
 * Bits 5/3/1 are set for the near-z/x-min/y-min planes, bits 4/2/0 for
 * far-z/x-max/y-max, matching the brw_imm_ud(1<<n) ORs below.
 */
static void brw_clip_test( struct brw_clip_compile *c )
{
   /* Per-vertex outcode temporaries (UD so CMP results can be masked). */
   struct brw_reg t = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
   struct brw_reg t1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
   struct brw_reg t2 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
   struct brw_reg t3 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);

   /* Copies of the three clip-space positions. */
   struct brw_reg v0 = get_tmp(c);
   struct brw_reg v1 = get_tmp(c);
   struct brw_reg v2 = get_tmp(c);

   struct brw_indirect vt0 = brw_indirect(0, 0);
   struct brw_indirect vt1 = brw_indirect(1, 0);
   struct brw_indirect vt2 = brw_indirect(2, 0);

   struct brw_codegen *p = &c->func;
   struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */

   GLuint hpos_offset = brw_varying_to_offset(&c->vue_map,
                                              VARYING_SLOT_POS);

   /* Fetch the clip-space (HPOS) coordinate of each vertex. */
   brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0]));
   brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1]));
   brw_MOV(p, get_addr_reg(vt2), brw_address(c->reg.vertex[2]));
   brw_MOV(p, v0, deref_4f(vt0, hpos_offset));
   brw_MOV(p, v1, deref_4f(vt1, hpos_offset));
   brw_MOV(p, v2, deref_4f(vt2, hpos_offset));
   /* Clear the six fixed-plane bits; they are re-derived below. */
   brw_AND(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(~0x3f));

   /* test nearz, xmin, ymin plane */
   /* clip.xyz < -clip.w — per-component outcodes into t1/t2/t3 */
   brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, negate(get_element(v0, 3)));
   brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, negate(get_element(v1, 3)));
   brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, negate(get_element(v2, 3)));

   /* All vertices are outside of a plane, rejected */
   brw_AND(p, t, t1, t2);
   brw_AND(p, t, t, t3);
   brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
   brw_OR(p, tmp0, tmp0, get_element(t, 2));
   brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
   brw_IF(p, BRW_EXECUTE_1);
   {
      brw_clip_kill_thread(c);
   }
   brw_ENDIF(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

   /* some vertices are inside a plane, some are outside, need to clip:
    * XOR of the outcodes is nonzero for any plane that cuts the triangle.
    */
   brw_XOR(p, t, t1, t2);
   brw_XOR(p, t1, t2, t3);
   brw_OR(p, t, t, t1);
   brw_AND(p, t, t, brw_imm_ud(0x1));
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
           get_element(t, 0), brw_imm_ud(0));
   brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5)));
   brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
           get_element(t, 1), brw_imm_ud(0));
   brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3)));
   brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
           get_element(t, 2), brw_imm_ud(0));
   brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1)));
   brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);

   /* test farz, xmax, ymax plane */
   /* clip.xyz > clip.w */
   brw_CMP(p, t1, BRW_CONDITIONAL_G, v0, get_element(v0, 3));
   brw_CMP(p, t2, BRW_CONDITIONAL_G, v1, get_element(v1, 3));
   brw_CMP(p, t3, BRW_CONDITIONAL_G, v2, get_element(v2, 3));

   /* All vertices are outside of a plane, rejected */
   brw_AND(p, t, t1, t2);
   brw_AND(p, t, t, t3);
   brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
   brw_OR(p, tmp0, tmp0, get_element(t, 2));
   brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
   brw_IF(p, BRW_EXECUTE_1);
   {
      brw_clip_kill_thread(c);
   }
   brw_ENDIF(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

   /* some vertices are inside a plane, some are outside, need to clip */
   brw_XOR(p, t, t1, t2);
   brw_XOR(p, t1, t2, t3);
   brw_OR(p, t, t, t1);
   brw_AND(p, t, t, brw_imm_ud(0x1));
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
           get_element(t, 0), brw_imm_ud(0));
   brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4)));
   brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
           get_element(t, 1), brw_imm_ud(0));
   brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2)));
   brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
           get_element(t, 2), brw_imm_ud(0));
   brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0)));
   brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);

   release_tmps(c);
}
|
||||
|
||||
|
||||
/* Top-level entry point for the triangle clip kernel: set up registers,
 * run the (possibly conditional) clipping, emit the resulting trifan,
 * then terminate the thread.
 */
void brw_emit_tri_clip( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;
   brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
   brw_clip_tri_init_vertices(c);
   brw_clip_init_clipmask(c);
   brw_clip_init_ff_sync(c);

   /* if -ve rhw workaround bit is set,
      do cliptest */
   if (p->devinfo->has_negative_rhw_bug) {
      /* Bit 20 of R0.2 flags the workaround; only then run the full
       * software frustum test in brw_clip_test().
       */
      brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
              brw_imm_ud(1<<20));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
      brw_IF(p, BRW_EXECUTE_1);
      {
         brw_clip_test(c);
      }
      brw_ENDIF(p);
   }
   /* Can't push into do_clip_tri because with polygon (or quad)
    * flatshading, need to apply the flatshade here because we don't
    * respect the PV when converting to trifan for emit:
    */
   if (c->key.contains_flat_varying)
      brw_clip_tri_flat_shade(c);

   if ((c->key.clip_mode == BRW_CLIP_MODE_NORMAL) ||
       (c->key.clip_mode == BRW_CLIP_MODE_KERNEL_CLIP))
      do_clip_tri(c);
   else
      maybe_do_clip_tri(c);

   brw_clip_tri_emit_polygon(c);

   /* Send an empty message to kill the thread:
    */
   brw_clip_kill_thread(c);
}
|
||||
528
src/intel/compiler/elk/brw_clip_unfilled.c
Normal file
528
src/intel/compiler/elk/brw_clip_unfilled.c
Normal file
|
|
@ -0,0 +1,528 @@
|
|||
/*
|
||||
Copyright (C) Intel Corp. 2006. All Rights Reserved.
|
||||
Intel funded Tungsten Graphics to
|
||||
develop this 3D driver.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice (including the
|
||||
next paragraph) shall be included in all copies or substantial
|
||||
portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
**********************************************************************/
|
||||
/*
|
||||
* Authors:
|
||||
* Keith Whitwell <keithw@vmware.com>
|
||||
*/
|
||||
|
||||
#include "brw_clip.h"
|
||||
#include "brw_prim.h"
|
||||
|
||||
|
||||
/* This is performed against the original triangles, so no indirection
|
||||
* required:
|
||||
BZZZT!
|
||||
*/
|
||||
/* Compute the triangle's facing direction into c->reg.dir.
 *
 * This is performed against the original triangles, so no indirection
 * required.  The sign of dir.z after this distinguishes CW from CCW
 * winding in screen space.
 */
static void compute_tri_direction( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;
   struct brw_reg e = c->reg.tmp0;
   struct brw_reg f = c->reg.tmp1;
   GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
   struct brw_reg v0 = byte_offset(c->reg.vertex[0], hpos_offset);
   struct brw_reg v1 = byte_offset(c->reg.vertex[1], hpos_offset);
   struct brw_reg v2 = byte_offset(c->reg.vertex[2], hpos_offset);


   struct brw_reg v0n = get_tmp(c);
   struct brw_reg v1n = get_tmp(c);
   struct brw_reg v2n = get_tmp(c);

   /* Convert to NDC.
    * NOTE: We can't modify the original vertex coordinates,
    * as it may impact further operations.
    * So, we have to keep normalized coordinates in temp registers.
    *
    * TBD-KC
    * Try to optimize unnecessary MOV's.
    */
   brw_MOV(p, v0n, v0);
   brw_MOV(p, v1n, v1);
   brw_MOV(p, v2n, v2);

   brw_clip_project_position(c, v0n);
   brw_clip_project_position(c, v1n);
   brw_clip_project_position(c, v2n);

   /* Calculate the vectors of two edges of the triangle:
    */
   brw_ADD(p, e, v0n, negate(v2n));
   brw_ADD(p, f, v1n, negate(v2n));

   /* Take their crossproduct (e = e x f), using ALIGN_16 swizzles:
    * MUL computes the first product terms, MAC accumulates the negated
    * second terms.
    */
   brw_set_default_access_mode(p, BRW_ALIGN_16);
   brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, BRW_SWIZZLE_YZXW),
           brw_swizzle(f, BRW_SWIZZLE_ZXYW));
   brw_MAC(p, vec4(e), negate(brw_swizzle(e, BRW_SWIZZLE_ZXYW)),
           brw_swizzle(f, BRW_SWIZZLE_YZXW));
   brw_set_default_access_mode(p, BRW_ALIGN_1);

   brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e));
}
|
||||
|
||||
|
||||
static void cull_direction( struct brw_clip_compile *c )
|
||||
{
|
||||
struct brw_codegen *p = &c->func;
|
||||
GLuint conditional;
|
||||
|
||||
assert (!(c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL &&
|
||||
c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL));
|
||||
|
||||
if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL)
|
||||
conditional = BRW_CONDITIONAL_GE;
|
||||
else
|
||||
conditional = BRW_CONDITIONAL_L;
|
||||
|
||||
brw_CMP(p,
|
||||
vec1(brw_null_reg()),
|
||||
conditional,
|
||||
get_element(c->reg.dir, 2),
|
||||
brw_imm_f(0));
|
||||
|
||||
brw_IF(p, BRW_EXECUTE_1);
|
||||
{
|
||||
brw_clip_kill_thread(c);
|
||||
}
|
||||
brw_ENDIF(p);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* When the triangle faces backwards, overwrite the front colors
 * (COL0/COL1) of all three vertices with the corresponding back-face
 * colors (BFC0/BFC1).
 */
static void copy_bfc( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;
   GLuint conditional;

   /* Do we have any colors to copy?
    */
   if (!(brw_clip_have_varying(c, VARYING_SLOT_COL0) &&
         brw_clip_have_varying(c, VARYING_SLOT_BFC0)) &&
       !(brw_clip_have_varying(c, VARYING_SLOT_COL1) &&
         brw_clip_have_varying(c, VARYING_SLOT_BFC1)))
      return;

   /* In some weird degenerate cases we can end up testing the
    * direction twice, once for culling and once for bfc copying.  Oh
    * well, that's what you get for setting weird GL state.
    */
   if (c->key.copy_bfc_ccw)
      conditional = BRW_CONDITIONAL_GE;
   else
      conditional = BRW_CONDITIONAL_L;

   /* Test the facing direction computed by compute_tri_direction(). */
   brw_CMP(p,
           vec1(brw_null_reg()),
           conditional,
           get_element(c->reg.dir, 2),
           brw_imm_f(0));

   brw_IF(p, BRW_EXECUTE_1);
   {
      GLuint i;

      /* Copy BFC over COL in each of the three vertices. */
      for (i = 0; i < 3; i++) {
         if (brw_clip_have_varying(c, VARYING_SLOT_COL0) &&
             brw_clip_have_varying(c, VARYING_SLOT_BFC0))
            brw_MOV(p,
                    byte_offset(c->reg.vertex[i],
                                brw_varying_to_offset(&c->vue_map,
                                                      VARYING_SLOT_COL0)),
                    byte_offset(c->reg.vertex[i],
                                brw_varying_to_offset(&c->vue_map,
                                                      VARYING_SLOT_BFC0)));

         if (brw_clip_have_varying(c, VARYING_SLOT_COL1) &&
             brw_clip_have_varying(c, VARYING_SLOT_BFC1))
            brw_MOV(p,
                    byte_offset(c->reg.vertex[i],
                                brw_varying_to_offset(&c->vue_map,
                                                      VARYING_SLOT_COL1)),
                    byte_offset(c->reg.vertex[i],
                                brw_varying_to_offset(&c->vue_map,
                                                      VARYING_SLOT_BFC1)));
      }
   }
   brw_ENDIF(p);
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
GLfloat iz = 1.0 / dir.z;
|
||||
GLfloat ac = dir.x * iz;
|
||||
GLfloat bc = dir.y * iz;
|
||||
offset = ctx->Polygon.OffsetUnits * DEPTH_SCALE;
|
||||
offset += MAX2( abs(ac), abs(bc) ) * ctx->Polygon.OffsetFactor;
|
||||
if (ctx->Polygon.OffsetClamp && isfinite(ctx->Polygon.OffsetClamp)) {
|
||||
if (ctx->Polygon.OffsetClamp < 0)
|
||||
offset = MAX2( offset, ctx->Polygon.OffsetClamp );
|
||||
else
|
||||
offset = MIN2( offset, ctx->Polygon.OffsetClamp );
|
||||
}
|
||||
offset *= MRD;
|
||||
*/
|
||||
/* Compute the polygon-offset depth bias into c->reg.offset.
 * Mirrors the C expression:
 *
 *   GLfloat iz = 1.0 / dir.z;
 *   GLfloat ac = dir.x * iz;
 *   GLfloat bc = dir.y * iz;
 *   offset = ctx->Polygon.OffsetUnits * DEPTH_SCALE;
 *   offset += MAX2( abs(ac), abs(bc) ) * ctx->Polygon.OffsetFactor;
 *   if (ctx->Polygon.OffsetClamp && isfinite(ctx->Polygon.OffsetClamp)) {
 *      if (ctx->Polygon.OffsetClamp < 0)
 *         offset = MAX2( offset, ctx->Polygon.OffsetClamp );
 *      else
 *         offset = MIN2( offset, ctx->Polygon.OffsetClamp );
 *   }
 *   offset *= MRD;
 */
static void compute_offset( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;
   struct brw_reg off = c->reg.offset;
   struct brw_reg dir = c->reg.dir;

   /* off.z = 1/dir.z;  off.xy = dir.xy * off.z  (the depth slopes) */
   brw_math_invert(p, get_element(off, 2), get_element(dir, 2));
   brw_MUL(p, vec2(off), vec2(dir), get_element(off, 2));

   /* off.x = MAX2(|off.x|, |off.y|) via CMP + predicated SEL */
   brw_CMP(p,
           vec1(brw_null_reg()),
           BRW_CONDITIONAL_GE,
           brw_abs(get_element(off, 0)),
           brw_abs(get_element(off, 1)));

   brw_SEL(p, vec1(off),
           brw_abs(get_element(off, 0)), brw_abs(get_element(off, 1)));
   brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);

   brw_MUL(p, vec1(off), vec1(off), brw_imm_f(c->key.offset_factor));
   brw_ADD(p, vec1(off), vec1(off), brw_imm_f(c->key.offset_units));
   /* Clamp is resolved at compile time: the CMP condition is chosen
    * from the sign of offset_clamp, then SEL picks offset vs. clamp.
    */
   if (c->key.offset_clamp && isfinite(c->key.offset_clamp)) {
      brw_CMP(p,
              vec1(brw_null_reg()),
              c->key.offset_clamp < 0 ? BRW_CONDITIONAL_GE : BRW_CONDITIONAL_L,
              vec1(off),
              brw_imm_f(c->key.offset_clamp));
      brw_SEL(p, vec1(off), vec1(off), brw_imm_f(c->key.offset_clamp));
   }
}
|
||||
|
||||
|
||||
/* For _3DPRIM_POLYGON input, clear the edge flags of vertex 0 and
 * vertex 2 when the corresponding R0.2 bits (8 and 9) indicate this
 * triangle is an interior piece of the decomposed polygon.
 */
static void merge_edgeflags( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;
   struct brw_reg tmp0 = get_element_ud(c->reg.tmp0, 0);

   /* Extract the primitive type from R0.2 and compare against POLYGON. */
   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
   brw_CMP(p,
           vec1(brw_null_reg()),
           BRW_CONDITIONAL_EQ,
           tmp0,
           brw_imm_ud(_3DPRIM_POLYGON));

   /* Get away with using reg.vertex because we know that this is not
    * a _3DPRIM_TRISTRIP_REVERSE:
    */
   brw_IF(p, BRW_EXECUTE_1);
   {
      /* R0.2 bit 8 clear => vertex 0's edge is interior: zero its flag. */
      brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<8));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_EQ);
      brw_MOV(p, byte_offset(c->reg.vertex[0],
                             brw_varying_to_offset(&c->vue_map,
                                                   VARYING_SLOT_EDGE)),
              brw_imm_f(0));
      brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);

      /* R0.2 bit 9 clear => vertex 2's edge is interior: zero its flag. */
      brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<9));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_EQ);
      brw_MOV(p, byte_offset(c->reg.vertex[2],
                             brw_varying_to_offset(&c->vue_map,
                                                   VARYING_SLOT_EDGE)),
              brw_imm_f(0));
      brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
   }
   brw_ENDIF(p);
}
|
||||
|
||||
|
||||
|
||||
/* Add the precomputed polygon offset (c->reg.offset, see
 * compute_offset()) to the NDC z coordinate of one vertex.
 */
static void apply_one_offset( struct brw_clip_compile *c,
                              struct brw_indirect vert )
{
   struct brw_codegen *p = &c->func;
   GLuint ndc_offset = brw_varying_to_offset(&c->vue_map,
                                             BRW_VARYING_SLOT_NDC);
   /* z lives at float component 2 of the NDC slot. */
   struct brw_reg z = deref_1f(vert, ndc_offset +
                               2 * type_sz(BRW_REGISTER_TYPE_F));

   brw_ADD(p, z, z, vec1(c->reg.offset));
}
|
||||
|
||||
|
||||
|
||||
/***********************************************************************
|
||||
* Output clipped polygon as an unfilled primitive:
|
||||
*/
|
||||
/***********************************************************************
 * Output clipped polygon as an unfilled primitive:
 *
 * Walks the inlist vertex ring and emits one two-vertex LINESTRIP per
 * edge whose leading vertex has a nonzero edge flag.  When do_offset is
 * set, a separate first pass applies the polygon offset to every vertex.
 */
static void emit_lines(struct brw_clip_compile *c,
                       bool do_offset)
{
   struct brw_codegen *p = &c->func;
   struct brw_indirect v0 = brw_indirect(0, 0);
   struct brw_indirect v1 = brw_indirect(1, 0);
   struct brw_indirect v0ptr = brw_indirect(2, 0);
   struct brw_indirect v1ptr = brw_indirect(3, 0);

   /* Need a separate loop for offset:
    */
   if (do_offset) {
      brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
      brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));

      brw_DO(p, BRW_EXECUTE_1);
      {
         brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
         brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));

         apply_one_offset(c, v0);

         brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
         brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_G);
      }
      brw_WHILE(p);
      brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
   }

   /* v1ptr = &inlist[nr_verts]
    * *v1ptr = v0
    *
    * (nr_verts is added twice because each inlist entry is 2 bytes;
    * this wraps the ring by duplicating the first vertex at the end.)
    */
   brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
   brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
   brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v0ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW));
   brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v1ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW));
   brw_MOV(p, deref_1uw(v1ptr, 0), deref_1uw(v0ptr, 0));

   brw_DO(p, BRW_EXECUTE_1);
   {
      /* v0 = current vertex, v1 = its successor in the list. */
      brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
      brw_MOV(p, get_addr_reg(v1), deref_1uw(v0ptr, 2));
      brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));

      /* draw edge if edgeflag != 0 */
      brw_CMP(p,
              vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,
              deref_1f(v0, brw_varying_to_offset(&c->vue_map,
                                                 VARYING_SLOT_EDGE)),
              brw_imm_f(0));
      brw_IF(p, BRW_EXECUTE_1);
      {
         brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
                           (_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
                           | URB_WRITE_PRIM_START);
         brw_clip_emit_vue(c, v1, BRW_URB_WRITE_ALLOCATE_COMPLETE,
                           (_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
                           | URB_WRITE_PRIM_END);
      }
      brw_ENDIF(p);

      brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
   }
   brw_WHILE(p);
   brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
}
|
||||
|
||||
|
||||
|
||||
/* Emit one POINTLIST primitive per inlist vertex whose edge flag is
 * nonzero.  Unlike emit_lines(), the optional polygon offset is applied
 * inline inside the edge-flag test (only drawn vertices are offset).
 */
static void emit_points(struct brw_clip_compile *c,
                        bool do_offset )
{
   struct brw_codegen *p = &c->func;

   struct brw_indirect v0 = brw_indirect(0, 0);
   struct brw_indirect v0ptr = brw_indirect(2, 0);

   brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
   brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));

   brw_DO(p, BRW_EXECUTE_1);
   {
      brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
      brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));

      /* draw if edgeflag != 0
       */
      brw_CMP(p,
              vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,
              deref_1f(v0, brw_varying_to_offset(&c->vue_map,
                                                 VARYING_SLOT_EDGE)),
              brw_imm_f(0));
      brw_IF(p, BRW_EXECUTE_1);
      {
         if (do_offset)
            apply_one_offset(c, v0);

         /* Each point is a complete primitive: START and END together. */
         brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
                           (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT)
                           | URB_WRITE_PRIM_START | URB_WRITE_PRIM_END);
      }
      brw_ENDIF(p);

      brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
   }
   brw_WHILE(p);
   brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
static void emit_primitives( struct brw_clip_compile *c,
|
||||
GLuint mode,
|
||||
bool do_offset )
|
||||
{
|
||||
switch (mode) {
|
||||
case BRW_CLIP_FILL_MODE_FILL:
|
||||
brw_clip_tri_emit_polygon(c);
|
||||
break;
|
||||
|
||||
case BRW_CLIP_FILL_MODE_LINE:
|
||||
emit_lines(c, do_offset);
|
||||
break;
|
||||
|
||||
case BRW_CLIP_FILL_MODE_POINT:
|
||||
emit_points(c, do_offset);
|
||||
break;
|
||||
|
||||
case BRW_CLIP_FILL_MODE_CULL:
|
||||
unreachable("not reached");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Emit the clipped polygon with the fill mode matching its facing.
 * When front and back use different (non-CULL) modes, branch on the
 * runtime sign of dir.z; otherwise the single surviving mode is known
 * at compile time.
 */
static void emit_unfilled_primitives( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;

   /* Direction culling has already been done.
    */
   if (c->key.fill_ccw != c->key.fill_cw &&
       c->key.fill_ccw != BRW_CLIP_FILL_MODE_CULL &&
       c->key.fill_cw != BRW_CLIP_FILL_MODE_CULL)
   {
      /* dir.z >= 0 selects the CCW path, else the CW path. */
      brw_CMP(p,
              vec1(brw_null_reg()),
              BRW_CONDITIONAL_GE,
              get_element(c->reg.dir, 2),
              brw_imm_f(0));

      brw_IF(p, BRW_EXECUTE_1);
      {
         emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
      }
      brw_ELSE(p);
      {
         emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
      }
      brw_ENDIF(p);
   }
   else if (c->key.fill_cw != BRW_CLIP_FILL_MODE_CULL) {
      emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
   }
   else if (c->key.fill_ccw != BRW_CLIP_FILL_MODE_CULL) {
      emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
   }
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* Kill the thread if clipping left fewer than 3 vertices — there is no
 * polygon left to emit.
 */
static void check_nr_verts( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;

   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.nr_verts, brw_imm_d(3));
   brw_IF(p, BRW_EXECUTE_1);
   {
      brw_clip_kill_thread(c);
   }
   brw_ENDIF(p);
}
|
||||
|
||||
|
||||
/* Top-level entry point for the unfilled-polygon clip kernel:
 * cull/offset/backface-color handling based on the triangle direction,
 * optional clipping when the clipmask says so, then emission of the
 * result as lines or points, and finally thread termination.
 */
void brw_emit_unfilled_clip( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;

   /* The facing direction is needed by any of: polygon offset,
    * differing front/back fill modes, culling, or back-face colors.
    */
   c->need_direction = ((c->key.offset_ccw || c->key.offset_cw) ||
                        (c->key.fill_ccw != c->key.fill_cw) ||
                        c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL ||
                        c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL ||
                        c->key.copy_bfc_cw ||
                        c->key.copy_bfc_ccw);

   brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
   brw_clip_tri_init_vertices(c);
   brw_clip_init_ff_sync(c);

   assert(brw_clip_have_varying(c, VARYING_SLOT_EDGE));

   /* Both windings culled: nothing can ever be drawn. */
   if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL &&
       c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL) {
      brw_clip_kill_thread(c);
      return;
   }

   merge_edgeflags(c);

   /* Need to use the inlist indirection here:
    */
   if (c->need_direction)
      compute_tri_direction(c);

   if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL ||
       c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL)
      cull_direction(c);

   if (c->key.offset_ccw ||
       c->key.offset_cw)
      compute_offset(c);

   if (c->key.copy_bfc_ccw ||
       c->key.copy_bfc_cw)
      copy_bfc(c);

   /* Need to do this whether we clip or not:
    */
   if (c->key.contains_flat_varying)
      brw_clip_tri_flat_shade(c);

   /* Clip only when the clipmask has bits set; verify there is still a
    * polygon afterwards.
    */
   brw_clip_init_clipmask(c);
   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0));
   brw_IF(p, BRW_EXECUTE_1);
   {
      brw_clip_init_planes(c);
      brw_clip_tri(c);
      check_nr_verts(c);
   }
   brw_ENDIF(p);

   emit_unfilled_primitives(c);
   brw_clip_kill_thread(c);
}
|
||||
464
src/intel/compiler/elk/brw_clip_util.c
Normal file
464
src/intel/compiler/elk/brw_clip_util.c
Normal file
|
|
@ -0,0 +1,464 @@
|
|||
/*
|
||||
Copyright (C) Intel Corp. 2006. All Rights Reserved.
|
||||
Intel funded Tungsten Graphics to
|
||||
develop this 3D driver.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice (including the
|
||||
next paragraph) shall be included in all copies or substantial
|
||||
portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
**********************************************************************/
|
||||
/*
|
||||
* Authors:
|
||||
* Keith Whitwell <keithw@vmware.com>
|
||||
*/
|
||||
|
||||
#include "brw_clip.h"
|
||||
|
||||
|
||||
struct brw_reg get_tmp( struct brw_clip_compile *c )
|
||||
{
|
||||
struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0);
|
||||
|
||||
if (++c->last_tmp > c->prog_data.total_grf)
|
||||
c->prog_data.total_grf = c->last_tmp;
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
static void release_tmp( struct brw_clip_compile *c, struct brw_reg tmp )
|
||||
{
|
||||
if (tmp.nr == c->last_tmp-1)
|
||||
c->last_tmp--;
|
||||
}
|
||||
|
||||
|
||||
/* Pack four byte-sized plane coefficients into a single UD immediate,
 * x in the low byte through w in the high byte.
 */
static struct brw_reg make_plane_ud(GLuint x, GLuint y, GLuint z, GLuint w)
{
   GLuint packed = x;

   packed |= y << 8;
   packed |= z << 16;
   packed |= w << 24;

   return brw_imm_ud(packed);
}
|
||||
|
||||
|
||||
/* Load the six fixed frustum plane equations (packed as byte-coefficient
 * UD immediates, see make_plane_ud()) into c->reg.fixed_planes.
 * With user clip planes active the planes come from the push constants
 * instead, so nothing is emitted here.
 */
void brw_clip_init_planes( struct brw_clip_compile *c )
{
   struct brw_codegen *p = &c->func;

   if (!c->key.nr_userclip) {
      /* 0xff encodes a -1 byte coefficient; order is -y, +y, -x, +x
       * pairs followed by the z planes, matching the planemask bits.
       */
      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 0), make_plane_ud( 0, 0, 0xff, 1));
      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 1), make_plane_ud( 0, 0, 1, 1));
      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 2), make_plane_ud( 0, 0xff, 0, 1));
      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 3), make_plane_ud( 0, 1, 0, 1));
      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 4), make_plane_ud(0xff, 0, 0, 1));
      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 5), make_plane_ud( 1, 0, 0, 1));
   }
}
|
||||
|
||||
|
||||
|
||||
#define W 3
|
||||
|
||||
/* Project 'pos' to screen space (or back again), overwrite with results:
|
||||
*/
|
||||
/* Project 'pos' to screen space (or back again), overwrite with results:
 * pos.w is replaced by 1/pos.w, then xyz are scaled by it.
 */
void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos )
{
   struct brw_codegen *p = &c->func;

   /* calc rhw
    */
   brw_math_invert(p, get_element(pos, W), get_element(pos, W));

   /* value.xyz *= value.rhw
    */
   brw_set_default_access_mode(p, BRW_ALIGN_16);
   brw_MUL(p, brw_writemask(pos, WRITEMASK_XYZ), pos,
           brw_swizzle(pos, BRW_SWIZZLE_WWWW));
   brw_set_default_access_mode(p, BRW_ALIGN_1);
}
|
||||
|
||||
|
||||
/* Recompute the NDC slot of the vertex at vert_addr by projecting a
 * fresh copy of its clip-space (HPOS) position.
 */
static void brw_clip_project_vertex( struct brw_clip_compile *c,
                                     struct brw_indirect vert_addr )
{
   struct brw_codegen *p = &c->func;
   struct brw_reg tmp = get_tmp(c);
   GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
   GLuint ndc_offset = brw_varying_to_offset(&c->vue_map,
                                             BRW_VARYING_SLOT_NDC);

   /* Fixup position.  Extract from the original vertex and re-project
    * to screen space:
    */
   brw_MOV(p, tmp, deref_4f(vert_addr, hpos_offset));
   brw_clip_project_position(c, tmp);
   brw_MOV(p, deref_4f(vert_addr, ndc_offset), tmp);

   release_tmp(c, tmp);
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* Interpolate between two vertices and put the result into a0.0.
|
||||
* Increment a0.0 accordingly.
|
||||
*
|
||||
* Beware that dest_ptr can be equal to v0_ptr!
|
||||
*/
|
||||
void brw_clip_interp_vertex( struct brw_clip_compile *c,
|
||||
struct brw_indirect dest_ptr,
|
||||
struct brw_indirect v0_ptr, /* from */
|
||||
struct brw_indirect v1_ptr, /* to */
|
||||
struct brw_reg t0,
|
||||
bool force_edgeflag)
|
||||
{
|
||||
struct brw_codegen *p = &c->func;
|
||||
struct brw_reg t_nopersp, v0_ndc_copy;
|
||||
GLuint slot;
|
||||
|
||||
/* Just copy the vertex header:
|
||||
*/
|
||||
/*
|
||||
* After CLIP stage, only first 256 bits of the VUE are read
|
||||
* back on Ironlake, so needn't change it
|
||||
*/
|
||||
brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1);
|
||||
|
||||
|
||||
/* First handle the 3D and NDC interpolation, in case we
|
||||
* need noperspective interpolation. Doing it early has no
|
||||
* performance impact in any case.
|
||||
*/
|
||||
|
||||
/* Take a copy of the v0 NDC coordinates, in case dest == v0. */
|
||||
if (c->key.contains_noperspective_varying) {
|
||||
GLuint offset = brw_varying_to_offset(&c->vue_map,
|
||||
BRW_VARYING_SLOT_NDC);
|
||||
v0_ndc_copy = get_tmp(c);
|
||||
brw_MOV(p, v0_ndc_copy, deref_4f(v0_ptr, offset));
|
||||
}
|
||||
|
||||
/* Compute the new 3D position
|
||||
*
|
||||
* dest_hpos = v0_hpos * (1 - t0) + v1_hpos * t0
|
||||
*/
|
||||
{
|
||||
GLuint delta = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
|
||||
struct brw_reg tmp = get_tmp(c);
|
||||
brw_MUL(p, vec4(brw_null_reg()), deref_4f(v1_ptr, delta), t0);
|
||||
brw_MAC(p, tmp, negate(deref_4f(v0_ptr, delta)), t0);
|
||||
brw_ADD(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta), tmp);
|
||||
release_tmp(c, tmp);
|
||||
}
|
||||
|
||||
/* Recreate the projected (NDC) coordinate in the new vertex header */
|
||||
brw_clip_project_vertex(c, dest_ptr);
|
||||
|
||||
/* If we have noperspective attributes,
|
||||
* we need to compute the screen-space t
|
||||
*/
|
||||
if (c->key.contains_noperspective_varying) {
|
||||
GLuint delta = brw_varying_to_offset(&c->vue_map,
|
||||
BRW_VARYING_SLOT_NDC);
|
||||
struct brw_reg tmp = get_tmp(c);
|
||||
t_nopersp = get_tmp(c);
|
||||
|
||||
/* t_nopersp = vec4(v1.xy, dest.xy) */
|
||||
brw_MOV(p, t_nopersp, deref_4f(v1_ptr, delta));
|
||||
brw_MOV(p, tmp, deref_4f(dest_ptr, delta));
|
||||
brw_set_default_access_mode(p, BRW_ALIGN_16);
|
||||
brw_MOV(p,
|
||||
brw_writemask(t_nopersp, WRITEMASK_ZW),
|
||||
brw_swizzle(tmp, BRW_SWIZZLE_XYXY));
|
||||
|
||||
/* t_nopersp = vec4(v1.xy, dest.xy) - v0.xyxy */
|
||||
brw_ADD(p, t_nopersp, t_nopersp,
|
||||
negate(brw_swizzle(v0_ndc_copy, BRW_SWIZZLE_XYXY)));
|
||||
|
||||
/* Add the absolute values of the X and Y deltas so that if
|
||||
* the points aren't in the same place on the screen we get
|
||||
* nonzero values to divide.
|
||||
*
|
||||
* After that, we have vert1 - vert0 in t_nopersp.x and
|
||||
* vertnew - vert0 in t_nopersp.y
|
||||
*
|
||||
* t_nopersp = vec2(|v1.x -v0.x| + |v1.y -v0.y|,
|
||||
* |dest.x-v0.x| + |dest.y-v0.y|)
|
||||
*/
|
||||
brw_ADD(p,
|
||||
brw_writemask(t_nopersp, WRITEMASK_XY),
|
||||
brw_abs(brw_swizzle(t_nopersp, BRW_SWIZZLE_XZXZ)),
|
||||
brw_abs(brw_swizzle(t_nopersp, BRW_SWIZZLE_YWYW)));
|
||||
brw_set_default_access_mode(p, BRW_ALIGN_1);
|
||||
|
||||
/* If the points are in the same place, just substitute a
|
||||
* value to avoid divide-by-zero
|
||||
*/
|
||||
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ,
|
||||
vec1(t_nopersp),
|
||||
brw_imm_f(0));
|
||||
brw_IF(p, BRW_EXECUTE_1);
|
||||
brw_MOV(p, t_nopersp, brw_imm_vf4(brw_float_to_vf(1.0),
|
||||
brw_float_to_vf(0.0),
|
||||
brw_float_to_vf(0.0),
|
||||
brw_float_to_vf(0.0)));
|
||||
brw_ENDIF(p);
|
||||
|
||||
/* Now compute t_nopersp = t_nopersp.y/t_nopersp.x and broadcast it. */
|
||||
brw_math_invert(p, get_element(t_nopersp, 0), get_element(t_nopersp, 0));
|
||||
brw_MUL(p, vec1(t_nopersp), vec1(t_nopersp),
|
||||
vec1(suboffset(t_nopersp, 1)));
|
||||
brw_set_default_access_mode(p, BRW_ALIGN_16);
|
||||
brw_MOV(p, t_nopersp, brw_swizzle(t_nopersp, BRW_SWIZZLE_XXXX));
|
||||
brw_set_default_access_mode(p, BRW_ALIGN_1);
|
||||
|
||||
release_tmp(c, tmp);
|
||||
release_tmp(c, v0_ndc_copy);
|
||||
}
|
||||
|
||||
/* Now we can iterate over each attribute
|
||||
* (could be done in pairs?)
|
||||
*/
|
||||
for (slot = 0; slot < c->vue_map.num_slots; slot++) {
|
||||
int varying = c->vue_map.slot_to_varying[slot];
|
||||
GLuint delta = brw_vue_slot_to_offset(slot);
|
||||
|
||||
/* HPOS, NDC already handled above */
|
||||
if (varying == VARYING_SLOT_POS || varying == BRW_VARYING_SLOT_NDC)
|
||||
continue;
|
||||
|
||||
|
||||
if (varying == VARYING_SLOT_EDGE) {
|
||||
if (force_edgeflag)
|
||||
brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1));
|
||||
else
|
||||
brw_MOV(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta));
|
||||
} else if (varying == VARYING_SLOT_PSIZ) {
|
||||
/* PSIZ doesn't need interpolation because it isn't used by the
|
||||
* fragment shader.
|
||||
*/
|
||||
} else if (varying < VARYING_SLOT_MAX) {
|
||||
/* This is a true vertex result (and not a special value for the VUE
|
||||
* header), so interpolate:
|
||||
*
|
||||
* New = attr0 + t*attr1 - t*attr0
|
||||
*
|
||||
* Unless the attribute is flat shaded -- in which case just copy
|
||||
* from one of the sources (doesn't matter which; already copied from pv)
|
||||
*/
|
||||
GLuint interp = c->key.interp_mode[slot];
|
||||
|
||||
if (interp != INTERP_MODE_FLAT) {
|
||||
struct brw_reg tmp = get_tmp(c);
|
||||
struct brw_reg t =
|
||||
interp == INTERP_MODE_NOPERSPECTIVE ? t_nopersp : t0;
|
||||
|
||||
brw_MUL(p,
|
||||
vec4(brw_null_reg()),
|
||||
deref_4f(v1_ptr, delta),
|
||||
t);
|
||||
|
||||
brw_MAC(p,
|
||||
tmp,
|
||||
negate(deref_4f(v0_ptr, delta)),
|
||||
t);
|
||||
|
||||
brw_ADD(p,
|
||||
deref_4f(dest_ptr, delta),
|
||||
deref_4f(v0_ptr, delta),
|
||||
tmp);
|
||||
|
||||
release_tmp(c, tmp);
|
||||
}
|
||||
else {
|
||||
brw_MOV(p,
|
||||
deref_4f(dest_ptr, delta),
|
||||
deref_4f(v0_ptr, delta));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (c->vue_map.num_slots % 2) {
|
||||
GLuint delta = brw_vue_slot_to_offset(c->vue_map.num_slots);
|
||||
|
||||
brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0));
|
||||
}
|
||||
|
||||
if (c->key.contains_noperspective_varying)
|
||||
release_tmp(c, t_nopersp);
|
||||
}
|
||||
|
||||
void brw_clip_emit_vue(struct brw_clip_compile *c,
|
||||
struct brw_indirect vert,
|
||||
enum brw_urb_write_flags flags,
|
||||
GLuint header)
|
||||
{
|
||||
struct brw_codegen *p = &c->func;
|
||||
bool allocate = flags & BRW_URB_WRITE_ALLOCATE;
|
||||
|
||||
brw_clip_ff_sync(c);
|
||||
|
||||
/* Any URB entry that is allocated must subsequently be used or discarded,
|
||||
* so it doesn't make sense to mark EOT and ALLOCATE at the same time.
|
||||
*/
|
||||
assert(!(allocate && (flags & BRW_URB_WRITE_EOT)));
|
||||
|
||||
/* Copy the vertex from vertn into m1..mN+1:
|
||||
*/
|
||||
brw_copy_from_indirect(p, brw_message_reg(1), vert, c->nr_regs);
|
||||
|
||||
/* Overwrite PrimType and PrimStart in the message header, for
|
||||
* each vertex in turn:
|
||||
*/
|
||||
brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));
|
||||
|
||||
|
||||
/* Send each vertex as a separate write to the urb. This
|
||||
* is different to the concept in brw_sf_emit.c, where
|
||||
* subsequent writes are used to build up a single urb
|
||||
* entry. Each of these writes instantiates a separate
|
||||
* urb entry - (I think... what about 'allocate'?)
|
||||
*/
|
||||
brw_urb_WRITE(p,
|
||||
allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
|
||||
0,
|
||||
c->reg.R0,
|
||||
flags,
|
||||
c->nr_regs + 1, /* msg length */
|
||||
allocate ? 1 : 0, /* response_length */
|
||||
0, /* urb offset */
|
||||
BRW_URB_SWIZZLE_NONE);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void brw_clip_kill_thread(struct brw_clip_compile *c)
|
||||
{
|
||||
struct brw_codegen *p = &c->func;
|
||||
|
||||
brw_clip_ff_sync(c);
|
||||
/* Send an empty message to kill the thread and release any
|
||||
* allocated urb entry:
|
||||
*/
|
||||
brw_urb_WRITE(p,
|
||||
retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
|
||||
0,
|
||||
c->reg.R0,
|
||||
BRW_URB_WRITE_UNUSED | BRW_URB_WRITE_EOT_COMPLETE,
|
||||
1, /* msg len */
|
||||
0, /* response len */
|
||||
0,
|
||||
BRW_URB_SWIZZLE_NONE);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c )
|
||||
{
|
||||
return brw_address(c->reg.fixed_planes);
|
||||
}
|
||||
|
||||
|
||||
struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c )
|
||||
{
|
||||
if (c->key.nr_userclip) {
|
||||
return brw_imm_uw(16);
|
||||
}
|
||||
else {
|
||||
return brw_imm_uw(4);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Distribute flatshaded attributes from provoking vertex prior to
|
||||
* clipping.
|
||||
*/
|
||||
void brw_clip_copy_flatshaded_attributes( struct brw_clip_compile *c,
|
||||
GLuint to, GLuint from )
|
||||
{
|
||||
struct brw_codegen *p = &c->func;
|
||||
|
||||
for (int i = 0; i < c->vue_map.num_slots; i++) {
|
||||
if (c->key.interp_mode[i] == INTERP_MODE_FLAT) {
|
||||
brw_MOV(p,
|
||||
byte_offset(c->reg.vertex[to], brw_vue_slot_to_offset(i)),
|
||||
byte_offset(c->reg.vertex[from], brw_vue_slot_to_offset(i)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
void brw_clip_init_clipmask( struct brw_clip_compile *c )
|
||||
{
|
||||
struct brw_codegen *p = &c->func;
|
||||
struct brw_reg incoming = get_element_ud(c->reg.R0, 2);
|
||||
|
||||
/* Shift so that lowest outcode bit is rightmost:
|
||||
*/
|
||||
brw_SHR(p, c->reg.planemask, incoming, brw_imm_ud(26));
|
||||
|
||||
if (c->key.nr_userclip) {
|
||||
struct brw_reg tmp = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UD);
|
||||
|
||||
/* Rearrange userclip outcodes so that they come directly after
|
||||
* the fixed plane bits.
|
||||
*/
|
||||
if (p->devinfo->ver == 5 || p->devinfo->verx10 == 45)
|
||||
brw_AND(p, tmp, incoming, brw_imm_ud(0xff<<14));
|
||||
else
|
||||
brw_AND(p, tmp, incoming, brw_imm_ud(0x3f<<14));
|
||||
|
||||
brw_SHR(p, tmp, tmp, brw_imm_ud(8));
|
||||
brw_OR(p, c->reg.planemask, c->reg.planemask, tmp);
|
||||
|
||||
release_tmp(c, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
void brw_clip_ff_sync(struct brw_clip_compile *c)
|
||||
{
|
||||
struct brw_codegen *p = &c->func;
|
||||
|
||||
if (p->devinfo->ver == 5) {
|
||||
brw_AND(p, brw_null_reg(), c->reg.ff_sync, brw_imm_ud(0x1));
|
||||
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
|
||||
brw_IF(p, BRW_EXECUTE_1);
|
||||
{
|
||||
brw_OR(p, c->reg.ff_sync, c->reg.ff_sync, brw_imm_ud(0x1));
|
||||
brw_ff_sync(p,
|
||||
c->reg.R0,
|
||||
0,
|
||||
c->reg.R0,
|
||||
1, /* allocate */
|
||||
1, /* response length */
|
||||
0 /* eot */);
|
||||
}
|
||||
brw_ENDIF(p);
|
||||
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
|
||||
}
|
||||
}
|
||||
|
||||
void brw_clip_init_ff_sync(struct brw_clip_compile *c)
|
||||
{
|
||||
struct brw_codegen *p = &c->func;
|
||||
|
||||
if (p->devinfo->ver == 5) {
|
||||
brw_MOV(p, c->reg.ff_sync, brw_imm_ud(0));
|
||||
}
|
||||
}
|
||||
97
src/intel/compiler/elk/brw_compile_clip.c
Normal file
97
src/intel/compiler/elk/brw_compile_clip.c
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
/*
|
||||
* Copyright © 2006 - 2017 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_clip.h"
|
||||
#include "brw_disasm.h"
|
||||
|
||||
#include "dev/intel_debug.h"
|
||||
|
||||
const unsigned *
|
||||
brw_compile_clip(const struct brw_compiler *compiler,
|
||||
void *mem_ctx,
|
||||
const struct brw_clip_prog_key *key,
|
||||
struct brw_clip_prog_data *prog_data,
|
||||
struct intel_vue_map *vue_map,
|
||||
unsigned *final_assembly_size)
|
||||
{
|
||||
struct brw_clip_compile c;
|
||||
memset(&c, 0, sizeof(c));
|
||||
|
||||
/* Begin the compilation:
|
||||
*/
|
||||
brw_init_codegen(&compiler->isa, &c.func, mem_ctx);
|
||||
|
||||
c.func.single_program_flow = 1;
|
||||
|
||||
c.key = *key;
|
||||
c.vue_map = *vue_map;
|
||||
|
||||
/* nr_regs is the number of registers filled by reading data from the VUE.
|
||||
* This program accesses the entire VUE, so nr_regs needs to be the size of
|
||||
* the VUE (measured in pairs, since two slots are stored in each
|
||||
* register).
|
||||
*/
|
||||
c.nr_regs = (c.vue_map.num_slots + 1)/2;
|
||||
|
||||
c.prog_data.clip_mode = c.key.clip_mode; /* XXX */
|
||||
|
||||
/* For some reason the thread is spawned with only 4 channels
|
||||
* unmasked.
|
||||
*/
|
||||
brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE);
|
||||
|
||||
/* Would ideally have the option of producing a program which could
|
||||
* do all three:
|
||||
*/
|
||||
switch (key->primitive) {
|
||||
case MESA_PRIM_TRIANGLES:
|
||||
if (key->do_unfilled)
|
||||
brw_emit_unfilled_clip( &c );
|
||||
else
|
||||
brw_emit_tri_clip( &c );
|
||||
break;
|
||||
case MESA_PRIM_LINES:
|
||||
brw_emit_line_clip( &c );
|
||||
break;
|
||||
case MESA_PRIM_POINTS:
|
||||
brw_emit_point_clip( &c );
|
||||
break;
|
||||
default:
|
||||
unreachable("not reached");
|
||||
}
|
||||
|
||||
brw_compact_instructions(&c.func, 0, NULL);
|
||||
|
||||
*prog_data = c.prog_data;
|
||||
|
||||
const unsigned *program = brw_get_program(&c.func, final_assembly_size);
|
||||
|
||||
if (INTEL_DEBUG(DEBUG_CLIP)) {
|
||||
fprintf(stderr, "clip:\n");
|
||||
brw_disassemble_with_labels(&compiler->isa,
|
||||
program, 0, *final_assembly_size, stderr);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
return program;
|
||||
}
|
||||
662
src/intel/compiler/elk/brw_compile_ff_gs.c
Normal file
662
src/intel/compiler/elk/brw_compile_ff_gs.c
Normal file
|
|
@ -0,0 +1,662 @@
|
|||
/*
|
||||
Copyright (C) Intel Corp. 2006. All Rights Reserved.
|
||||
Intel funded Tungsten Graphics to
|
||||
develop this 3D driver.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice (including the
|
||||
next paragraph) shall be included in all copies or substantial
|
||||
portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
**********************************************************************/
|
||||
/*
|
||||
* Authors:
|
||||
* Keith Whitwell <keithw@vmware.com>
|
||||
*/
|
||||
|
||||
#include "brw_compiler.h"
|
||||
#include "brw_disasm.h"
|
||||
#include "brw_eu.h"
|
||||
#include "brw_prim.h"
|
||||
|
||||
#include "dev/intel_debug.h"
|
||||
|
||||
#define MAX_GS_VERTS (4)
|
||||
|
||||
struct brw_ff_gs_compile {
|
||||
struct brw_codegen func;
|
||||
struct brw_ff_gs_prog_key key;
|
||||
struct brw_ff_gs_prog_data *prog_data;
|
||||
|
||||
struct {
|
||||
struct brw_reg R0;
|
||||
|
||||
/**
|
||||
* Register holding streamed vertex buffer pointers -- see the Sandy
|
||||
* Bridge PRM, volume 2 part 1, section 4.4.2 (GS Thread Payload
|
||||
* [DevSNB]). These pointers are delivered in GRF 1.
|
||||
*/
|
||||
struct brw_reg SVBI;
|
||||
|
||||
struct brw_reg vertex[MAX_GS_VERTS];
|
||||
struct brw_reg header;
|
||||
struct brw_reg temp;
|
||||
|
||||
/**
|
||||
* Register holding destination indices for streamed buffer writes.
|
||||
* Only used for SOL programs.
|
||||
*/
|
||||
struct brw_reg destination_indices;
|
||||
} reg;
|
||||
|
||||
/* Number of registers used to store vertex data */
|
||||
GLuint nr_regs;
|
||||
|
||||
struct intel_vue_map vue_map;
|
||||
};
|
||||
|
||||
/**
|
||||
* Allocate registers for GS.
|
||||
*
|
||||
* If sol_program is true, then:
|
||||
*
|
||||
* - The thread will be spawned with the "SVBI Payload Enable" bit set, so GRF
|
||||
* 1 needs to be set aside to hold the streamed vertex buffer indices.
|
||||
*
|
||||
* - The thread will need to use the destination_indices register.
|
||||
*/
|
||||
static void brw_ff_gs_alloc_regs(struct brw_ff_gs_compile *c,
|
||||
GLuint nr_verts,
|
||||
bool sol_program)
|
||||
{
|
||||
GLuint i = 0,j;
|
||||
|
||||
/* Register usage is static, precompute here:
|
||||
*/
|
||||
c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
|
||||
|
||||
/* Streamed vertex buffer indices */
|
||||
if (sol_program)
|
||||
c->reg.SVBI = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
|
||||
|
||||
/* Payload vertices plus space for more generated vertices:
|
||||
*/
|
||||
for (j = 0; j < nr_verts; j++) {
|
||||
c->reg.vertex[j] = brw_vec4_grf(i, 0);
|
||||
i += c->nr_regs;
|
||||
}
|
||||
|
||||
c->reg.header = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
|
||||
c->reg.temp = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
|
||||
|
||||
if (sol_program) {
|
||||
c->reg.destination_indices =
|
||||
retype(brw_vec4_grf(i++, 0), BRW_REGISTER_TYPE_UD);
|
||||
}
|
||||
|
||||
c->prog_data->urb_read_length = c->nr_regs;
|
||||
c->prog_data->total_grf = i;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Set up the initial value of c->reg.header register based on c->reg.R0.
|
||||
*
|
||||
* The following information is passed to the GS thread in R0, and needs to be
|
||||
* included in the first URB_WRITE or FF_SYNC message sent by the GS:
|
||||
*
|
||||
* - DWORD 0 [31:0] handle info (Gen4 only)
|
||||
* - DWORD 5 [7:0] FFTID
|
||||
* - DWORD 6 [31:0] Debug info
|
||||
* - DWORD 7 [31:0] Debug info
|
||||
*
|
||||
* This function sets up the above data by copying by copying the contents of
|
||||
* R0 to the header register.
|
||||
*/
|
||||
static void brw_ff_gs_initialize_header(struct brw_ff_gs_compile *c)
|
||||
{
|
||||
struct brw_codegen *p = &c->func;
|
||||
brw_MOV(p, c->reg.header, c->reg.R0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Overwrite DWORD 2 of c->reg.header with the given immediate unsigned value.
|
||||
*
|
||||
* In URB_WRITE messages, DWORD 2 contains the fields PrimType, PrimStart,
|
||||
* PrimEnd, Increment CL_INVOCATIONS, and SONumPrimsWritten, many of which we
|
||||
* need to be able to update on a per-vertex basis.
|
||||
*/
|
||||
static void brw_ff_gs_overwrite_header_dw2(struct brw_ff_gs_compile *c,
|
||||
unsigned dw2)
|
||||
{
|
||||
struct brw_codegen *p = &c->func;
|
||||
brw_MOV(p, get_element_ud(c->reg.header, 2), brw_imm_ud(dw2));
|
||||
}
|
||||
|
||||
/**
|
||||
* Overwrite DWORD 2 of c->reg.header with the primitive type from c->reg.R0.
|
||||
*
|
||||
* When the thread is spawned, GRF 0 contains the primitive type in bits 4:0
|
||||
* of DWORD 2. URB_WRITE messages need the primitive type in bits 6:2 of
|
||||
* DWORD 2. So this function extracts the primitive type field, bitshifts it
|
||||
* appropriately, and stores it in c->reg.header.
|
||||
*/
|
||||
static void brw_ff_gs_overwrite_header_dw2_from_r0(struct brw_ff_gs_compile *c)
|
||||
{
|
||||
struct brw_codegen *p = &c->func;
|
||||
brw_AND(p, get_element_ud(c->reg.header, 2), get_element_ud(c->reg.R0, 2),
|
||||
brw_imm_ud(0x1f));
|
||||
brw_SHL(p, get_element_ud(c->reg.header, 2),
|
||||
get_element_ud(c->reg.header, 2), brw_imm_ud(2));
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply an additive offset to DWORD 2 of c->reg.header.
|
||||
*
|
||||
* This is used to set/unset the "PrimStart" and "PrimEnd" flags appropriately
|
||||
* for each vertex.
|
||||
*/
|
||||
static void brw_ff_gs_offset_header_dw2(struct brw_ff_gs_compile *c,
|
||||
int offset)
|
||||
{
|
||||
struct brw_codegen *p = &c->func;
|
||||
brw_ADD(p, get_element_d(c->reg.header, 2), get_element_d(c->reg.header, 2),
|
||||
brw_imm_d(offset));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Emit a vertex using the URB_WRITE message. Use the contents of
|
||||
* c->reg.header for the message header, and the registers starting at \c vert
|
||||
* for the vertex data.
|
||||
*
|
||||
* If \c last is true, then this is the last vertex, so no further URB space
|
||||
* should be allocated, and this message should end the thread.
|
||||
*
|
||||
* If \c last is false, then a new URB entry will be allocated, and its handle
|
||||
* will be stored in DWORD 0 of c->reg.header for use in the next URB_WRITE
|
||||
* message.
|
||||
*/
|
||||
static void brw_ff_gs_emit_vue(struct brw_ff_gs_compile *c,
|
||||
struct brw_reg vert,
|
||||
bool last)
|
||||
{
|
||||
struct brw_codegen *p = &c->func;
|
||||
int write_offset = 0;
|
||||
bool complete = false;
|
||||
|
||||
do {
|
||||
/* We can't write more than 14 registers at a time to the URB */
|
||||
int write_len = MIN2(c->nr_regs - write_offset, 14);
|
||||
if (write_len == c->nr_regs - write_offset)
|
||||
complete = true;
|
||||
|
||||
/* Copy the vertex from vertn into m1..mN+1:
|
||||
*/
|
||||
brw_copy8(p, brw_message_reg(1), offset(vert, write_offset), write_len);
|
||||
|
||||
/* Send the vertex data to the URB. If this is the last write for this
|
||||
* vertex, then we mark it as complete, and either end the thread or
|
||||
* allocate another vertex URB entry (depending whether this is the last
|
||||
* vertex).
|
||||
*/
|
||||
enum brw_urb_write_flags flags;
|
||||
if (!complete)
|
||||
flags = BRW_URB_WRITE_NO_FLAGS;
|
||||
else if (last)
|
||||
flags = BRW_URB_WRITE_EOT_COMPLETE;
|
||||
else
|
||||
flags = BRW_URB_WRITE_ALLOCATE_COMPLETE;
|
||||
brw_urb_WRITE(p,
|
||||
(flags & BRW_URB_WRITE_ALLOCATE) ? c->reg.temp
|
||||
: retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
|
||||
0,
|
||||
c->reg.header,
|
||||
flags,
|
||||
write_len + 1, /* msg length */
|
||||
(flags & BRW_URB_WRITE_ALLOCATE) ? 1
|
||||
: 0, /* response length */
|
||||
write_offset, /* urb offset */
|
||||
BRW_URB_SWIZZLE_NONE);
|
||||
write_offset += write_len;
|
||||
} while (!complete);
|
||||
|
||||
if (!last) {
|
||||
brw_MOV(p, get_element_ud(c->reg.header, 0),
|
||||
get_element_ud(c->reg.temp, 0));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Send an FF_SYNC message to ensure that all previously spawned GS threads
|
||||
* have finished sending primitives down the pipeline, and to allocate a URB
|
||||
* entry for the first output vertex. Only needed on Ironlake+.
|
||||
*
|
||||
* This function modifies c->reg.header: in DWORD 1, it stores num_prim (which
|
||||
* is needed by the FF_SYNC message), and in DWORD 0, it stores the handle to
|
||||
* the allocated URB entry (which will be needed by the URB_WRITE meesage that
|
||||
* follows).
|
||||
*/
|
||||
static void brw_ff_gs_ff_sync(struct brw_ff_gs_compile *c, int num_prim)
|
||||
{
|
||||
struct brw_codegen *p = &c->func;
|
||||
|
||||
brw_MOV(p, get_element_ud(c->reg.header, 1), brw_imm_ud(num_prim));
|
||||
brw_ff_sync(p,
|
||||
c->reg.temp,
|
||||
0,
|
||||
c->reg.header,
|
||||
1, /* allocate */
|
||||
1, /* response length */
|
||||
0 /* eot */);
|
||||
brw_MOV(p, get_element_ud(c->reg.header, 0),
|
||||
get_element_ud(c->reg.temp, 0));
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
brw_ff_gs_quads(struct brw_ff_gs_compile *c,
|
||||
const struct brw_ff_gs_prog_key *key)
|
||||
{
|
||||
brw_ff_gs_alloc_regs(c, 4, false);
|
||||
brw_ff_gs_initialize_header(c);
|
||||
/* Use polygons for correct edgeflag behaviour. Note that vertex 3
|
||||
* is the PV for quads, but vertex 0 for polygons:
|
||||
*/
|
||||
if (c->func.devinfo->ver == 5)
|
||||
brw_ff_gs_ff_sync(c, 1);
|
||||
brw_ff_gs_overwrite_header_dw2(
|
||||
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
|
||||
| URB_WRITE_PRIM_START));
|
||||
if (key->pv_first) {
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
|
||||
brw_ff_gs_overwrite_header_dw2(
|
||||
c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
|
||||
brw_ff_gs_overwrite_header_dw2(
|
||||
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
|
||||
| URB_WRITE_PRIM_END));
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[3], 1);
|
||||
}
|
||||
else {
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[3], 0);
|
||||
brw_ff_gs_overwrite_header_dw2(
|
||||
c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
|
||||
brw_ff_gs_overwrite_header_dw2(
|
||||
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
|
||||
| URB_WRITE_PRIM_END));
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[2], 1);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
brw_ff_gs_quad_strip(struct brw_ff_gs_compile *c,
|
||||
const struct brw_ff_gs_prog_key *key)
|
||||
{
|
||||
brw_ff_gs_alloc_regs(c, 4, false);
|
||||
brw_ff_gs_initialize_header(c);
|
||||
|
||||
if (c->func.devinfo->ver == 5)
|
||||
brw_ff_gs_ff_sync(c, 1);
|
||||
brw_ff_gs_overwrite_header_dw2(
|
||||
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
|
||||
| URB_WRITE_PRIM_START));
|
||||
if (key->pv_first) {
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
|
||||
brw_ff_gs_overwrite_header_dw2(
|
||||
c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
|
||||
brw_ff_gs_overwrite_header_dw2(
|
||||
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
|
||||
| URB_WRITE_PRIM_END));
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[3], 1);
|
||||
}
|
||||
else {
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
|
||||
brw_ff_gs_overwrite_header_dw2(
|
||||
c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[3], 0);
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
|
||||
brw_ff_gs_overwrite_header_dw2(
|
||||
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
|
||||
| URB_WRITE_PRIM_END));
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[1], 1);
|
||||
}
|
||||
}
|
||||
|
||||
static void brw_ff_gs_lines(struct brw_ff_gs_compile *c)
|
||||
{
|
||||
brw_ff_gs_alloc_regs(c, 2, false);
|
||||
brw_ff_gs_initialize_header(c);
|
||||
|
||||
if (c->func.devinfo->ver == 5)
|
||||
brw_ff_gs_ff_sync(c, 1);
|
||||
brw_ff_gs_overwrite_header_dw2(
|
||||
c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
|
||||
| URB_WRITE_PRIM_START));
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
|
||||
brw_ff_gs_overwrite_header_dw2(
|
||||
c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
|
||||
| URB_WRITE_PRIM_END));
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[1], 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate the geometry shader program used on Gen6 to perform stream output
|
||||
* (transform feedback).
|
||||
*/
|
||||
static void
|
||||
gfx6_sol_program(struct brw_ff_gs_compile *c, const struct brw_ff_gs_prog_key *key,
|
||||
unsigned num_verts, bool check_edge_flags)
|
||||
{
|
||||
struct brw_codegen *p = &c->func;
|
||||
brw_inst *inst;
|
||||
c->prog_data->svbi_postincrement_value = num_verts;
|
||||
|
||||
brw_ff_gs_alloc_regs(c, num_verts, true);
|
||||
brw_ff_gs_initialize_header(c);
|
||||
|
||||
if (key->num_transform_feedback_bindings > 0) {
|
||||
unsigned vertex, binding;
|
||||
struct brw_reg destination_indices_uw =
|
||||
vec8(retype(c->reg.destination_indices, BRW_REGISTER_TYPE_UW));
|
||||
|
||||
/* Note: since we use the binding table to keep track of buffer offsets
|
||||
* and stride, the GS doesn't need to keep track of a separate pointer
|
||||
* into each buffer; it uses a single pointer which increments by 1 for
|
||||
* each vertex. So we use SVBI0 for this pointer, regardless of whether
|
||||
* transform feedback is in interleaved or separate attribs mode.
|
||||
*
|
||||
* Make sure that the buffers have enough room for all the vertices.
|
||||
*/
|
||||
brw_ADD(p, get_element_ud(c->reg.temp, 0),
|
||||
get_element_ud(c->reg.SVBI, 0), brw_imm_ud(num_verts));
|
||||
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE,
|
||||
get_element_ud(c->reg.temp, 0),
|
||||
get_element_ud(c->reg.SVBI, 4));
|
||||
brw_IF(p, BRW_EXECUTE_1);
|
||||
|
||||
/* Compute the destination indices to write to. Usually we use SVBI[0]
|
||||
* + (0, 1, 2). However, for odd-numbered triangles in tristrips, the
|
||||
* vertices come down the pipeline in reversed winding order, so we need
|
||||
* to flip the order when writing to the transform feedback buffer. To
|
||||
* ensure that flatshading accuracy is preserved, we need to write them
|
||||
* in order SVBI[0] + (0, 2, 1) if we're using the first provoking
|
||||
* vertex convention, and in order SVBI[0] + (1, 0, 2) if we're using
|
||||
* the last provoking vertex convention.
|
||||
*
|
||||
* Note: since brw_imm_v can only be used in instructions in
|
||||
* packed-word execution mode, and SVBI is a double-word, we need to
|
||||
* first move the appropriate immediate constant ((0, 1, 2), (0, 2, 1),
|
||||
* or (1, 0, 2)) to the destination_indices register, and then add SVBI
|
||||
* using a separate instruction. Also, since the immediate constant is
|
||||
* expressed as packed words, and we need to load double-words into
|
||||
* destination_indices, we need to intersperse zeros to fill the upper
|
||||
* halves of each double-word.
|
||||
*/
|
||||
brw_MOV(p, destination_indices_uw,
|
||||
brw_imm_v(0x00020100)); /* (0, 1, 2) */
|
||||
if (num_verts == 3) {
|
||||
/* Get primitive type into temp register. */
|
||||
brw_AND(p, get_element_ud(c->reg.temp, 0),
|
||||
get_element_ud(c->reg.R0, 2), brw_imm_ud(0x1f));
|
||||
|
||||
/* Test if primitive type is TRISTRIP_REVERSE. We need to do this as
|
||||
* an 8-wide comparison so that the conditional MOV that follows
|
||||
* moves all 8 words correctly.
|
||||
*/
|
||||
brw_CMP(p, vec8(brw_null_reg()), BRW_CONDITIONAL_EQ,
|
||||
get_element_ud(c->reg.temp, 0),
|
||||
brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));
|
||||
|
||||
/* If so, then overwrite destination_indices_uw with the appropriate
|
||||
* reordering.
|
||||
*/
|
||||
inst = brw_MOV(p, destination_indices_uw,
|
||||
brw_imm_v(key->pv_first ? 0x00010200 /* (0, 2, 1) */
|
||||
: 0x00020001)); /* (1, 0, 2) */
|
||||
brw_inst_set_pred_control(p->devinfo, inst, BRW_PREDICATE_NORMAL);
|
||||
}
|
||||
|
||||
assert(c->reg.destination_indices.width == BRW_EXECUTE_4);
|
||||
brw_push_insn_state(p);
|
||||
brw_set_default_exec_size(p, BRW_EXECUTE_4);
|
||||
brw_ADD(p, c->reg.destination_indices,
|
||||
c->reg.destination_indices, get_element_ud(c->reg.SVBI, 0));
|
||||
brw_pop_insn_state(p);
|
||||
/* For each vertex, generate code to output each varying using the
|
||||
* appropriate binding table entry.
|
||||
*/
|
||||
for (vertex = 0; vertex < num_verts; ++vertex) {
|
||||
/* Set up the correct destination index for this vertex */
|
||||
brw_MOV(p, get_element_ud(c->reg.header, 5),
|
||||
get_element_ud(c->reg.destination_indices, vertex));
|
||||
|
||||
for (binding = 0; binding < key->num_transform_feedback_bindings;
|
||||
++binding) {
|
||||
unsigned char varying =
|
||||
key->transform_feedback_bindings[binding];
|
||||
unsigned char slot = c->vue_map.varying_to_slot[varying];
|
||||
/* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
|
||||
*
|
||||
* "Prior to End of Thread with a URB_WRITE, the kernel must
|
||||
* ensure that all writes are complete by sending the final
|
||||
* write as a committed write."
|
||||
*/
|
||||
bool final_write =
|
||||
binding == key->num_transform_feedback_bindings - 1 &&
|
||||
vertex == num_verts - 1;
|
||||
struct brw_reg vertex_slot = c->reg.vertex[vertex];
|
||||
vertex_slot.nr += slot / 2;
|
||||
vertex_slot.subnr = (slot % 2) * 16;
|
||||
/* gl_PointSize is stored in VARYING_SLOT_PSIZ.w. */
|
||||
vertex_slot.swizzle = varying == VARYING_SLOT_PSIZ
|
||||
? BRW_SWIZZLE_WWWW : key->transform_feedback_swizzles[binding];
|
||||
brw_set_default_access_mode(p, BRW_ALIGN_16);
|
||||
brw_push_insn_state(p);
|
||||
brw_set_default_exec_size(p, BRW_EXECUTE_4);
|
||||
|
||||
brw_MOV(p, stride(c->reg.header, 4, 4, 1),
|
||||
retype(vertex_slot, BRW_REGISTER_TYPE_UD));
|
||||
brw_pop_insn_state(p);
|
||||
|
||||
brw_set_default_access_mode(p, BRW_ALIGN_1);
|
||||
brw_svb_write(p,
|
||||
final_write ? c->reg.temp : brw_null_reg(), /* dest */
|
||||
1, /* msg_reg_nr */
|
||||
c->reg.header, /* src0 */
|
||||
BRW_GFX6_SOL_BINDING_START + binding, /* binding_table_index */
|
||||
final_write); /* send_commit_msg */
|
||||
}
|
||||
}
|
||||
brw_ENDIF(p);
|
||||
|
||||
/* Now, reinitialize the header register from R0 to restore the parts of
|
||||
* the register that we overwrote while streaming out transform feedback
|
||||
* data.
|
||||
*/
|
||||
brw_ff_gs_initialize_header(c);
|
||||
|
||||
/* Finally, wait for the write commit to occur so that we can proceed to
|
||||
* other things safely.
|
||||
*
|
||||
* From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
|
||||
*
|
||||
* The write commit does not modify the destination register, but
|
||||
* merely clears the dependency associated with the destination
|
||||
* register. Thus, a simple “mov” instruction using the register as a
|
||||
* source is sufficient to wait for the write commit to occur.
|
||||
*/
|
||||
brw_MOV(p, c->reg.temp, c->reg.temp);
|
||||
}
|
||||
|
||||
brw_ff_gs_ff_sync(c, 1);
|
||||
|
||||
brw_ff_gs_overwrite_header_dw2_from_r0(c);
|
||||
switch (num_verts) {
|
||||
case 1:
|
||||
brw_ff_gs_offset_header_dw2(c,
|
||||
URB_WRITE_PRIM_START | URB_WRITE_PRIM_END);
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[0], true);
|
||||
break;
|
||||
case 2:
|
||||
brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[0], false);
|
||||
brw_ff_gs_offset_header_dw2(c,
|
||||
URB_WRITE_PRIM_END - URB_WRITE_PRIM_START);
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[1], true);
|
||||
break;
|
||||
case 3:
|
||||
if (check_edge_flags) {
|
||||
/* Only emit vertices 0 and 1 if this is the first triangle of the
|
||||
* polygon. Otherwise they are redundant.
|
||||
*/
|
||||
brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
|
||||
get_element_ud(c->reg.R0, 2),
|
||||
brw_imm_ud(BRW_GS_EDGE_INDICATOR_0));
|
||||
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
|
||||
brw_IF(p, BRW_EXECUTE_1);
|
||||
}
|
||||
brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[0], false);
|
||||
brw_ff_gs_offset_header_dw2(c, -URB_WRITE_PRIM_START);
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[1], false);
|
||||
if (check_edge_flags) {
|
||||
brw_ENDIF(p);
|
||||
/* Only emit vertex 2 in PRIM_END mode if this is the last triangle
|
||||
* of the polygon. Otherwise leave the primitive incomplete because
|
||||
* there are more polygon vertices coming.
|
||||
*/
|
||||
brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
|
||||
get_element_ud(c->reg.R0, 2),
|
||||
brw_imm_ud(BRW_GS_EDGE_INDICATOR_1));
|
||||
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
|
||||
brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL);
|
||||
}
|
||||
brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_END);
|
||||
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
|
||||
brw_ff_gs_emit_vue(c, c->reg.vertex[2], true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Compile a fixed-function geometry shader program.
 *
 * On Gfx6 (Sandybridge) the GS is used to implement transform feedback
 * ("Stream Out"); on Gfx4-5 it is used to decompose quads/quad-strips/
 * line-loops into primitives the rest of the pipeline can handle.
 *
 * Returns the assembled program (length in *final_assembly_size), or NULL
 * for a Gfx4-5 primitive type that needs no GS program.
 */
const unsigned *
brw_compile_ff_gs_prog(struct brw_compiler *compiler,
                       void *mem_ctx,
                       const struct brw_ff_gs_prog_key *key,
                       struct brw_ff_gs_prog_data *prog_data,
                       struct intel_vue_map *vue_map,
                       unsigned *final_assembly_size)
{
   struct brw_ff_gs_compile c;
   const GLuint *program;

   memset(&c, 0, sizeof(c));

   c.key = *key;
   c.vue_map = *vue_map;
   /* Two VUE slots fit per GRF, hence the divide-by-2 (rounded up). */
   c.nr_regs = (c.vue_map.num_slots + 1)/2;
   c.prog_data = prog_data;

   /* NOTE(review): this clobbers the caller-supplied mem_ctx parameter with
    * a fresh context that is never freed here, and the returned program is
    * allocated from it — looks like a leak / ownership bug; confirm against
    * callers before changing.
    */
   mem_ctx = ralloc_context(NULL);

   /* Begin the compilation:
    */
   brw_init_codegen(&compiler->isa, &c.func, mem_ctx);

   c.func.single_program_flow = 1;

   /* For some reason the thread is spawned with only 4 channels
    * unmasked.
    */
   brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE);

   if (compiler->devinfo->ver >= 6) {
      unsigned num_verts;
      bool check_edge_flag;
      /* On Sandybridge, we use the GS for implementing transform feedback
       * (called "Stream Out" in the PRM).
       */
      switch (key->primitive) {
      case _3DPRIM_POINTLIST:
         num_verts = 1;
         check_edge_flag = false;
         break;
      case _3DPRIM_LINELIST:
      case _3DPRIM_LINESTRIP:
      case _3DPRIM_LINELOOP:
         num_verts = 2;
         check_edge_flag = false;
         break;
      case _3DPRIM_TRILIST:
      case _3DPRIM_TRIFAN:
      case _3DPRIM_TRISTRIP:
      case _3DPRIM_RECTLIST:
         num_verts = 3;
         check_edge_flag = false;
         break;
      case _3DPRIM_QUADLIST:
      case _3DPRIM_QUADSTRIP:
      case _3DPRIM_POLYGON:
         /* Quads/polygons are emitted as triangles; edge flags tell which
          * triangle of the decomposition we are in.
          */
         num_verts = 3;
         check_edge_flag = true;
         break;
      default:
         unreachable("Unexpected primitive type in Gen6 SOL program.");
      }
      gfx6_sol_program(&c, key, num_verts, check_edge_flag);
   } else {
      /* On Gen4-5, we use the GS to decompose certain types of primitives.
       * Note that primitives which don't require a GS program have already
       * been weeded out by now.
       */
      switch (key->primitive) {
      case _3DPRIM_QUADLIST:
         brw_ff_gs_quads( &c, key );
         break;
      case _3DPRIM_QUADSTRIP:
         brw_ff_gs_quad_strip( &c, key );
         break;
      case _3DPRIM_LINELOOP:
         brw_ff_gs_lines( &c );
         break;
      default:
         return NULL;
      }
   }

   brw_compact_instructions(&c.func, 0, NULL);

   /* get the program
    */
   program = brw_get_program(&c.func, final_assembly_size);

   if (INTEL_DEBUG(DEBUG_GS)) {
      fprintf(stderr, "gs:\n");
      brw_disassemble_with_labels(&compiler->isa, c.func.store,
                                  0, *final_assembly_size, stderr);
      fprintf(stderr, "\n");
   }

   return program;
}
|
||||
|
||||
New file: src/intel/compiler/elk/brw_compile_sf.c (881 lines)
@@ -0,0 +1,881 @@
|
|||
/*
|
||||
* Copyright © 2006 - 2017 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_compiler.h"
|
||||
#include "brw_disasm.h"
|
||||
#include "brw_eu.h"
|
||||
#include "brw_prim.h"
|
||||
|
||||
#include "dev/intel_debug.h"
|
||||
|
||||
/* Compilation state for the Gen4/5 SF (setup) program: the key/prog_data
 * being built plus brw_reg handles for payload inputs, per-vertex URB data
 * and scratch registers used while emitting code.
 */
struct brw_sf_compile {
   struct brw_codegen func;           /* EU code-generation state */
   struct brw_sf_prog_key key;        /* copy of the caller's key */
   struct brw_sf_prog_data prog_data; /* filled in during compilation */

   struct brw_reg pv;  /* provoking-vertex selector; used as a computed-jump
                        * index by do_flatshade_triangle/line */
   struct brw_reg det; /* determinant (payload from the fixed-function unit) */
   struct brw_reg dx0; /* edge deltas from the fixed-function unit */
   struct brw_reg dx2;
   struct brw_reg dy0;
   struct brw_reg dy2;

   /* z and 1/w passed in separately:
    */
   struct brw_reg z[3];
   struct brw_reg inv_w[3];

   /* The vertices:
    */
   struct brw_reg vert[3];

   /* Temporaries, allocated after last vertex reg.
    */
   struct brw_reg inv_det;
   struct brw_reg a1_sub_a0;
   struct brw_reg a2_sub_a0;
   struct brw_reg tmp;

   struct brw_reg m1Cx; /* output message regs: dA/dx coefficients */
   struct brw_reg m2Cy; /* dA/dy coefficients */
   struct brw_reg m3C0; /* interpolation start values */

   GLuint nr_verts;      /* vertices per primitive (1, 2 or 3) */
   GLuint nr_attr_regs;  /* URB regs read per vertex (2 slots per reg) */
   GLuint nr_setup_regs; /* coefficient regs written per primitive */
   int urb_entry_read_offset;

   /** The last known value of the f0.0 flag register. */
   unsigned flag_value;

   struct intel_vue_map vue_map;
};
|
||||
|
||||
/**
|
||||
* Determine the vue slot corresponding to the given half of the given register.
|
||||
*/
|
||||
static inline int vert_reg_to_vue_slot(struct brw_sf_compile *c, GLuint reg,
|
||||
int half)
|
||||
{
|
||||
return (reg + c->urb_entry_read_offset) * 2 + half;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine the varying corresponding to the given half of the given
|
||||
* register. half=0 means the first half of a register, half=1 means the
|
||||
* second half.
|
||||
*/
|
||||
static inline int vert_reg_to_varying(struct brw_sf_compile *c, GLuint reg,
|
||||
int half)
|
||||
{
|
||||
int vue_slot = vert_reg_to_vue_slot(c, reg, half);
|
||||
return c->vue_map.slot_to_varying[vue_slot];
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine the register corresponding to the given vue slot
|
||||
*/
|
||||
static struct brw_reg get_vue_slot(struct brw_sf_compile *c,
|
||||
struct brw_reg vert,
|
||||
int vue_slot)
|
||||
{
|
||||
GLuint off = vue_slot / 2 - c->urb_entry_read_offset;
|
||||
GLuint sub = vue_slot % 2;
|
||||
|
||||
return brw_vec4_grf(vert.nr + off, sub * 4);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine the register corresponding to the given varying.
|
||||
*/
|
||||
static struct brw_reg get_varying(struct brw_sf_compile *c,
|
||||
struct brw_reg vert,
|
||||
GLuint varying)
|
||||
{
|
||||
int vue_slot = c->vue_map.varying_to_slot[varying];
|
||||
assert (vue_slot >= c->urb_entry_read_offset);
|
||||
return get_vue_slot(c, vert, vue_slot);
|
||||
}
|
||||
|
||||
static bool
|
||||
have_attr(struct brw_sf_compile *c, GLuint attr)
|
||||
{
|
||||
return (c->key.attrs & BITFIELD64_BIT(attr)) ? 1 : 0;
|
||||
}
|
||||
|
||||
/***********************************************************************
|
||||
* Twoside lighting
|
||||
*/
|
||||
/* Copy the back-face colors (BFC0/BFC1) over the front colors (COL0/COL1)
 * of one vertex, for each color pair that is actually present.
 */
static void copy_bfc( struct brw_sf_compile *c,
                      struct brw_reg vert )
{
   struct brw_codegen *p = &c->func;
   GLuint i;

   /* i = 0 handles COL0/BFC0, i = 1 handles COL1/BFC1. */
   for (i = 0; i < 2; i++) {
      if (have_attr(c, VARYING_SLOT_COL0+i) &&
          have_attr(c, VARYING_SLOT_BFC0+i))
         brw_MOV(p,
                 get_varying(c, vert, VARYING_SLOT_COL0+i),
                 get_varying(c, vert, VARYING_SLOT_BFC0+i));
   }
}
|
||||
|
||||
|
||||
/* Two-sided lighting: if the primitive is back-facing (sign of the
 * determinant, taking winding into account), replace the front colors of
 * every vertex with the back-face colors.
 */
static void do_twoside_color( struct brw_sf_compile *c )
{
   struct brw_codegen *p = &c->func;
   /* Which determinant sign means "back-facing" depends on the winding. */
   GLuint backface_conditional = c->key.frontface_ccw ? BRW_CONDITIONAL_G : BRW_CONDITIONAL_L;

   /* Already done in clip program:
    */
   if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS)
      return;

   /* If the vertex shader provides backface color, do the selection. The VS
    * promises to set up the front color if the backface color is provided, but
    * it may contain junk if never written to.
    */
   if (!(have_attr(c, VARYING_SLOT_COL0) && have_attr(c, VARYING_SLOT_BFC0)) &&
       !(have_attr(c, VARYING_SLOT_COL1) && have_attr(c, VARYING_SLOT_BFC1)))
      return;

   /* Need to use BRW_EXECUTE_4 and also do an 4-wide compare in order
    * to get all channels active inside the IF. In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisons.
    */
   brw_CMP(p, vec4(brw_null_reg()), backface_conditional, c->det, brw_imm_f(0));
   brw_IF(p, BRW_EXECUTE_4);
   {
      /* Deliberate fall-through: copy colors for every vertex the
       * primitive has.
       */
      switch (c->nr_verts) {
      case 3: copy_bfc(c, c->vert[2]); FALLTHROUGH;
      case 2: copy_bfc(c, c->vert[1]); FALLTHROUGH;
      case 1: copy_bfc(c, c->vert[0]);
      }
   }
   brw_ENDIF(p);
}
|
||||
|
||||
|
||||
|
||||
/***********************************************************************
|
||||
* Flat shading
|
||||
*/
|
||||
|
||||
/* Copy every flat-shaded VUE slot from vertex @src to vertex @dst. */
static void copy_flatshaded_attributes(struct brw_sf_compile *c,
                                       struct brw_reg dst,
                                       struct brw_reg src)
{
   struct brw_codegen *p = &c->func;
   int i;

   for (i = 0; i < c->vue_map.num_slots; i++) {
      if (c->key.interp_mode[i] == INTERP_MODE_FLAT) {
         brw_MOV(p,
                 get_vue_slot(c, dst, i),
                 get_vue_slot(c, src, i));
      }
   }
}
|
||||
|
||||
static int count_flatshaded_attributes(struct brw_sf_compile *c)
|
||||
{
|
||||
int i;
|
||||
int count = 0;
|
||||
|
||||
for (i = 0; i < c->vue_map.num_slots; i++)
|
||||
if (c->key.interp_mode[i] == INTERP_MODE_FLAT)
|
||||
count++;
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Need to use a computed jump to copy flatshaded attributes as the
 * vertices are ordered according to y-coordinate before reaching this
 * point, so the PV could be anywhere.
 */
static void do_flatshade_triangle( struct brw_sf_compile *c )
{
   struct brw_codegen *p = &c->func;
   GLuint nr;
   GLuint jmpi = 1;

   /* Already done in clip program:
    */
   if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS)
      return;

   /* JMPI counts differ on Gen5 (scale by 2 — see also do_flatshade_line). */
   if (p->devinfo->ver == 5)
      jmpi = 2;

   nr = count_flatshaded_attributes(c);

   /* Dispatch on the provoking vertex: each case below emits nr*2 MOVs
    * (copy_flatshaded_attributes emits nr each) plus one trailing JMPI,
    * hence the nr*2+1 stride.  The jump targets are instruction counts,
    * so keep the emitted sequence in exact sync with these arithmetic
    * expressions.
    */
   brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr*2+1)));
   brw_JMPI(p, c->pv, BRW_PREDICATE_NONE);

   /* PV == 0 */
   copy_flatshaded_attributes(c, c->vert[1], c->vert[0]);
   copy_flatshaded_attributes(c, c->vert[2], c->vert[0]);
   brw_JMPI(p, brw_imm_d(jmpi*(nr*4+1)), BRW_PREDICATE_NONE);

   /* PV == 1 */
   copy_flatshaded_attributes(c, c->vert[0], c->vert[1]);
   copy_flatshaded_attributes(c, c->vert[2], c->vert[1]);
   brw_JMPI(p, brw_imm_d(jmpi*nr*2), BRW_PREDICATE_NONE);

   /* PV == 2 */
   copy_flatshaded_attributes(c, c->vert[0], c->vert[2]);
   copy_flatshaded_attributes(c, c->vert[1], c->vert[2]);
}
|
||||
|
||||
|
||||
/* Line version of do_flatshade_triangle: computed jump on the provoking
 * vertex, copying flat attributes to the other vertex of the line.
 */
static void do_flatshade_line( struct brw_sf_compile *c )
{
   struct brw_codegen *p = &c->func;
   GLuint nr;
   GLuint jmpi = 1;

   /* Already done in clip program:
    */
   if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS)
      return;

   if (p->devinfo->ver == 5)
      jmpi = 2;

   nr = count_flatshaded_attributes(c);

   /* Each branch is nr MOVs (+1 for the trailing JMPI of the first one);
    * jump distances are instruction counts and must match what is emitted
    * below exactly.
    */
   brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr+1)));
   brw_JMPI(p, c->pv, BRW_PREDICATE_NONE);
   /* PV == 0 */
   copy_flatshaded_attributes(c, c->vert[1], c->vert[0]);

   brw_JMPI(p, brw_imm_ud(jmpi*nr), BRW_PREDICATE_NONE);
   /* PV == 1 */
   copy_flatshaded_attributes(c, c->vert[0], c->vert[1]);
}
|
||||
|
||||
|
||||
/***********************************************************************
|
||||
* Triangle setup.
|
||||
*/
|
||||
|
||||
|
||||
/* Fix the register layout for the SF program: payload inputs at g1/g2,
 * vertex data from g3, then scratch registers, and record the total GRF
 * count in prog_data.  Also sets up the m1..m3 output message registers.
 */
static void alloc_regs( struct brw_sf_compile *c )
{
   GLuint reg, i;

   /* Values computed by fixed function unit:
    */
   c->pv  = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_D);
   c->det = brw_vec1_grf(1, 2);
   c->dx0 = brw_vec1_grf(1, 3);
   c->dx2 = brw_vec1_grf(1, 4);
   c->dy0 = brw_vec1_grf(1, 5);
   c->dy2 = brw_vec1_grf(1, 6);

   /* z and 1/w passed in separately:
    */
   c->z[0]     = brw_vec1_grf(2, 0);
   c->inv_w[0] = brw_vec1_grf(2, 1);
   c->z[1]     = brw_vec1_grf(2, 2);
   c->inv_w[1] = brw_vec1_grf(2, 3);
   c->z[2]     = brw_vec1_grf(2, 4);
   c->inv_w[2] = brw_vec1_grf(2, 5);

   /* The vertices:
    */
   reg = 3;
   for (i = 0; i < c->nr_verts; i++) {
      c->vert[i] = brw_vec8_grf(reg, 0);
      reg += c->nr_attr_regs;
   }

   /* Temporaries, allocated after last vertex reg.
    */
   c->inv_det   = brw_vec1_grf(reg, 0); reg++;
   c->a1_sub_a0 = brw_vec8_grf(reg, 0); reg++;
   c->a2_sub_a0 = brw_vec8_grf(reg, 0); reg++;
   c->tmp       = brw_vec8_grf(reg, 0); reg++;

   /* Note grf allocation:
    */
   c->prog_data.total_grf = reg;


   /* Outputs of this program - interpolation coefficients for
    * rasterization:
    */
   c->m1Cx = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 1, 0);
   c->m2Cy = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 2, 0);
   c->m3C0 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 3, 0);
}
|
||||
|
||||
|
||||
/* Copy each vertex's z and 1/w payload values into elements 2..3 of its
 * vertex register block.
 */
static void copy_z_inv_w( struct brw_sf_compile *c )
{
   struct brw_codegen *p = &c->func;
   GLuint i;

   /* Copy both scalars with a single MOV:
    */
   for (i = 0; i < c->nr_verts; i++)
      brw_MOV(p, vec2(suboffset(c->vert[i], 2)), vec2(c->z[i]));
}
|
||||
|
||||
|
||||
/* Compute 1/det (via the EU math unit) into c->inv_det. */
static void invert_det( struct brw_sf_compile *c)
{
   /* Looks like we invert all 8 elements just to get 1/det in
    * position 2 !?!
    */
   gfx4_math(&c->func,
             c->inv_det,
             BRW_MATH_FUNCTION_INV,
             0,
             c->det,
             BRW_MATH_PRECISION_FULL);

}
|
||||
|
||||
|
||||
/* Compute the channel-enable masks for one setup register, which holds two
 * VUE slots side by side (low nibble = first slot, 0xf0 = second slot).
 *
 * *pc        - channels carrying attribute data at all
 * *pc_persp  - channels needing perspective (1/w) correction
 * *pc_linear - channels needing linear dA/dx, dA/dy coefficients
 *
 * Returns true when this is the last setup register, so the caller can
 * flag the final URB write with EOT.
 */
static bool
calculate_masks(struct brw_sf_compile *c,
                GLuint reg,
                GLushort *pc,
                GLushort *pc_persp,
                GLushort *pc_linear)
{
   bool is_last_attr = (reg == c->nr_setup_regs - 1);
   enum glsl_interp_mode interp;

   *pc_persp = 0;
   *pc_linear = 0;
   *pc = 0xf;

   interp = c->key.interp_mode[vert_reg_to_vue_slot(c, reg, 0)];
   if (interp == INTERP_MODE_SMOOTH) {
      /* SMOOTH needs both perspective correction and linear coefficients. */
      *pc_linear = 0xf;
      *pc_persp = 0xf;
   } else if (interp == INTERP_MODE_NOPERSPECTIVE)
      *pc_linear = 0xf;

   /* Maybe only process one attribute on the final round:
    */
   if (vert_reg_to_varying(c, reg, 1) != BRW_VARYING_SLOT_COUNT) {
      *pc |= 0xf0;

      interp = c->key.interp_mode[vert_reg_to_vue_slot(c, reg, 1)];
      if (interp == INTERP_MODE_SMOOTH) {
         *pc_linear |= 0xf0;
         *pc_persp |= 0xf0;
      } else if (interp == INTERP_MODE_NOPERSPECTIVE)
         *pc_linear |= 0xf0;
   }

   return is_last_attr;
}
|
||||
|
||||
/* Calculates the predicate control for which channels of a reg
|
||||
* (containing 2 attrs) to do point sprite coordinate replacement on.
|
||||
*/
|
||||
static uint16_t
|
||||
calculate_point_sprite_mask(struct brw_sf_compile *c, GLuint reg)
|
||||
{
|
||||
int varying1, varying2;
|
||||
uint16_t pc = 0;
|
||||
|
||||
varying1 = vert_reg_to_varying(c, reg, 0);
|
||||
if (varying1 >= VARYING_SLOT_TEX0 && varying1 <= VARYING_SLOT_TEX7) {
|
||||
if (c->key.point_sprite_coord_replace & (1 << (varying1 - VARYING_SLOT_TEX0)))
|
||||
pc |= 0x0f;
|
||||
}
|
||||
if (varying1 == BRW_VARYING_SLOT_PNTC)
|
||||
pc |= 0x0f;
|
||||
|
||||
varying2 = vert_reg_to_varying(c, reg, 1);
|
||||
if (varying2 >= VARYING_SLOT_TEX0 && varying2 <= VARYING_SLOT_TEX7) {
|
||||
if (c->key.point_sprite_coord_replace & (1 << (varying2 -
|
||||
VARYING_SLOT_TEX0)))
|
||||
pc |= 0xf0;
|
||||
}
|
||||
if (varying2 == BRW_VARYING_SLOT_PNTC)
|
||||
pc |= 0xf0;
|
||||
|
||||
return pc;
|
||||
}
|
||||
|
||||
static void
|
||||
set_predicate_control_flag_value(struct brw_codegen *p,
|
||||
struct brw_sf_compile *c,
|
||||
unsigned value)
|
||||
{
|
||||
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
|
||||
|
||||
if (value != 0xff) {
|
||||
if (value != c->flag_value) {
|
||||
brw_MOV(p, brw_flag_reg(0, 0), brw_imm_uw(value));
|
||||
c->flag_value = value;
|
||||
}
|
||||
|
||||
brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL);
|
||||
}
|
||||
}
|
||||
|
||||
/* Emit the triangle setup program: per attribute pair, apply perspective
 * correction, compute dA/dx and dA/dy plane coefficients, and write
 * m1..m3 to the URB for the windower.
 */
static void brw_emit_tri_setup(struct brw_sf_compile *c, bool allocate)
{
   struct brw_codegen *p = &c->func;
   GLuint i;

   c->flag_value = 0xff;
   c->nr_verts = 3;

   if (allocate)
      alloc_regs(c);

   invert_det(c);
   copy_z_inv_w(c);

   if (c->key.do_twoside_color)
      do_twoside_color(c);

   if (c->key.contains_flat_varying)
      do_flatshade_triangle(c);


   for (i = 0; i < c->nr_setup_regs; i++)
   {
      /* Pair of incoming attributes:
       */
      struct brw_reg a0 = offset(c->vert[0], i);
      struct brw_reg a1 = offset(c->vert[1], i);
      struct brw_reg a2 = offset(c->vert[2], i);
      GLushort pc, pc_persp, pc_linear;
      bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);

      /* Perspective-correct the attributes by 1/w. */
      if (pc_persp)
      {
         set_predicate_control_flag_value(p, c, pc_persp);
         brw_MUL(p, a0, a0, c->inv_w[0]);
         brw_MUL(p, a1, a1, c->inv_w[1]);
         brw_MUL(p, a2, a2, c->inv_w[2]);
      }


      /* Calculate coefficients for interpolated values:
       */
      if (pc_linear)
      {
         set_predicate_control_flag_value(p, c, pc_linear);

         brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
         brw_ADD(p, c->a2_sub_a0, a2, negate(a0));

         /* calculate dA/dx
          */
         brw_MUL(p, brw_null_reg(), c->a1_sub_a0, c->dy2);
         brw_MAC(p, c->tmp, c->a2_sub_a0, negate(c->dy0));
         brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);

         /* calculate dA/dy
          */
         brw_MUL(p, brw_null_reg(), c->a2_sub_a0, c->dx0);
         brw_MAC(p, c->tmp, c->a1_sub_a0, negate(c->dx2));
         brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
      }

      {
         set_predicate_control_flag_value(p, c, pc);
         /* start point for interpolation
          */
         brw_MOV(p, c->m3C0, a0);

         /* Copy m0..m3 to URB. m0 is implicitly copied from r0 in
          * the send instruction:
          */
         brw_urb_WRITE(p,
                       brw_null_reg(),
                       0,
                       brw_vec8_grf(0, 0), /* r0, will be copied to m0 */
                       last ? BRW_URB_WRITE_EOT_COMPLETE
                       : BRW_URB_WRITE_NO_FLAGS,
                       4, /* msg len */
                       0, /* response len */
                       i*4, /* offset */
                       BRW_URB_SWIZZLE_TRANSPOSE); /* XXX: Swizzle control "SF to windower" */
      }
   }

   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
|
||||
|
||||
|
||||
|
||||
/* Emit the line setup program.  Same structure as brw_emit_tri_setup but
 * with two vertices and coefficients computed from the single edge delta.
 */
static void brw_emit_line_setup(struct brw_sf_compile *c, bool allocate)
{
   struct brw_codegen *p = &c->func;
   GLuint i;

   c->flag_value = 0xff;
   c->nr_verts = 2;

   if (allocate)
      alloc_regs(c);

   invert_det(c);
   copy_z_inv_w(c);

   if (c->key.contains_flat_varying)
      do_flatshade_line(c);

   for (i = 0; i < c->nr_setup_regs; i++)
   {
      /* Pair of incoming attributes:
       */
      struct brw_reg a0 = offset(c->vert[0], i);
      struct brw_reg a1 = offset(c->vert[1], i);
      GLushort pc, pc_persp, pc_linear;
      bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);

      /* Perspective-correct the attributes by 1/w. */
      if (pc_persp)
      {
         set_predicate_control_flag_value(p, c, pc_persp);
         brw_MUL(p, a0, a0, c->inv_w[0]);
         brw_MUL(p, a1, a1, c->inv_w[1]);
      }

      /* Calculate coefficients for position, color:
       */
      if (pc_linear) {
         set_predicate_control_flag_value(p, c, pc_linear);

         brw_ADD(p, c->a1_sub_a0, a1, negate(a0));

         brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0);
         brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);

         brw_MUL(p, c->tmp, c->a1_sub_a0, c->dy0);
         brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
      }

      {
         set_predicate_control_flag_value(p, c, pc);

         /* start point for interpolation
          */
         brw_MOV(p, c->m3C0, a0);

         /* Copy m0..m3 to URB.
          */
         brw_urb_WRITE(p,
                       brw_null_reg(),
                       0,
                       brw_vec8_grf(0, 0),
                       last ? BRW_URB_WRITE_EOT_COMPLETE
                       : BRW_URB_WRITE_NO_FLAGS,
                       4, /* msg len */
                       0, /* response len */
                       i*4, /* urb destination offset */
                       BRW_URB_SWIZZLE_TRANSPOSE);
      }
   }

   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
|
||||
|
||||
/* Emit the point-sprite setup program: attributes are constant across the
 * point except coordinate-replaced texcoords, which get (x, y, 0, 1)
 * varying across the sprite.
 */
static void brw_emit_point_sprite_setup(struct brw_sf_compile *c, bool allocate)
{
   struct brw_codegen *p = &c->func;
   GLuint i;

   c->flag_value = 0xff;
   c->nr_verts = 1;

   if (allocate)
      alloc_regs(c);

   copy_z_inv_w(c);
   for (i = 0; i < c->nr_setup_regs; i++)
   {
      struct brw_reg a0 = offset(c->vert[0], i);
      GLushort pc, pc_persp, pc_linear, pc_coord_replace;
      bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);

      pc_coord_replace = calculate_point_sprite_mask(c, i);
      /* Replaced channels must not be perspective-corrected. */
      pc_persp &= ~pc_coord_replace;

      if (pc_persp) {
         set_predicate_control_flag_value(p, c, pc_persp);
         brw_MUL(p, a0, a0, c->inv_w[0]);
      }

      /* Point sprite coordinate replacement: A texcoord with this
       * enabled gets replaced with the value (x, y, 0, 1) where x and
       * y vary from 0 to 1 across the horizontal and vertical of the
       * point.
       */
      if (pc_coord_replace) {
         set_predicate_control_flag_value(p, c, pc_coord_replace);
         /* Calculate 1.0/PointWidth */
         gfx4_math(&c->func,
                   c->tmp,
                   BRW_MATH_FUNCTION_INV,
                   0,
                   c->dx0,
                   BRW_MATH_PRECISION_FULL);

         /* Align16 lets the writemasked MOVs below address xyzw channels. */
         brw_set_default_access_mode(p, BRW_ALIGN_16);

         /* dA/dx, dA/dy */
         brw_MOV(p, c->m1Cx, brw_imm_f(0.0));
         brw_MOV(p, c->m2Cy, brw_imm_f(0.0));
         brw_MOV(p, brw_writemask(c->m1Cx, WRITEMASK_X), c->tmp);
         if (c->key.sprite_origin_lower_left) {
            /* Lower-left origin flips the y gradient and start value. */
            brw_MOV(p, brw_writemask(c->m2Cy, WRITEMASK_Y), negate(c->tmp));
         } else {
            brw_MOV(p, brw_writemask(c->m2Cy, WRITEMASK_Y), c->tmp);
         }

         /* attribute constant offset */
         brw_MOV(p, c->m3C0, brw_imm_f(0.0));
         if (c->key.sprite_origin_lower_left) {
            brw_MOV(p, brw_writemask(c->m3C0, WRITEMASK_YW), brw_imm_f(1.0));
         } else {
            brw_MOV(p, brw_writemask(c->m3C0, WRITEMASK_W), brw_imm_f(1.0));
         }

         brw_set_default_access_mode(p, BRW_ALIGN_1);
      }

      /* Non-replaced channels: zero gradients, constant start value. */
      if (pc & ~pc_coord_replace) {
         set_predicate_control_flag_value(p, c, pc & ~pc_coord_replace);
         brw_MOV(p, c->m1Cx, brw_imm_ud(0));
         brw_MOV(p, c->m2Cy, brw_imm_ud(0));
         brw_MOV(p, c->m3C0, a0); /* constant value */
      }


      set_predicate_control_flag_value(p, c, pc);
      /* Copy m0..m3 to URB. */
      brw_urb_WRITE(p,
                    brw_null_reg(),
                    0,
                    brw_vec8_grf(0, 0),
                    last ? BRW_URB_WRITE_EOT_COMPLETE
                    : BRW_URB_WRITE_NO_FLAGS,
                    4, /* msg len */
                    0, /* response len */
                    i*4, /* urb destination offset */
                    BRW_URB_SWIZZLE_TRANSPOSE);
   }

   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
|
||||
|
||||
/* Points setup - several simplifications as all attributes are
 * constant across the face of the point (point sprites excluded!)
 */
static void brw_emit_point_setup(struct brw_sf_compile *c, bool allocate)
{
   struct brw_codegen *p = &c->func;
   GLuint i;

   c->flag_value = 0xff;
   c->nr_verts = 1;

   if (allocate)
      alloc_regs(c);

   copy_z_inv_w(c);

   /* Gradients are always zero for points, so m1/m2 are loaded once. */
   brw_MOV(p, c->m1Cx, brw_imm_ud(0)); /* zero - move out of loop */
   brw_MOV(p, c->m2Cy, brw_imm_ud(0)); /* zero - move out of loop */

   for (i = 0; i < c->nr_setup_regs; i++)
   {
      struct brw_reg a0 = offset(c->vert[0], i);
      GLushort pc, pc_persp, pc_linear;
      bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);

      if (pc_persp)
      {
         /* This seems odd as the values are all constant, but the
          * fragment shader will be expecting it:
          */
         set_predicate_control_flag_value(p, c, pc_persp);
         brw_MUL(p, a0, a0, c->inv_w[0]);
      }


      /* The delta values are always zero, just send the starting
       * coordinate. Again, this is to fit in with the interpolation
       * code in the fragment shader.
       */
      {
         set_predicate_control_flag_value(p, c, pc);

         brw_MOV(p, c->m3C0, a0); /* constant value */

         /* Copy m0..m3 to URB.
          */
         brw_urb_WRITE(p,
                       brw_null_reg(),
                       0,
                       brw_vec8_grf(0, 0),
                       last ? BRW_URB_WRITE_EOT_COMPLETE
                       : BRW_URB_WRITE_NO_FLAGS,
                       4, /* msg len */
                       0, /* response len */
                       i*4, /* urb destination offset */
                       BRW_URB_SWIZZLE_TRANSPOSE);
      }
   }

   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
|
||||
|
||||
/* Emit a setup program that handles any primitive type, dispatching at
 * runtime on the primitive-type field in the thread payload: build a
 * one-hot mask from the type, AND it against each primitive class and
 * conditionally skip over that class's setup code.  Registers are
 * allocated once here, so the per-class emitters are called with
 * allocate=false.  Falls through to plain point setup.
 */
static void brw_emit_anyprim_setup( struct brw_sf_compile *c )
{
   struct brw_codegen *p = &c->func;
   struct brw_reg payload_prim = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0);
   struct brw_reg payload_attr = get_element_ud(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), 0);
   struct brw_reg primmask;
   int jmp;
   struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

   c->nr_verts = 3;
   alloc_regs(c);

   primmask = retype(get_element(c->tmp, 0), BRW_REGISTER_TYPE_UD);

   /* primmask = 1 << primitive_type */
   brw_MOV(p, primmask, brw_imm_ud(1));
   brw_SHL(p, primmask, primmask, payload_prim);

   /* Triangle-class primitives: skip tri setup if mask doesn't match. */
   brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_TRILIST) |
                                               (1<<_3DPRIM_TRISTRIP) |
                                               (1<<_3DPRIM_TRIFAN) |
                                               (1<<_3DPRIM_TRISTRIP_REVERSE) |
                                               (1<<_3DPRIM_POLYGON) |
                                               (1<<_3DPRIM_RECTLIST) |
                                               (1<<_3DPRIM_TRIFAN_NOSTIPPLE)));
   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
   jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
   brw_emit_tri_setup(c, false);
   brw_land_fwd_jump(p, jmp);

   /* Line-class primitives. */
   brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_LINELIST) |
                                               (1<<_3DPRIM_LINESTRIP) |
                                               (1<<_3DPRIM_LINELOOP) |
                                               (1<<_3DPRIM_LINESTRIP_CONT) |
                                               (1<<_3DPRIM_LINESTRIP_BF) |
                                               (1<<_3DPRIM_LINESTRIP_CONT_BF)));
   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
   jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
   brw_emit_line_setup(c, false);
   brw_land_fwd_jump(p, jmp);

   /* Sprite points, selected by the sprite-enable payload bit. */
   brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
   jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
   brw_emit_point_sprite_setup(c, false);
   brw_land_fwd_jump(p, jmp);

   brw_emit_point_setup( c, false );
}
|
||||
|
||||
const unsigned *
|
||||
brw_compile_sf(const struct brw_compiler *compiler,
|
||||
void *mem_ctx,
|
||||
const struct brw_sf_prog_key *key,
|
||||
struct brw_sf_prog_data *prog_data,
|
||||
struct intel_vue_map *vue_map,
|
||||
unsigned *final_assembly_size)
|
||||
{
|
||||
struct brw_sf_compile c;
|
||||
memset(&c, 0, sizeof(c));
|
||||
|
||||
/* Begin the compilation:
|
||||
*/
|
||||
brw_init_codegen(&compiler->isa, &c.func, mem_ctx);
|
||||
|
||||
c.key = *key;
|
||||
c.vue_map = *vue_map;
|
||||
if (c.key.do_point_coord) {
|
||||
/*
|
||||
* gl_PointCoord is a FS instead of VS builtin variable, thus it's
|
||||
* not included in c.vue_map generated in VS stage. Here we add
|
||||
* it manually to let SF shader generate the needed interpolation
|
||||
* coefficient for FS shader.
|
||||
*/
|
||||
c.vue_map.varying_to_slot[BRW_VARYING_SLOT_PNTC] = c.vue_map.num_slots;
|
||||
c.vue_map.slot_to_varying[c.vue_map.num_slots++] = BRW_VARYING_SLOT_PNTC;
|
||||
}
|
||||
c.urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
|
||||
c.nr_attr_regs = (c.vue_map.num_slots + 1)/2 - c.urb_entry_read_offset;
|
||||
c.nr_setup_regs = c.nr_attr_regs;
|
||||
|
||||
c.prog_data.urb_read_length = c.nr_attr_regs;
|
||||
c.prog_data.urb_entry_size = c.nr_setup_regs * 2;
|
||||
|
||||
/* Which primitive? Or all three?
|
||||
*/
|
||||
switch (key->primitive) {
|
||||
case BRW_SF_PRIM_TRIANGLES:
|
||||
c.nr_verts = 3;
|
||||
brw_emit_tri_setup( &c, true );
|
||||
break;
|
||||
case BRW_SF_PRIM_LINES:
|
||||
c.nr_verts = 2;
|
||||
brw_emit_line_setup( &c, true );
|
||||
break;
|
||||
case BRW_SF_PRIM_POINTS:
|
||||
c.nr_verts = 1;
|
||||
if (key->do_point_sprite)
|
||||
brw_emit_point_sprite_setup( &c, true );
|
||||
else
|
||||
brw_emit_point_setup( &c, true );
|
||||
break;
|
||||
case BRW_SF_PRIM_UNFILLED_TRIS:
|
||||
c.nr_verts = 3;
|
||||
brw_emit_anyprim_setup( &c );
|
||||
break;
|
||||
default:
|
||||
unreachable("not reached");
|
||||
}
|
||||
|
||||
/* FINISHME: SF programs use calculated jumps (i.e., JMPI with a register
|
||||
* source). Compacting would be difficult.
|
||||
*/
|
||||
/* brw_compact_instructions(&c.func, 0, 0, NULL); */
|
||||
|
||||
*prog_data = c.prog_data;
|
||||
|
||||
const unsigned *program = brw_get_program(&c.func, final_assembly_size);
|
||||
|
||||
if (INTEL_DEBUG(DEBUG_SF)) {
|
||||
fprintf(stderr, "sf:\n");
|
||||
brw_disassemble_with_labels(&compiler->isa,
|
||||
program, 0, *final_assembly_size, stderr);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
return program;
|
||||
}
|
||||
370
src/intel/compiler/elk/brw_compiler.c
Normal file
370
src/intel/compiler/elk/brw_compiler.c
Normal file
|
|
@ -0,0 +1,370 @@
|
|||
/*
|
||||
* Copyright © 2015-2016 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_compiler.h"
|
||||
#include "brw_shader.h"
|
||||
#include "brw_eu.h"
|
||||
#include "brw_nir.h"
|
||||
#include "dev/intel_debug.h"
|
||||
#include "compiler/nir/nir.h"
|
||||
#include "util/u_debug.h"
|
||||
|
||||
#define COMMON_OPTIONS \
|
||||
.has_uclz = true, \
|
||||
.lower_fdiv = true, \
|
||||
.lower_scmp = true, \
|
||||
.lower_flrp16 = true, \
|
||||
.lower_fmod = true, \
|
||||
.lower_ufind_msb = true, \
|
||||
.lower_uadd_carry = true, \
|
||||
.lower_usub_borrow = true, \
|
||||
.lower_flrp64 = true, \
|
||||
.lower_fisnormal = true, \
|
||||
.lower_isign = true, \
|
||||
.lower_ldexp = true, \
|
||||
.lower_bitfield_extract = true, \
|
||||
.lower_bitfield_insert = true, \
|
||||
.lower_device_index_to_zero = true, \
|
||||
.vectorize_io = true, \
|
||||
.vectorize_tess_levels = true, \
|
||||
.use_interpolated_input_intrinsics = true, \
|
||||
.lower_insert_byte = true, \
|
||||
.lower_insert_word = true, \
|
||||
.vertex_id_zero_based = true, \
|
||||
.lower_base_vertex = true, \
|
||||
.support_16bit_alu = true, \
|
||||
.lower_uniforms_to_ubo = true
|
||||
|
||||
#define COMMON_SCALAR_OPTIONS \
|
||||
.lower_to_scalar = true, \
|
||||
.lower_pack_half_2x16 = true, \
|
||||
.lower_pack_snorm_2x16 = true, \
|
||||
.lower_pack_snorm_4x8 = true, \
|
||||
.lower_pack_unorm_2x16 = true, \
|
||||
.lower_pack_unorm_4x8 = true, \
|
||||
.lower_unpack_half_2x16 = true, \
|
||||
.lower_unpack_snorm_2x16 = true, \
|
||||
.lower_unpack_snorm_4x8 = true, \
|
||||
.lower_unpack_unorm_2x16 = true, \
|
||||
.lower_unpack_unorm_4x8 = true, \
|
||||
.lower_hadd64 = true, \
|
||||
.avoid_ternary_with_two_constants = true, \
|
||||
.has_pack_32_4x8 = true, \
|
||||
.max_unroll_iterations = 32, \
|
||||
.force_indirect_unrolling = nir_var_function_temp, \
|
||||
.divergence_analysis_options = \
|
||||
(nir_divergence_single_patch_per_tcs_subgroup | \
|
||||
nir_divergence_single_patch_per_tes_subgroup | \
|
||||
nir_divergence_shader_record_ptr_uniform)
|
||||
|
||||
const struct nir_shader_compiler_options brw_scalar_nir_options = {
|
||||
COMMON_OPTIONS,
|
||||
COMMON_SCALAR_OPTIONS,
|
||||
};
|
||||
|
||||
const struct nir_shader_compiler_options brw_vector_nir_options = {
|
||||
COMMON_OPTIONS,
|
||||
|
||||
/* In the vec4 backend, our dpN instruction replicates its result to all the
|
||||
* components of a vec4. We would like NIR to give us replicated fdot
|
||||
* instructions because it can optimize better for us.
|
||||
*/
|
||||
.fdot_replicates = true,
|
||||
|
||||
.lower_usub_sat = true,
|
||||
.lower_pack_snorm_2x16 = true,
|
||||
.lower_pack_unorm_2x16 = true,
|
||||
.lower_unpack_snorm_2x16 = true,
|
||||
.lower_unpack_unorm_2x16 = true,
|
||||
.lower_extract_byte = true,
|
||||
.lower_extract_word = true,
|
||||
.intel_vec4 = true,
|
||||
.max_unroll_iterations = 32,
|
||||
};
|
||||
|
||||
struct brw_compiler *
|
||||
brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
|
||||
{
|
||||
struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);
|
||||
|
||||
compiler->devinfo = devinfo;
|
||||
|
||||
brw_init_isa_info(&compiler->isa, devinfo);
|
||||
|
||||
brw_fs_alloc_reg_sets(compiler);
|
||||
if (devinfo->ver < 8)
|
||||
brw_vec4_alloc_reg_set(compiler);
|
||||
|
||||
compiler->precise_trig = debug_get_bool_option("INTEL_PRECISE_TRIG", false);
|
||||
|
||||
compiler->use_tcs_multi_patch = devinfo->ver >= 12;
|
||||
|
||||
/* Default to the sampler since that's what we've done since forever */
|
||||
compiler->indirect_ubos_use_sampler = true;
|
||||
|
||||
compiler->lower_dpas = devinfo->verx10 < 125 ||
|
||||
intel_device_info_is_mtl(devinfo) ||
|
||||
(intel_device_info_is_arl(devinfo) &&
|
||||
devinfo->platform != INTEL_PLATFORM_ARL_H) ||
|
||||
debug_get_bool_option("INTEL_LOWER_DPAS", false);
|
||||
|
||||
/* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. */
|
||||
for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) {
|
||||
compiler->scalar_stage[i] = devinfo->ver >= 8 ||
|
||||
i == MESA_SHADER_FRAGMENT || i == MESA_SHADER_COMPUTE;
|
||||
}
|
||||
|
||||
for (int i = MESA_SHADER_TASK; i < MESA_VULKAN_SHADER_STAGES; i++)
|
||||
compiler->scalar_stage[i] = true;
|
||||
|
||||
nir_lower_int64_options int64_options =
|
||||
nir_lower_imul64 |
|
||||
nir_lower_isign64 |
|
||||
nir_lower_divmod64 |
|
||||
nir_lower_imul_high64 |
|
||||
nir_lower_find_lsb64 |
|
||||
nir_lower_ufind_msb64 |
|
||||
nir_lower_bit_count64;
|
||||
nir_lower_doubles_options fp64_options =
|
||||
nir_lower_drcp |
|
||||
nir_lower_dsqrt |
|
||||
nir_lower_drsq |
|
||||
nir_lower_dtrunc |
|
||||
nir_lower_dfloor |
|
||||
nir_lower_dceil |
|
||||
nir_lower_dfract |
|
||||
nir_lower_dround_even |
|
||||
nir_lower_dmod |
|
||||
nir_lower_dsub |
|
||||
nir_lower_ddiv;
|
||||
|
||||
if (!devinfo->has_64bit_float || INTEL_DEBUG(DEBUG_SOFT64))
|
||||
fp64_options |= nir_lower_fp64_full_software;
|
||||
if (!devinfo->has_64bit_int)
|
||||
int64_options |= (nir_lower_int64_options)~0;
|
||||
|
||||
/* The Bspec's section titled "Instruction_multiply[DevBDW+]" claims that
|
||||
* destination type can be Quadword and source type Doubleword for Gfx8 and
|
||||
* Gfx9. So, lower 64 bit multiply instruction on rest of the platforms.
|
||||
*/
|
||||
if (devinfo->ver < 8 || devinfo->ver > 9)
|
||||
int64_options |= nir_lower_imul_2x32_64;
|
||||
|
||||
/* We want the GLSL compiler to emit code that uses condition codes */
|
||||
for (int i = 0; i < MESA_ALL_SHADER_STAGES; i++) {
|
||||
struct nir_shader_compiler_options *nir_options =
|
||||
rzalloc(compiler, struct nir_shader_compiler_options);
|
||||
bool is_scalar = compiler->scalar_stage[i];
|
||||
if (is_scalar) {
|
||||
*nir_options = brw_scalar_nir_options;
|
||||
int64_options |= nir_lower_usub_sat64;
|
||||
} else {
|
||||
*nir_options = brw_vector_nir_options;
|
||||
}
|
||||
|
||||
/* Prior to Gfx6, there are no three source operations, and Gfx11 loses
|
||||
* LRP.
|
||||
*/
|
||||
nir_options->lower_ffma16 = devinfo->ver < 6;
|
||||
nir_options->lower_ffma32 = devinfo->ver < 6;
|
||||
nir_options->lower_ffma64 = devinfo->ver < 6;
|
||||
nir_options->lower_flrp32 = devinfo->ver < 6 || devinfo->ver >= 11;
|
||||
nir_options->lower_fpow = devinfo->ver >= 12;
|
||||
|
||||
nir_options->has_bfe = devinfo->ver >= 7;
|
||||
nir_options->has_bfm = devinfo->ver >= 7;
|
||||
nir_options->has_bfi = devinfo->ver >= 7;
|
||||
|
||||
nir_options->has_rotate16 = devinfo->ver >= 11;
|
||||
nir_options->has_rotate32 = devinfo->ver >= 11;
|
||||
nir_options->lower_bitfield_reverse = devinfo->ver < 7;
|
||||
nir_options->lower_find_lsb = devinfo->ver < 7;
|
||||
nir_options->lower_ifind_msb = devinfo->ver < 7;
|
||||
nir_options->has_iadd3 = devinfo->verx10 >= 125;
|
||||
|
||||
nir_options->has_sdot_4x8 = devinfo->ver >= 12;
|
||||
nir_options->has_udot_4x8 = devinfo->ver >= 12;
|
||||
nir_options->has_sudot_4x8 = devinfo->ver >= 12;
|
||||
nir_options->has_sdot_4x8_sat = devinfo->ver >= 12;
|
||||
nir_options->has_udot_4x8_sat = devinfo->ver >= 12;
|
||||
nir_options->has_sudot_4x8_sat = devinfo->ver >= 12;
|
||||
|
||||
nir_options->lower_int64_options = int64_options;
|
||||
nir_options->lower_doubles_options = fp64_options;
|
||||
|
||||
nir_options->unify_interfaces = i < MESA_SHADER_FRAGMENT;
|
||||
|
||||
nir_options->force_indirect_unrolling |=
|
||||
brw_nir_no_indirect_mask(compiler, i);
|
||||
nir_options->force_indirect_unrolling_sampler = devinfo->ver < 7;
|
||||
|
||||
if (compiler->use_tcs_multi_patch) {
|
||||
/* TCS MULTI_PATCH mode has multiple patches per subgroup */
|
||||
nir_options->divergence_analysis_options &=
|
||||
~nir_divergence_single_patch_per_tcs_subgroup;
|
||||
}
|
||||
|
||||
if (devinfo->ver < 12)
|
||||
nir_options->divergence_analysis_options |=
|
||||
nir_divergence_single_prim_per_subgroup;
|
||||
|
||||
compiler->nir_options[i] = nir_options;
|
||||
}
|
||||
|
||||
compiler->mesh.mue_header_packing =
|
||||
(unsigned)debug_get_num_option("INTEL_MESH_HEADER_PACKING", 3);
|
||||
compiler->mesh.mue_compaction =
|
||||
debug_get_bool_option("INTEL_MESH_COMPACTION", true);
|
||||
|
||||
return compiler;
|
||||
}
|
||||
|
||||
static void
|
||||
insert_u64_bit(uint64_t *val, bool add)
|
||||
{
|
||||
*val = (*val << 1) | !!add;
|
||||
}
|
||||
|
||||
uint64_t
|
||||
brw_get_compiler_config_value(const struct brw_compiler *compiler)
|
||||
{
|
||||
uint64_t config = 0;
|
||||
unsigned bits = 0;
|
||||
|
||||
insert_u64_bit(&config, compiler->precise_trig);
|
||||
bits++;
|
||||
insert_u64_bit(&config, compiler->lower_dpas);
|
||||
bits++;
|
||||
insert_u64_bit(&config, compiler->mesh.mue_compaction);
|
||||
bits++;
|
||||
|
||||
uint64_t mask = DEBUG_DISK_CACHE_MASK;
|
||||
bits += util_bitcount64(mask);
|
||||
|
||||
u_foreach_bit64(bit, mask)
|
||||
insert_u64_bit(&config, INTEL_DEBUG(1ULL << bit));
|
||||
|
||||
mask = SIMD_DISK_CACHE_MASK;
|
||||
bits += util_bitcount64(mask);
|
||||
|
||||
u_foreach_bit64(bit, mask)
|
||||
insert_u64_bit(&config, (intel_simd & (1ULL << bit)) != 0);
|
||||
|
||||
mask = 3;
|
||||
bits += util_bitcount64(mask);
|
||||
|
||||
u_foreach_bit64(bit, mask)
|
||||
insert_u64_bit(&config, (compiler->mesh.mue_header_packing & (1ULL << bit)) != 0);
|
||||
|
||||
assert(bits <= util_bitcount64(UINT64_MAX));
|
||||
|
||||
return config;
|
||||
}
|
||||
|
||||
void
|
||||
brw_device_sha1(char *hex,
|
||||
const struct intel_device_info *devinfo) {
|
||||
struct mesa_sha1 ctx;
|
||||
_mesa_sha1_init(&ctx);
|
||||
brw_device_sha1_update(&ctx, devinfo);
|
||||
unsigned char result[20];
|
||||
_mesa_sha1_final(&ctx, result);
|
||||
_mesa_sha1_format(hex, result);
|
||||
}
|
||||
|
||||
unsigned
|
||||
brw_prog_data_size(gl_shader_stage stage)
|
||||
{
|
||||
static const size_t stage_sizes[] = {
|
||||
[MESA_SHADER_VERTEX] = sizeof(struct brw_vs_prog_data),
|
||||
[MESA_SHADER_TESS_CTRL] = sizeof(struct brw_tcs_prog_data),
|
||||
[MESA_SHADER_TESS_EVAL] = sizeof(struct brw_tes_prog_data),
|
||||
[MESA_SHADER_GEOMETRY] = sizeof(struct brw_gs_prog_data),
|
||||
[MESA_SHADER_FRAGMENT] = sizeof(struct brw_wm_prog_data),
|
||||
[MESA_SHADER_COMPUTE] = sizeof(struct brw_cs_prog_data),
|
||||
[MESA_SHADER_TASK] = sizeof(struct brw_task_prog_data),
|
||||
[MESA_SHADER_MESH] = sizeof(struct brw_mesh_prog_data),
|
||||
[MESA_SHADER_RAYGEN] = sizeof(struct brw_bs_prog_data),
|
||||
[MESA_SHADER_ANY_HIT] = sizeof(struct brw_bs_prog_data),
|
||||
[MESA_SHADER_CLOSEST_HIT] = sizeof(struct brw_bs_prog_data),
|
||||
[MESA_SHADER_MISS] = sizeof(struct brw_bs_prog_data),
|
||||
[MESA_SHADER_INTERSECTION] = sizeof(struct brw_bs_prog_data),
|
||||
[MESA_SHADER_CALLABLE] = sizeof(struct brw_bs_prog_data),
|
||||
[MESA_SHADER_KERNEL] = sizeof(struct brw_cs_prog_data),
|
||||
};
|
||||
assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_sizes));
|
||||
return stage_sizes[stage];
|
||||
}
|
||||
|
||||
unsigned
|
||||
brw_prog_key_size(gl_shader_stage stage)
|
||||
{
|
||||
static const size_t stage_sizes[] = {
|
||||
[MESA_SHADER_VERTEX] = sizeof(struct brw_vs_prog_key),
|
||||
[MESA_SHADER_TESS_CTRL] = sizeof(struct brw_tcs_prog_key),
|
||||
[MESA_SHADER_TESS_EVAL] = sizeof(struct brw_tes_prog_key),
|
||||
[MESA_SHADER_GEOMETRY] = sizeof(struct brw_gs_prog_key),
|
||||
[MESA_SHADER_FRAGMENT] = sizeof(struct brw_wm_prog_key),
|
||||
[MESA_SHADER_COMPUTE] = sizeof(struct brw_cs_prog_key),
|
||||
[MESA_SHADER_TASK] = sizeof(struct brw_task_prog_key),
|
||||
[MESA_SHADER_MESH] = sizeof(struct brw_mesh_prog_key),
|
||||
[MESA_SHADER_RAYGEN] = sizeof(struct brw_bs_prog_key),
|
||||
[MESA_SHADER_ANY_HIT] = sizeof(struct brw_bs_prog_key),
|
||||
[MESA_SHADER_CLOSEST_HIT] = sizeof(struct brw_bs_prog_key),
|
||||
[MESA_SHADER_MISS] = sizeof(struct brw_bs_prog_key),
|
||||
[MESA_SHADER_INTERSECTION] = sizeof(struct brw_bs_prog_key),
|
||||
[MESA_SHADER_CALLABLE] = sizeof(struct brw_bs_prog_key),
|
||||
[MESA_SHADER_KERNEL] = sizeof(struct brw_cs_prog_key),
|
||||
};
|
||||
assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_sizes));
|
||||
return stage_sizes[stage];
|
||||
}
|
||||
|
||||
void
|
||||
brw_write_shader_relocs(const struct brw_isa_info *isa,
|
||||
void *program,
|
||||
const struct brw_stage_prog_data *prog_data,
|
||||
struct brw_shader_reloc_value *values,
|
||||
unsigned num_values)
|
||||
{
|
||||
for (unsigned i = 0; i < prog_data->num_relocs; i++) {
|
||||
assert(prog_data->relocs[i].offset % 8 == 0);
|
||||
void *dst = program + prog_data->relocs[i].offset;
|
||||
for (unsigned j = 0; j < num_values; j++) {
|
||||
if (prog_data->relocs[i].id == values[j].id) {
|
||||
uint32_t value = values[j].value + prog_data->relocs[i].delta;
|
||||
switch (prog_data->relocs[i].type) {
|
||||
case BRW_SHADER_RELOC_TYPE_U32:
|
||||
*(uint32_t *)dst = value;
|
||||
break;
|
||||
case BRW_SHADER_RELOC_TYPE_MOV_IMM:
|
||||
brw_update_reloc_imm(isa, dst, value);
|
||||
break;
|
||||
default:
|
||||
unreachable("Invalid relocation type");
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
2131
src/intel/compiler/elk/brw_compiler.h
Normal file
2131
src/intel/compiler/elk/brw_compiler.h
Normal file
File diff suppressed because it is too large
Load diff
121
src/intel/compiler/elk/brw_dead_control_flow.cpp
Normal file
121
src/intel/compiler/elk/brw_dead_control_flow.cpp
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
/*
|
||||
* Copyright © 2013 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/** @file brw_dead_control_flow.cpp
|
||||
*
|
||||
* This file implements the dead control flow elimination optimization pass.
|
||||
*/
|
||||
|
||||
#include "brw_shader.h"
|
||||
#include "brw_cfg.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
/* Look for and eliminate dead control flow:
|
||||
*
|
||||
* - if/endif
|
||||
* - else in else/endif
|
||||
* - then in if/else/endif
|
||||
*/
|
||||
bool
|
||||
dead_control_flow_eliminate(backend_shader *s)
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
foreach_block_safe (block, s->cfg) {
|
||||
bblock_t *prev_block = block->prev();
|
||||
|
||||
if (!prev_block)
|
||||
continue;
|
||||
|
||||
backend_instruction *const inst = block->start();
|
||||
backend_instruction *const prev_inst = prev_block->end();
|
||||
|
||||
/* ENDIF instructions, by definition, can only be found at the start of
|
||||
* basic blocks.
|
||||
*/
|
||||
if (inst->opcode == BRW_OPCODE_ENDIF &&
|
||||
prev_inst->opcode == BRW_OPCODE_ELSE) {
|
||||
bblock_t *const else_block = prev_block;
|
||||
backend_instruction *const else_inst = prev_inst;
|
||||
|
||||
else_inst->remove(else_block);
|
||||
progress = true;
|
||||
} else if (inst->opcode == BRW_OPCODE_ENDIF &&
|
||||
prev_inst->opcode == BRW_OPCODE_IF) {
|
||||
bblock_t *const endif_block = block;
|
||||
bblock_t *const if_block = prev_block;
|
||||
backend_instruction *const endif_inst = inst;
|
||||
backend_instruction *const if_inst = prev_inst;
|
||||
|
||||
bblock_t *earlier_block = NULL, *later_block = NULL;
|
||||
|
||||
if (if_block->start_ip == if_block->end_ip) {
|
||||
earlier_block = if_block->prev();
|
||||
} else {
|
||||
earlier_block = if_block;
|
||||
}
|
||||
if_inst->remove(if_block);
|
||||
|
||||
if (endif_block->start_ip == endif_block->end_ip) {
|
||||
later_block = endif_block->next();
|
||||
} else {
|
||||
later_block = endif_block;
|
||||
}
|
||||
endif_inst->remove(endif_block);
|
||||
|
||||
assert((earlier_block == NULL) == (later_block == NULL));
|
||||
if (earlier_block && earlier_block->can_combine_with(later_block)) {
|
||||
earlier_block->combine_with(later_block);
|
||||
|
||||
/* If ENDIF was in its own block, then we've now deleted it and
|
||||
* merged the two surrounding blocks, the latter of which the
|
||||
* __next block pointer was pointing to.
|
||||
*/
|
||||
if (endif_block != later_block) {
|
||||
__next = earlier_block->next();
|
||||
}
|
||||
}
|
||||
|
||||
progress = true;
|
||||
} else if (inst->opcode == BRW_OPCODE_ELSE &&
|
||||
prev_inst->opcode == BRW_OPCODE_IF) {
|
||||
bblock_t *const else_block = block;
|
||||
backend_instruction *const if_inst = prev_inst;
|
||||
backend_instruction *const else_inst = inst;
|
||||
|
||||
/* Since the else-branch is becoming the new then-branch, the
|
||||
* condition has to be inverted.
|
||||
*/
|
||||
if_inst->predicate_inverse = !if_inst->predicate_inverse;
|
||||
else_inst->remove(else_block);
|
||||
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (progress)
|
||||
s->invalidate_analysis(DEPENDENCY_BLOCKS | DEPENDENCY_INSTRUCTIONS);
|
||||
|
||||
return progress;
|
||||
}
|
||||
31
src/intel/compiler/elk/brw_dead_control_flow.h
Normal file
31
src/intel/compiler/elk/brw_dead_control_flow.h
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
/*
|
||||
* Copyright © 2013 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_DEAD_CONTROL_FLOW_H
|
||||
#define BRW_DEAD_CONTROL_FLOW_H
|
||||
|
||||
#include "brw_shader.h"
|
||||
|
||||
bool dead_control_flow_eliminate(backend_shader *s);
|
||||
|
||||
#endif /* BRW_DEAD_CONTROL_FLOW_H */
|
||||
238
src/intel/compiler/elk/brw_debug_recompile.c
Normal file
238
src/intel/compiler/elk/brw_debug_recompile.c
Normal file
|
|
@ -0,0 +1,238 @@
|
|||
/*
|
||||
* Copyright © 2019 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file brw_debug_recompiles.c
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "brw_compiler.h"
|
||||
|
||||
static bool
|
||||
key_debug(const struct brw_compiler *c, void *log,
|
||||
const char *name, int a, int b)
|
||||
{
|
||||
if (a != b) {
|
||||
brw_shader_perf_log(c, log, " %s %d->%d\n", name, a, b);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool
|
||||
key_debug_float(const struct brw_compiler *c, void *log,
|
||||
const char *name, float a, float b)
|
||||
{
|
||||
if (a != b) {
|
||||
brw_shader_perf_log(c, log, " %s %f->%f\n", name, a, b);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
#define check(name, field) \
|
||||
key_debug(c, log, name, old_key->field, key->field)
|
||||
#define check_float(name, field) \
|
||||
key_debug_float(c, log, name, old_key->field, key->field)
|
||||
|
||||
static bool
|
||||
debug_sampler_recompile(const struct brw_compiler *c, void *log,
|
||||
const struct brw_sampler_prog_key_data *old_key,
|
||||
const struct brw_sampler_prog_key_data *key)
|
||||
{
|
||||
bool found = false;
|
||||
|
||||
found |= check("gather channel quirk", gather_channel_quirk_mask);
|
||||
|
||||
for (unsigned i = 0; i < BRW_MAX_SAMPLERS; i++) {
|
||||
found |= check("EXT_texture_swizzle or DEPTH_TEXTURE_MODE", swizzles[i]);
|
||||
found |= check("textureGather workarounds", gfx6_gather_wa[i]);
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
found |= check("GL_CLAMP enabled on any texture unit", gl_clamp_mask[i]);
|
||||
}
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
static bool
|
||||
debug_base_recompile(const struct brw_compiler *c, void *log,
|
||||
const struct brw_base_prog_key *old_key,
|
||||
const struct brw_base_prog_key *key)
|
||||
{
|
||||
return debug_sampler_recompile(c, log, &old_key->tex, &key->tex);
|
||||
}
|
||||
|
||||
static void
|
||||
debug_vs_recompile(const struct brw_compiler *c, void *log,
|
||||
const struct brw_vs_prog_key *old_key,
|
||||
const struct brw_vs_prog_key *key)
|
||||
{
|
||||
bool found = debug_base_recompile(c, log, &old_key->base, &key->base);
|
||||
|
||||
for (unsigned i = 0; i < VERT_ATTRIB_MAX; i++) {
|
||||
found |= check("vertex attrib w/a flags", gl_attrib_wa_flags[i]);
|
||||
}
|
||||
|
||||
found |= check("legacy user clipping", nr_userclip_plane_consts);
|
||||
found |= check("copy edgeflag", copy_edgeflag);
|
||||
found |= check("pointcoord replace", point_coord_replace);
|
||||
found |= check("vertex color clamping", clamp_vertex_color);
|
||||
|
||||
if (!found) {
|
||||
brw_shader_perf_log(c, log, " something else\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
debug_tcs_recompile(const struct brw_compiler *c, void *log,
|
||||
const struct brw_tcs_prog_key *old_key,
|
||||
const struct brw_tcs_prog_key *key)
|
||||
{
|
||||
bool found = debug_base_recompile(c, log, &old_key->base, &key->base);
|
||||
|
||||
found |= check("input vertices", input_vertices);
|
||||
found |= check("outputs written", outputs_written);
|
||||
found |= check("patch outputs written", patch_outputs_written);
|
||||
found |= check("tes primitive mode", _tes_primitive_mode);
|
||||
found |= check("quads and equal_spacing workaround", quads_workaround);
|
||||
|
||||
if (!found) {
|
||||
brw_shader_perf_log(c, log, " something else\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
debug_tes_recompile(const struct brw_compiler *c, void *log,
|
||||
const struct brw_tes_prog_key *old_key,
|
||||
const struct brw_tes_prog_key *key)
|
||||
{
|
||||
bool found = debug_base_recompile(c, log, &old_key->base, &key->base);
|
||||
|
||||
found |= check("inputs read", inputs_read);
|
||||
found |= check("patch inputs read", patch_inputs_read);
|
||||
|
||||
if (!found) {
|
||||
brw_shader_perf_log(c, log, " something else\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
debug_gs_recompile(const struct brw_compiler *c, void *log,
|
||||
const struct brw_gs_prog_key *old_key,
|
||||
const struct brw_gs_prog_key *key)
|
||||
{
|
||||
bool found = debug_base_recompile(c, log, &old_key->base, &key->base);
|
||||
|
||||
if (!found) {
|
||||
brw_shader_perf_log(c, log, " something else\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
debug_fs_recompile(const struct brw_compiler *c, void *log,
|
||||
const struct brw_wm_prog_key *old_key,
|
||||
const struct brw_wm_prog_key *key)
|
||||
{
|
||||
bool found = false;
|
||||
|
||||
found |= check("alphatest, computed depth, depth test, or depth write",
|
||||
iz_lookup);
|
||||
found |= check("depth statistics", stats_wm);
|
||||
found |= check("flat shading", flat_shade);
|
||||
found |= check("number of color buffers", nr_color_regions);
|
||||
found |= check("MRT alpha test", alpha_test_replicate_alpha);
|
||||
found |= check("alpha to coverage", alpha_to_coverage);
|
||||
found |= check("fragment color clamping", clamp_fragment_color);
|
||||
found |= check("per-sample interpolation", persample_interp);
|
||||
found |= check("multisampled FBO", multisample_fbo);
|
||||
found |= check("line smoothing", line_aa);
|
||||
found |= check("force dual color blending", force_dual_color_blend);
|
||||
found |= check("coherent fb fetch", coherent_fb_fetch);
|
||||
found |= check("ignore sample mask out", ignore_sample_mask_out);
|
||||
found |= check("coarse pixel", coarse_pixel);
|
||||
|
||||
found |= check("input slots valid", input_slots_valid);
|
||||
found |= check("mrt alpha test function", alpha_test_func);
|
||||
found |= check("mrt alpha test reference value", alpha_test_ref);
|
||||
|
||||
found |= debug_base_recompile(c, log, &old_key->base, &key->base);
|
||||
|
||||
if (!found) {
|
||||
brw_shader_perf_log(c, log, " something else\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
debug_cs_recompile(const struct brw_compiler *c, void *log,
|
||||
const struct brw_cs_prog_key *old_key,
|
||||
const struct brw_cs_prog_key *key)
|
||||
{
|
||||
bool found = debug_base_recompile(c, log, &old_key->base, &key->base);
|
||||
|
||||
if (!found) {
|
||||
brw_shader_perf_log(c, log, " something else\n");
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
brw_debug_key_recompile(const struct brw_compiler *c, void *log,
|
||||
gl_shader_stage stage,
|
||||
const struct brw_base_prog_key *old_key,
|
||||
const struct brw_base_prog_key *key)
|
||||
{
|
||||
if (!old_key) {
|
||||
brw_shader_perf_log(c, log, " No previous compile found...\n");
|
||||
return;
|
||||
}
|
||||
|
||||
switch (stage) {
|
||||
case MESA_SHADER_VERTEX:
|
||||
debug_vs_recompile(c, log, (const struct brw_vs_prog_key *)old_key,
|
||||
(const struct brw_vs_prog_key *)key);
|
||||
break;
|
||||
case MESA_SHADER_TESS_CTRL:
|
||||
debug_tcs_recompile(c, log, (const struct brw_tcs_prog_key *)old_key,
|
||||
(const struct brw_tcs_prog_key *)key);
|
||||
break;
|
||||
case MESA_SHADER_TESS_EVAL:
|
||||
debug_tes_recompile(c, log, (const struct brw_tes_prog_key *)old_key,
|
||||
(const struct brw_tes_prog_key *)key);
|
||||
break;
|
||||
case MESA_SHADER_GEOMETRY:
|
||||
debug_gs_recompile(c, log, (const struct brw_gs_prog_key *)old_key,
|
||||
(const struct brw_gs_prog_key *)key);
|
||||
break;
|
||||
case MESA_SHADER_FRAGMENT:
|
||||
debug_fs_recompile(c, log, (const struct brw_wm_prog_key *)old_key,
|
||||
(const struct brw_wm_prog_key *)key);
|
||||
break;
|
||||
case MESA_SHADER_COMPUTE:
|
||||
debug_cs_recompile(c, log, (const struct brw_cs_prog_key *)old_key,
|
||||
(const struct brw_cs_prog_key *)key);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
74
src/intel/compiler/elk/brw_device_sha1_gen_c.py
Executable file
74
src/intel/compiler/elk/brw_device_sha1_gen_c.py
Executable file
|
|
@ -0,0 +1,74 @@
|
|||
#!/usr/bin/env python3
|
||||
COPYRIGHT = """\
|
||||
/*
|
||||
* Copyright 2024 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sub license, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the
|
||||
* next paragraph) shall be included in all copies or substantial portions
|
||||
* of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
|
||||
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
|
||||
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
from mako.template import Template
|
||||
from mako import exceptions
|
||||
|
||||
sys.path.append(f"{os.path.dirname(sys.argv[0])}/../dev")
|
||||
import intel_device_info
|
||||
|
||||
template = COPYRIGHT + """
|
||||
|
||||
/* DO NOT EDIT - This file generated automatically by intel_device_serialize_c.py script */
|
||||
|
||||
#include "dev/intel_device_info.h"
|
||||
#include "brw_compiler.h"
|
||||
#define SHA_UPDATE_FIELD(field) _mesa_sha1_update(ctx, &devinfo->field, sizeof(devinfo->field))
|
||||
|
||||
void
|
||||
brw_device_sha1_update(struct mesa_sha1 *ctx,
|
||||
const struct intel_device_info *devinfo) {
|
||||
% for member in compiler_fields:
|
||||
SHA_UPDATE_FIELD(${member.name});
|
||||
% endfor
|
||||
}
|
||||
|
||||
#undef SHA_UPDATE_FIELD
|
||||
|
||||
"""
|
||||
|
||||
def main():
|
||||
"""print intel_device_serialize.c at the specified path"""
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--outdir', required=True,
|
||||
help='Directory to put the generated files in')
|
||||
args = parser.parse_args()
|
||||
path = os.path.join(args.outdir, 'brw_device_sha1_gen.c')
|
||||
device_members = intel_device_info.TYPES_BY_NAME["intel_device_info"].members
|
||||
compiler_fields = [field for field in device_members if field.compiler_field]
|
||||
with open(path, 'w', encoding='utf-8') as f:
|
||||
try:
|
||||
f.write(Template(template).render(compiler_fields=compiler_fields))
|
||||
except:
|
||||
print(exceptions.text_error_template().render(compiler_fields=compiler_fields))
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
2887
src/intel/compiler/elk/brw_disasm.c
Normal file
2887
src/intel/compiler/elk/brw_disasm.c
Normal file
File diff suppressed because it is too large
Load diff
42
src/intel/compiler/elk/brw_disasm.h
Normal file
42
src/intel/compiler/elk/brw_disasm.h
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
/*
|
||||
* Copyright 2024 Intel Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#ifndef BRW_DISASM_H
|
||||
#define BRW_DISASM_H
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct brw_isa_info;
|
||||
struct brw_inst;
|
||||
|
||||
const struct brw_label *brw_find_label(const struct brw_label *root, int offset);
|
||||
void brw_create_label(struct brw_label **labels, int offset, void *mem_ctx);
|
||||
int brw_disassemble_inst(FILE *file, const struct brw_isa_info *isa,
|
||||
const struct brw_inst *inst, bool is_compacted,
|
||||
int offset, const struct brw_label *root_label);
|
||||
const struct
|
||||
brw_label *brw_label_assembly(const struct brw_isa_info *isa,
|
||||
const void *assembly, int start, int end,
|
||||
void *mem_ctx);
|
||||
void brw_disassemble_with_labels(const struct brw_isa_info *isa,
|
||||
const void *assembly, int start, int end, FILE *out);
|
||||
void brw_disassemble(const struct brw_isa_info *isa,
|
||||
const void *assembly, int start, int end,
|
||||
const struct brw_label *root_label, FILE *out);
|
||||
int brw_disassemble_find_end(const struct brw_isa_info *isa,
|
||||
const void *assembly, int start);
|
||||
void brw_disassemble_with_errors(const struct brw_isa_info *isa,
|
||||
const void *assembly, int start, FILE *out);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* BRW_DISASM_H */
|
||||
207
src/intel/compiler/elk/brw_disasm_info.c
Normal file
207
src/intel/compiler/elk/brw_disasm_info.c
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
/*
|
||||
* Copyright © 2014 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_cfg.h"
|
||||
#include "brw_eu.h"
|
||||
#include "brw_disasm.h"
|
||||
#include "brw_disasm_info.h"
|
||||
#include "dev/intel_debug.h"
|
||||
#include "compiler/nir/nir.h"
|
||||
|
||||
__attribute__((weak)) void nir_print_instr(UNUSED const nir_instr *instr,
|
||||
UNUSED FILE *fp) {}
|
||||
|
||||
void
|
||||
dump_assembly(void *assembly, int start_offset, int end_offset,
|
||||
struct disasm_info *disasm, const unsigned *block_latency)
|
||||
{
|
||||
const struct brw_isa_info *isa = disasm->isa;
|
||||
const char *last_annotation_string = NULL;
|
||||
const void *last_annotation_ir = NULL;
|
||||
|
||||
void *mem_ctx = ralloc_context(NULL);
|
||||
const struct brw_label *root_label =
|
||||
brw_label_assembly(isa, assembly, start_offset, end_offset, mem_ctx);
|
||||
|
||||
foreach_list_typed(struct inst_group, group, link, &disasm->group_list) {
|
||||
struct exec_node *next_node = exec_node_get_next(&group->link);
|
||||
if (exec_node_is_tail_sentinel(next_node))
|
||||
break;
|
||||
|
||||
struct inst_group *next =
|
||||
exec_node_data(struct inst_group, next_node, link);
|
||||
|
||||
int start_offset = group->offset;
|
||||
int end_offset = next->offset;
|
||||
|
||||
if (group->block_start) {
|
||||
fprintf(stderr, " START B%d", group->block_start->num);
|
||||
foreach_list_typed(struct bblock_link, predecessor_link, link,
|
||||
&group->block_start->parents) {
|
||||
struct bblock_t *predecessor_block = predecessor_link->block;
|
||||
fprintf(stderr, " <-B%d", predecessor_block->num);
|
||||
}
|
||||
if (block_latency)
|
||||
fprintf(stderr, " (%u cycles)",
|
||||
block_latency[group->block_start->num]);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
if (last_annotation_ir != group->ir) {
|
||||
last_annotation_ir = group->ir;
|
||||
if (last_annotation_ir) {
|
||||
fprintf(stderr, " ");
|
||||
nir_print_instr(group->ir, stderr);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
if (last_annotation_string != group->annotation) {
|
||||
last_annotation_string = group->annotation;
|
||||
if (last_annotation_string)
|
||||
fprintf(stderr, " %s\n", last_annotation_string);
|
||||
}
|
||||
|
||||
brw_disassemble(isa, assembly, start_offset, end_offset,
|
||||
root_label, stderr);
|
||||
|
||||
if (group->error) {
|
||||
fputs(group->error, stderr);
|
||||
}
|
||||
|
||||
if (group->block_end) {
|
||||
fprintf(stderr, " END B%d", group->block_end->num);
|
||||
foreach_list_typed(struct bblock_link, successor_link, link,
|
||||
&group->block_end->children) {
|
||||
struct bblock_t *successor_block = successor_link->block;
|
||||
fprintf(stderr, " ->B%d", successor_block->num);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
ralloc_free(mem_ctx);
|
||||
}
|
||||
|
||||
struct disasm_info *
|
||||
disasm_initialize(const struct brw_isa_info *isa,
|
||||
const struct cfg_t *cfg)
|
||||
{
|
||||
struct disasm_info *disasm = ralloc(NULL, struct disasm_info);
|
||||
exec_list_make_empty(&disasm->group_list);
|
||||
disasm->isa = isa;
|
||||
disasm->cfg = cfg;
|
||||
disasm->cur_block = 0;
|
||||
disasm->use_tail = false;
|
||||
return disasm;
|
||||
}
|
||||
|
||||
struct inst_group *
|
||||
disasm_new_inst_group(struct disasm_info *disasm, unsigned next_inst_offset)
|
||||
{
|
||||
struct inst_group *tail = rzalloc(disasm, struct inst_group);
|
||||
tail->offset = next_inst_offset;
|
||||
exec_list_push_tail(&disasm->group_list, &tail->link);
|
||||
return tail;
|
||||
}
|
||||
|
||||
void
|
||||
disasm_annotate(struct disasm_info *disasm,
|
||||
struct backend_instruction *inst, unsigned offset)
|
||||
{
|
||||
const struct intel_device_info *devinfo = disasm->isa->devinfo;
|
||||
const struct cfg_t *cfg = disasm->cfg;
|
||||
|
||||
struct inst_group *group;
|
||||
if (!disasm->use_tail) {
|
||||
group = disasm_new_inst_group(disasm, offset);
|
||||
} else {
|
||||
disasm->use_tail = false;
|
||||
group = exec_node_data(struct inst_group,
|
||||
exec_list_get_tail_raw(&disasm->group_list), link);
|
||||
}
|
||||
|
||||
if (INTEL_DEBUG(DEBUG_ANNOTATION)) {
|
||||
group->ir = inst->ir;
|
||||
group->annotation = inst->annotation;
|
||||
}
|
||||
|
||||
if (bblock_start(cfg->blocks[disasm->cur_block]) == inst) {
|
||||
group->block_start = cfg->blocks[disasm->cur_block];
|
||||
}
|
||||
|
||||
/* There is no hardware DO instruction on Gfx6+, so since DO always
|
||||
* starts a basic block, we need to set the .block_start of the next
|
||||
* instruction's annotation with a pointer to the bblock started by
|
||||
* the DO.
|
||||
*
|
||||
* There's also only complication from emitting an annotation without
|
||||
* a corresponding hardware instruction to disassemble.
|
||||
*/
|
||||
if (devinfo->ver >= 6 && inst->opcode == BRW_OPCODE_DO) {
|
||||
disasm->use_tail = true;
|
||||
}
|
||||
|
||||
if (bblock_end(cfg->blocks[disasm->cur_block]) == inst) {
|
||||
group->block_end = cfg->blocks[disasm->cur_block];
|
||||
disasm->cur_block++;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
disasm_insert_error(struct disasm_info *disasm, unsigned offset,
|
||||
unsigned inst_size, const char *error)
|
||||
{
|
||||
foreach_list_typed(struct inst_group, cur, link, &disasm->group_list) {
|
||||
struct exec_node *next_node = exec_node_get_next(&cur->link);
|
||||
if (exec_node_is_tail_sentinel(next_node))
|
||||
break;
|
||||
|
||||
struct inst_group *next =
|
||||
exec_node_data(struct inst_group, next_node, link);
|
||||
|
||||
if (next->offset <= offset)
|
||||
continue;
|
||||
|
||||
if (offset + inst_size != next->offset) {
|
||||
struct inst_group *new = ralloc(disasm, struct inst_group);
|
||||
memcpy(new, cur, sizeof(struct inst_group));
|
||||
|
||||
cur->error = NULL;
|
||||
cur->error_length = 0;
|
||||
cur->block_end = NULL;
|
||||
|
||||
new->offset = offset + inst_size;
|
||||
new->block_start = NULL;
|
||||
|
||||
exec_node_insert_after(&cur->link, &new->link);
|
||||
}
|
||||
|
||||
if (cur->error)
|
||||
ralloc_strcat(&cur->error, error);
|
||||
else
|
||||
cur->error = ralloc_strdup(disasm, error);
|
||||
return;
|
||||
}
|
||||
}
|
||||
90
src/intel/compiler/elk/brw_disasm_info.h
Normal file
90
src/intel/compiler/elk/brw_disasm_info.h
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
/*
|
||||
* Copyright © 2014 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef _INTEL_ASM_ANNOTATION_H
|
||||
#define _INTEL_ASM_ANNOTATION_H
|
||||
|
||||
#include "compiler/glsl/list.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct cfg_t;
|
||||
struct backend_instruction;
|
||||
struct intel_device_info;
|
||||
|
||||
struct inst_group {
|
||||
struct exec_node link;
|
||||
|
||||
int offset;
|
||||
|
||||
size_t error_length;
|
||||
char *error;
|
||||
|
||||
/* Pointers to the basic block in the CFG if the instruction group starts
|
||||
* or ends a basic block.
|
||||
*/
|
||||
struct bblock_t *block_start;
|
||||
struct bblock_t *block_end;
|
||||
|
||||
/* Annotation for the generated IR. One of the two can be set. */
|
||||
const void *ir;
|
||||
const char *annotation;
|
||||
};
|
||||
|
||||
struct disasm_info {
|
||||
struct exec_list group_list;
|
||||
|
||||
const struct brw_isa_info *isa;
|
||||
const struct cfg_t *cfg;
|
||||
|
||||
/** Block index in the cfg. */
|
||||
int cur_block;
|
||||
bool use_tail;
|
||||
};
|
||||
|
||||
void
|
||||
dump_assembly(void *assembly, int start_offset, int end_offset,
|
||||
struct disasm_info *disasm, const unsigned *block_latency);
|
||||
|
||||
struct disasm_info *
|
||||
disasm_initialize(const struct brw_isa_info *isa,
|
||||
const struct cfg_t *cfg);
|
||||
|
||||
struct inst_group *
|
||||
disasm_new_inst_group(struct disasm_info *disasm, unsigned offset);
|
||||
|
||||
void
|
||||
disasm_annotate(struct disasm_info *disasm,
|
||||
struct backend_instruction *inst, unsigned offset);
|
||||
|
||||
void
|
||||
disasm_insert_error(struct disasm_info *disasm, unsigned offset,
|
||||
unsigned inst_size, const char *error);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* _INTEL_ASM_ANNOTATION_H */
|
||||
242
src/intel/compiler/elk/brw_disasm_tool.c
Normal file
242
src/intel/compiler/elk/brw_disasm_tool.c
Normal file
|
|
@ -0,0 +1,242 @@
|
|||
/*
|
||||
* Copyright © 2018 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "compiler/brw_disasm.h"
|
||||
#include "compiler/brw_isa_info.h"
|
||||
#include "dev/intel_device_info.h"
|
||||
#include "util/u_dynarray.h"
|
||||
|
||||
enum opt_input_type {
|
||||
OPT_INPUT_BINARY,
|
||||
OPT_INPUT_C_LITERAL,
|
||||
};
|
||||
|
||||
static enum opt_input_type input_type = OPT_INPUT_BINARY;
|
||||
|
||||
/* Return size of file in bytes pointed by fp */
|
||||
static long
|
||||
i965_disasm_get_file_size(FILE *fp)
|
||||
{
|
||||
long size;
|
||||
|
||||
fseek(fp, 0L, SEEK_END);
|
||||
size = ftell(fp);
|
||||
fseek(fp, 0L, SEEK_SET);
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
/* Read hex file which should be in following format:
|
||||
* for example :
|
||||
* { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }
|
||||
*/
|
||||
static void *
|
||||
i965_disasm_read_c_literal_file(FILE *fp, size_t *end)
|
||||
{
|
||||
struct util_dynarray assembly = {};
|
||||
uint32_t temp[2];
|
||||
|
||||
if (fscanf(fp, " { ") == EOF) {
|
||||
fprintf(stderr, "Couldn't find opening `{`\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (fscanf(fp, "0x%x , 0x%x", &temp[0], &temp[1]) == 2) {
|
||||
util_dynarray_append(&assembly, uint32_t, temp[0]);
|
||||
util_dynarray_append(&assembly, uint32_t, temp[1]);
|
||||
} else {
|
||||
fprintf(stderr, "Couldn't read hex values\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
while (fscanf(fp, " , 0x%x , 0x%x ", &temp[0], &temp[1]) == 2) {
|
||||
util_dynarray_append(&assembly, uint32_t, temp[0]);
|
||||
util_dynarray_append(&assembly, uint32_t, temp[1]);
|
||||
}
|
||||
|
||||
if (fscanf(fp, "}") == EOF) {
|
||||
fprintf(stderr, "Couldn't find closing `}`\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
*end = assembly.size;
|
||||
return assembly.data;
|
||||
}
|
||||
|
||||
static void *
|
||||
i965_disasm_read_binary(FILE *fp, size_t *end)
|
||||
{
|
||||
size_t size;
|
||||
void *assembly;
|
||||
|
||||
long sz = i965_disasm_get_file_size(fp);
|
||||
if (sz < 0)
|
||||
return NULL;
|
||||
|
||||
*end = (size_t)sz;
|
||||
if (!*end)
|
||||
return NULL;
|
||||
|
||||
assembly = malloc(*end + 1);
|
||||
if (assembly == NULL)
|
||||
return NULL;
|
||||
|
||||
size = fread(assembly, *end, 1, fp);
|
||||
if (!size) {
|
||||
free(assembly);
|
||||
return NULL;
|
||||
}
|
||||
return assembly;
|
||||
}
|
||||
|
||||
static void
|
||||
print_help(const char *progname, FILE *file)
|
||||
{
|
||||
fprintf(file,
|
||||
"Usage: %s [OPTION]...\n"
|
||||
"Disassemble i965 instructions from binary file.\n\n"
|
||||
" --help display this help and exit\n"
|
||||
" --input-path=PATH read binary file from binary file PATH\n"
|
||||
" --type=INPUT_TYPE INPUT_TYPE can be 'bin' (default if omitted),\n"
|
||||
" 'c_literal'.\n"
|
||||
" --gen=platform disassemble instructions for given \n"
|
||||
" platform (3 letter platform name)\n",
|
||||
progname);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
FILE *fp = NULL;
|
||||
void *assembly = NULL;
|
||||
char *file_path = NULL;
|
||||
size_t start = 0, end = 0;
|
||||
uint16_t pci_id = 0;
|
||||
int c;
|
||||
int result = EXIT_FAILURE;
|
||||
|
||||
bool help = false;
|
||||
const struct option i965_disasm_opts[] = {
|
||||
{ "help", no_argument, (int *) &help, true },
|
||||
{ "input-path", required_argument, NULL, 'i' },
|
||||
{ "type", required_argument, NULL, 't' },
|
||||
{ "gen", required_argument, NULL, 'g'},
|
||||
{ NULL, 0, NULL, 0 }
|
||||
};
|
||||
|
||||
while ((c = getopt_long(argc, argv, ":i:t:g:h", i965_disasm_opts, NULL)) != -1) {
|
||||
switch (c) {
|
||||
case 'g': {
|
||||
const int id = intel_device_name_to_pci_device_id(optarg);
|
||||
if (id < 0) {
|
||||
fprintf(stderr, "can't parse gen: '%s', expected 3 letter "
|
||||
"platform name\n", optarg);
|
||||
goto end;
|
||||
} else {
|
||||
pci_id = id;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 'i':
|
||||
file_path = strdup(optarg);
|
||||
fp = fopen(file_path, "r");
|
||||
if (!fp) {
|
||||
fprintf(stderr, "Unable to read input file : %s\n",
|
||||
file_path);
|
||||
goto end;
|
||||
}
|
||||
break;
|
||||
case 't':
|
||||
if (strcmp(optarg, "c_literal") == 0) {
|
||||
input_type = OPT_INPUT_C_LITERAL;
|
||||
} else if (strcmp(optarg, "bin") == 0) {
|
||||
input_type = OPT_INPUT_BINARY;
|
||||
} else {
|
||||
fprintf(stderr, "invalid value for --type: %s\n", optarg);
|
||||
goto end;
|
||||
}
|
||||
break;
|
||||
case 'h':
|
||||
help = true;
|
||||
print_help(argv[0], stderr);
|
||||
goto end;
|
||||
case 0:
|
||||
break;
|
||||
case ':':
|
||||
fprintf(stderr, "%s: option `-%c' requires an argument\n",
|
||||
argv[0], optopt);
|
||||
goto end;
|
||||
case '?':
|
||||
default:
|
||||
fprintf(stderr, "%s: option `-%c' is invalid: ignored\n",
|
||||
argv[0], optopt);
|
||||
goto end;
|
||||
}
|
||||
}
|
||||
|
||||
if (help || !file_path || !pci_id) {
|
||||
print_help(argv[0], stderr);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
struct intel_device_info devinfo;
|
||||
if (!intel_get_device_info_from_pci_id(pci_id, &devinfo)) {
|
||||
fprintf(stderr, "can't find device information: pci_id=0x%x\n", pci_id);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
struct brw_isa_info isa;
|
||||
brw_init_isa_info(&isa, &devinfo);
|
||||
|
||||
if (input_type == OPT_INPUT_BINARY)
|
||||
assembly = i965_disasm_read_binary(fp, &end);
|
||||
else if (input_type == OPT_INPUT_C_LITERAL)
|
||||
assembly = i965_disasm_read_c_literal_file(fp, &end);
|
||||
|
||||
if (!assembly) {
|
||||
if (end)
|
||||
fprintf(stderr, "Unable to allocate buffer to read input file\n");
|
||||
else
|
||||
fprintf(stderr, "Failed to read input file\n");
|
||||
|
||||
goto end;
|
||||
}
|
||||
|
||||
/* Disassemble i965 instructions from buffer assembly */
|
||||
brw_disassemble_with_labels(&isa, assembly, start, end, stdout);
|
||||
|
||||
result = EXIT_SUCCESS;
|
||||
|
||||
end:
|
||||
if (fp)
|
||||
fclose(fp);
|
||||
|
||||
free(file_path);
|
||||
free(assembly);
|
||||
|
||||
exit(result);
|
||||
}
|
||||
856
src/intel/compiler/elk/brw_eu.c
Normal file
856
src/intel/compiler/elk/brw_eu.c
Normal file
|
|
@ -0,0 +1,856 @@
|
|||
/*
|
||||
Copyright (C) Intel Corp. 2006. All Rights Reserved.
|
||||
Intel funded Tungsten Graphics to
|
||||
develop this 3D driver.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice (including the
|
||||
next paragraph) shall be included in all copies or substantial
|
||||
portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
**********************************************************************/
|
||||
/*
|
||||
* Authors:
|
||||
* Keith Whitwell <keithw@vmware.com>
|
||||
*/
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#include "brw_disasm.h"
|
||||
#include "brw_eu_defines.h"
|
||||
#include "brw_eu.h"
|
||||
#include "brw_shader.h"
|
||||
#include "intel_gfx_ver_enum.h"
|
||||
#include "dev/intel_debug.h"
|
||||
|
||||
#include "util/u_debug.h"
|
||||
#include "util/ralloc.h"
|
||||
|
||||
/* Returns a conditional modifier that negates the condition. */
|
||||
enum brw_conditional_mod
|
||||
brw_negate_cmod(enum brw_conditional_mod cmod)
|
||||
{
|
||||
switch (cmod) {
|
||||
case BRW_CONDITIONAL_Z:
|
||||
return BRW_CONDITIONAL_NZ;
|
||||
case BRW_CONDITIONAL_NZ:
|
||||
return BRW_CONDITIONAL_Z;
|
||||
case BRW_CONDITIONAL_G:
|
||||
return BRW_CONDITIONAL_LE;
|
||||
case BRW_CONDITIONAL_GE:
|
||||
return BRW_CONDITIONAL_L;
|
||||
case BRW_CONDITIONAL_L:
|
||||
return BRW_CONDITIONAL_GE;
|
||||
case BRW_CONDITIONAL_LE:
|
||||
return BRW_CONDITIONAL_G;
|
||||
default:
|
||||
unreachable("Can't negate this cmod");
|
||||
}
|
||||
}
|
||||
|
||||
/* Returns the corresponding conditional mod for swapping src0 and
|
||||
* src1 in e.g. CMP.
|
||||
*/
|
||||
enum brw_conditional_mod
|
||||
brw_swap_cmod(enum brw_conditional_mod cmod)
|
||||
{
|
||||
switch (cmod) {
|
||||
case BRW_CONDITIONAL_Z:
|
||||
case BRW_CONDITIONAL_NZ:
|
||||
return cmod;
|
||||
case BRW_CONDITIONAL_G:
|
||||
return BRW_CONDITIONAL_L;
|
||||
case BRW_CONDITIONAL_GE:
|
||||
return BRW_CONDITIONAL_LE;
|
||||
case BRW_CONDITIONAL_L:
|
||||
return BRW_CONDITIONAL_G;
|
||||
case BRW_CONDITIONAL_LE:
|
||||
return BRW_CONDITIONAL_GE;
|
||||
default:
|
||||
return BRW_CONDITIONAL_NONE;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the least significant bit offset of the i+1-th component of immediate
|
||||
* type \p type. For \p i equal to the two's complement of j, return the
|
||||
* offset of the j-th component starting from the end of the vector. For
|
||||
* scalar register types return zero.
|
||||
*/
|
||||
static unsigned
|
||||
imm_shift(enum brw_reg_type type, unsigned i)
|
||||
{
|
||||
assert(type != BRW_REGISTER_TYPE_UV && type != BRW_REGISTER_TYPE_V &&
|
||||
"Not implemented.");
|
||||
|
||||
if (type == BRW_REGISTER_TYPE_VF)
|
||||
return 8 * (i & 3);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Swizzle an arbitrary immediate \p x of the given type according to the
|
||||
* permutation specified as \p swz.
|
||||
*/
|
||||
uint32_t
|
||||
brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz)
|
||||
{
|
||||
if (imm_shift(type, 1)) {
|
||||
const unsigned n = 32 / imm_shift(type, 1);
|
||||
uint32_t y = 0;
|
||||
|
||||
for (unsigned i = 0; i < n; i++) {
|
||||
/* Shift the specified component all the way to the right and left to
|
||||
* discard any undesired L/MSBs, then shift it right into component i.
|
||||
*/
|
||||
y |= x >> imm_shift(type, (i & ~3) + BRW_GET_SWZ(swz, i & 3))
|
||||
<< imm_shift(type, ~0u)
|
||||
>> imm_shift(type, ~0u - i);
|
||||
}
|
||||
|
||||
return y;
|
||||
} else {
|
||||
return x;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned
|
||||
brw_get_default_exec_size(struct brw_codegen *p)
|
||||
{
|
||||
return p->current->exec_size;
|
||||
}
|
||||
|
||||
unsigned
|
||||
brw_get_default_group(struct brw_codegen *p)
|
||||
{
|
||||
return p->current->group;
|
||||
}
|
||||
|
||||
unsigned
|
||||
brw_get_default_access_mode(struct brw_codegen *p)
|
||||
{
|
||||
return p->current->access_mode;
|
||||
}
|
||||
|
||||
struct tgl_swsb
|
||||
brw_get_default_swsb(struct brw_codegen *p)
|
||||
{
|
||||
return p->current->swsb;
|
||||
}
|
||||
|
||||
void
|
||||
brw_set_default_exec_size(struct brw_codegen *p, unsigned value)
|
||||
{
|
||||
p->current->exec_size = value;
|
||||
}
|
||||
|
||||
void brw_set_default_predicate_control(struct brw_codegen *p, enum brw_predicate pc)
|
||||
{
|
||||
p->current->predicate = pc;
|
||||
}
|
||||
|
||||
void brw_set_default_predicate_inverse(struct brw_codegen *p, bool predicate_inverse)
|
||||
{
|
||||
p->current->pred_inv = predicate_inverse;
|
||||
}
|
||||
|
||||
void brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg)
|
||||
{
|
||||
assert(subreg < 2);
|
||||
p->current->flag_subreg = reg * 2 + subreg;
|
||||
}
|
||||
|
||||
void brw_set_default_access_mode( struct brw_codegen *p, unsigned access_mode )
|
||||
{
|
||||
p->current->access_mode = access_mode;
|
||||
}
|
||||
|
||||
void
|
||||
brw_set_default_compression_control(struct brw_codegen *p,
|
||||
enum brw_compression compression_control)
|
||||
{
|
||||
switch (compression_control) {
|
||||
case BRW_COMPRESSION_NONE:
|
||||
/* This is the "use the first set of bits of dmask/vmask/arf
|
||||
* according to execsize" option.
|
||||
*/
|
||||
p->current->group = 0;
|
||||
break;
|
||||
case BRW_COMPRESSION_2NDHALF:
|
||||
/* For SIMD8, this is "use the second set of 8 bits." */
|
||||
p->current->group = 8;
|
||||
break;
|
||||
case BRW_COMPRESSION_COMPRESSED:
|
||||
/* For SIMD16 instruction compression, use the first set of 16 bits
|
||||
* since we don't do SIMD32 dispatch.
|
||||
*/
|
||||
p->current->group = 0;
|
||||
break;
|
||||
default:
|
||||
unreachable("not reached");
|
||||
}
|
||||
|
||||
if (p->devinfo->ver <= 6) {
|
||||
p->current->compressed =
|
||||
(compression_control == BRW_COMPRESSION_COMPRESSED);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Enable or disable instruction compression on the given instruction leaving
|
||||
* the currently selected channel enable group untouched.
|
||||
*/
|
||||
void
|
||||
brw_inst_set_compression(const struct intel_device_info *devinfo,
|
||||
brw_inst *inst, bool on)
|
||||
{
|
||||
if (devinfo->ver >= 6) {
|
||||
/* No-op, the EU will figure out for us whether the instruction needs to
|
||||
* be compressed.
|
||||
*/
|
||||
} else {
|
||||
/* The channel group and compression controls are non-orthogonal, there
|
||||
* are two possible representations for uncompressed instructions and we
|
||||
* may need to preserve the current one to avoid changing the selected
|
||||
* channel group inadvertently.
|
||||
*/
|
||||
if (on)
|
||||
brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_COMPRESSED);
|
||||
else if (brw_inst_qtr_control(devinfo, inst)
|
||||
== BRW_COMPRESSION_COMPRESSED)
|
||||
brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
brw_set_default_compression(struct brw_codegen *p, bool on)
|
||||
{
|
||||
p->current->compressed = on;
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply the range of channel enable signals given by
|
||||
* [group, group + exec_size) to the instruction passed as argument.
|
||||
*/
|
||||
void
|
||||
brw_inst_set_group(const struct intel_device_info *devinfo,
|
||||
brw_inst *inst, unsigned group)
|
||||
{
|
||||
if (devinfo->ver >= 20) {
|
||||
assert(group % 8 == 0 && group < 32);
|
||||
brw_inst_set_qtr_control(devinfo, inst, group / 8);
|
||||
|
||||
} else if (devinfo->ver >= 7) {
|
||||
assert(group % 4 == 0 && group < 32);
|
||||
brw_inst_set_qtr_control(devinfo, inst, group / 8);
|
||||
brw_inst_set_nib_control(devinfo, inst, (group / 4) % 2);
|
||||
|
||||
} else if (devinfo->ver == 6) {
|
||||
assert(group % 8 == 0 && group < 32);
|
||||
brw_inst_set_qtr_control(devinfo, inst, group / 8);
|
||||
|
||||
} else {
|
||||
assert(group % 8 == 0 && group < 16);
|
||||
/* The channel group and compression controls are non-orthogonal, there
|
||||
* are two possible representations for group zero and we may need to
|
||||
* preserve the current one to avoid changing the selected compression
|
||||
* enable inadvertently.
|
||||
*/
|
||||
if (group == 8)
|
||||
brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_2NDHALF);
|
||||
else if (brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_2NDHALF)
|
||||
brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
brw_set_default_group(struct brw_codegen *p, unsigned group)
|
||||
{
|
||||
p->current->group = group;
|
||||
}
|
||||
|
||||
void brw_set_default_mask_control( struct brw_codegen *p, unsigned value )
|
||||
{
|
||||
p->current->mask_control = value;
|
||||
}
|
||||
|
||||
void brw_set_default_saturate( struct brw_codegen *p, bool enable )
|
||||
{
|
||||
p->current->saturate = enable;
|
||||
}
|
||||
|
||||
void brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value)
|
||||
{
|
||||
p->current->acc_wr_control = value;
|
||||
}
|
||||
|
||||
void brw_set_default_swsb(struct brw_codegen *p, struct tgl_swsb value)
|
||||
{
|
||||
p->current->swsb = value;
|
||||
}
|
||||
|
||||
void brw_push_insn_state( struct brw_codegen *p )
|
||||
{
|
||||
assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]);
|
||||
*(p->current + 1) = *p->current;
|
||||
p->current++;
|
||||
}
|
||||
|
||||
void brw_pop_insn_state( struct brw_codegen *p )
|
||||
{
|
||||
assert(p->current != p->stack);
|
||||
p->current--;
|
||||
}
|
||||
|
||||
|
||||
/***********************************************************************
|
||||
*/
|
||||
void
|
||||
brw_init_codegen(const struct brw_isa_info *isa,
|
||||
struct brw_codegen *p, void *mem_ctx)
|
||||
{
|
||||
memset(p, 0, sizeof(*p));
|
||||
|
||||
p->isa = isa;
|
||||
p->devinfo = isa->devinfo;
|
||||
p->automatic_exec_sizes = true;
|
||||
/*
|
||||
* Set the initial instruction store array size to 1024, if found that
|
||||
* isn't enough, then it will double the store size at brw_next_insn()
|
||||
* until out of memory.
|
||||
*/
|
||||
p->store_size = 1024;
|
||||
p->store = rzalloc_array(mem_ctx, brw_inst, p->store_size);
|
||||
p->nr_insn = 0;
|
||||
p->current = p->stack;
|
||||
memset(p->current, 0, sizeof(p->current[0]));
|
||||
|
||||
p->mem_ctx = mem_ctx;
|
||||
|
||||
/* Some defaults?
|
||||
*/
|
||||
brw_set_default_exec_size(p, BRW_EXECUTE_8);
|
||||
brw_set_default_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */
|
||||
brw_set_default_saturate(p, 0);
|
||||
brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
|
||||
|
||||
/* Set up control flow stack */
|
||||
p->if_stack_depth = 0;
|
||||
p->if_stack_array_size = 16;
|
||||
p->if_stack = rzalloc_array(mem_ctx, int, p->if_stack_array_size);
|
||||
|
||||
p->loop_stack_depth = 0;
|
||||
p->loop_stack_array_size = 16;
|
||||
p->loop_stack = rzalloc_array(mem_ctx, int, p->loop_stack_array_size);
|
||||
p->if_depth_in_loop = rzalloc_array(mem_ctx, int, p->loop_stack_array_size);
|
||||
}
|
||||
|
||||
|
||||
const unsigned *brw_get_program( struct brw_codegen *p,
|
||||
unsigned *sz )
|
||||
{
|
||||
*sz = p->next_insn_offset;
|
||||
return (const unsigned *)p->store;
|
||||
}
|
||||
|
||||
const struct brw_shader_reloc *
|
||||
brw_get_shader_relocs(struct brw_codegen *p, unsigned *num_relocs)
|
||||
{
|
||||
*num_relocs = p->num_relocs;
|
||||
return p->relocs;
|
||||
}
|
||||
|
||||
DEBUG_GET_ONCE_OPTION(shader_bin_dump_path, "INTEL_SHADER_BIN_DUMP_PATH", NULL);
|
||||
|
||||
bool brw_should_dump_shader_bin(void)
|
||||
{
|
||||
return debug_get_option_shader_bin_dump_path() != NULL;
|
||||
}
|
||||
|
||||
void brw_dump_shader_bin(void *assembly, int start_offset, int end_offset,
|
||||
const char *identifier)
|
||||
{
|
||||
char *name = ralloc_asprintf(NULL, "%s/%s.bin",
|
||||
debug_get_option_shader_bin_dump_path(),
|
||||
identifier);
|
||||
|
||||
int fd = open(name, O_CREAT | O_WRONLY, 0777);
|
||||
ralloc_free(name);
|
||||
|
||||
if (fd < 0)
|
||||
return;
|
||||
|
||||
struct stat sb;
|
||||
if (fstat(fd, &sb) != 0 || (!S_ISREG(sb.st_mode))) {
|
||||
close(fd);
|
||||
return;
|
||||
}
|
||||
|
||||
size_t to_write = end_offset - start_offset;
|
||||
void *write_ptr = assembly + start_offset;
|
||||
|
||||
while (to_write) {
|
||||
ssize_t ret = write(fd, write_ptr, to_write);
|
||||
|
||||
if (ret <= 0) {
|
||||
close(fd);
|
||||
return;
|
||||
}
|
||||
|
||||
to_write -= ret;
|
||||
write_ptr += ret;
|
||||
}
|
||||
|
||||
close(fd);
|
||||
}
|
||||
|
||||
bool brw_try_override_assembly(struct brw_codegen *p, int start_offset,
|
||||
const char *identifier)
|
||||
{
|
||||
const char *read_path = getenv("INTEL_SHADER_ASM_READ_PATH");
|
||||
if (!read_path) {
|
||||
return false;
|
||||
}
|
||||
|
||||
char *name = ralloc_asprintf(NULL, "%s/%s.bin", read_path, identifier);
|
||||
|
||||
int fd = open(name, O_RDONLY);
|
||||
ralloc_free(name);
|
||||
|
||||
if (fd == -1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
struct stat sb;
|
||||
if (fstat(fd, &sb) != 0 || (!S_ISREG(sb.st_mode))) {
|
||||
close(fd);
|
||||
return false;
|
||||
}
|
||||
|
||||
p->nr_insn -= (p->next_insn_offset - start_offset) / sizeof(brw_inst);
|
||||
p->nr_insn += sb.st_size / sizeof(brw_inst);
|
||||
|
||||
p->next_insn_offset = start_offset + sb.st_size;
|
||||
p->store_size = (start_offset + sb.st_size) / sizeof(brw_inst);
|
||||
p->store = (brw_inst *)reralloc_size(p->mem_ctx, p->store, p->next_insn_offset);
|
||||
assert(p->store);
|
||||
|
||||
ssize_t ret = read(fd, (char *)p->store + start_offset, sb.st_size);
|
||||
close(fd);
|
||||
if (ret != sb.st_size) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ASSERTED bool valid =
|
||||
brw_validate_instructions(p->isa, p->store,
|
||||
start_offset, p->next_insn_offset,
|
||||
NULL);
|
||||
assert(valid);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
const struct brw_label *
|
||||
brw_find_label(const struct brw_label *root, int offset)
|
||||
{
|
||||
const struct brw_label *curr = root;
|
||||
|
||||
if (curr != NULL)
|
||||
{
|
||||
do {
|
||||
if (curr->offset == offset)
|
||||
return curr;
|
||||
|
||||
curr = curr->next;
|
||||
} while (curr != NULL);
|
||||
}
|
||||
|
||||
return curr;
|
||||
}
|
||||
|
||||
void
|
||||
brw_create_label(struct brw_label **labels, int offset, void *mem_ctx)
|
||||
{
|
||||
if (*labels != NULL) {
|
||||
struct brw_label *curr = *labels;
|
||||
struct brw_label *prev;
|
||||
|
||||
do {
|
||||
prev = curr;
|
||||
|
||||
if (curr->offset == offset)
|
||||
return;
|
||||
|
||||
curr = curr->next;
|
||||
} while (curr != NULL);
|
||||
|
||||
curr = ralloc(mem_ctx, struct brw_label);
|
||||
curr->offset = offset;
|
||||
curr->number = prev->number + 1;
|
||||
curr->next = NULL;
|
||||
prev->next = curr;
|
||||
} else {
|
||||
struct brw_label *root = ralloc(mem_ctx, struct brw_label);
|
||||
root->number = 0;
|
||||
root->offset = offset;
|
||||
root->next = NULL;
|
||||
*labels = root;
|
||||
}
|
||||
}
|
||||
|
||||
const struct brw_label *
|
||||
brw_label_assembly(const struct brw_isa_info *isa,
|
||||
const void *assembly, int start, int end, void *mem_ctx)
|
||||
{
|
||||
const struct intel_device_info *const devinfo = isa->devinfo;
|
||||
|
||||
struct brw_label *root_label = NULL;
|
||||
|
||||
int to_bytes_scale = sizeof(brw_inst) / brw_jump_scale(devinfo);
|
||||
|
||||
for (int offset = start; offset < end;) {
|
||||
const brw_inst *inst = (const brw_inst *) ((const char *) assembly + offset);
|
||||
brw_inst uncompacted;
|
||||
|
||||
bool is_compact = brw_inst_cmpt_control(devinfo, inst);
|
||||
|
||||
if (is_compact) {
|
||||
brw_compact_inst *compacted = (brw_compact_inst *)inst;
|
||||
brw_uncompact_instruction(isa, &uncompacted, compacted);
|
||||
inst = &uncompacted;
|
||||
}
|
||||
|
||||
if (brw_has_uip(devinfo, brw_inst_opcode(isa, inst))) {
|
||||
/* Instructions that have UIP also have JIP. */
|
||||
brw_create_label(&root_label,
|
||||
offset + brw_inst_uip(devinfo, inst) * to_bytes_scale, mem_ctx);
|
||||
brw_create_label(&root_label,
|
||||
offset + brw_inst_jip(devinfo, inst) * to_bytes_scale, mem_ctx);
|
||||
} else if (brw_has_jip(devinfo, brw_inst_opcode(isa, inst))) {
|
||||
int jip;
|
||||
if (devinfo->ver >= 7) {
|
||||
jip = brw_inst_jip(devinfo, inst);
|
||||
} else {
|
||||
jip = brw_inst_gfx6_jump_count(devinfo, inst);
|
||||
}
|
||||
|
||||
brw_create_label(&root_label, offset + jip * to_bytes_scale, mem_ctx);
|
||||
}
|
||||
|
||||
if (is_compact) {
|
||||
offset += sizeof(brw_compact_inst);
|
||||
} else {
|
||||
offset += sizeof(brw_inst);
|
||||
}
|
||||
}
|
||||
|
||||
return root_label;
|
||||
}
|
||||
|
||||
void
|
||||
brw_disassemble_with_labels(const struct brw_isa_info *isa,
|
||||
const void *assembly, int start, int end, FILE *out)
|
||||
{
|
||||
void *mem_ctx = ralloc_context(NULL);
|
||||
const struct brw_label *root_label =
|
||||
brw_label_assembly(isa, assembly, start, end, mem_ctx);
|
||||
|
||||
brw_disassemble(isa, assembly, start, end, root_label, out);
|
||||
|
||||
ralloc_free(mem_ctx);
|
||||
}
|
||||
|
||||
void
|
||||
brw_disassemble(const struct brw_isa_info *isa,
|
||||
const void *assembly, int start, int end,
|
||||
const struct brw_label *root_label, FILE *out)
|
||||
{
|
||||
const struct intel_device_info *devinfo = isa->devinfo;
|
||||
|
||||
bool dump_hex = INTEL_DEBUG(DEBUG_HEX);
|
||||
|
||||
for (int offset = start; offset < end;) {
|
||||
const brw_inst *insn = (const brw_inst *)((char *)assembly + offset);
|
||||
brw_inst uncompacted;
|
||||
|
||||
if (root_label != NULL) {
|
||||
const struct brw_label *label = brw_find_label(root_label, offset);
|
||||
if (label != NULL) {
|
||||
fprintf(out, "\nLABEL%d:\n", label->number);
|
||||
}
|
||||
}
|
||||
|
||||
bool compacted = brw_inst_cmpt_control(devinfo, insn);
|
||||
if (0)
|
||||
fprintf(out, "0x%08x: ", offset);
|
||||
|
||||
if (compacted) {
|
||||
brw_compact_inst *compacted = (brw_compact_inst *)insn;
|
||||
if (dump_hex) {
|
||||
unsigned char * insn_ptr = ((unsigned char *)&insn[0]);
|
||||
const unsigned int blank_spaces = 24;
|
||||
for (int i = 0 ; i < 8; i = i + 4) {
|
||||
fprintf(out, "%02x %02x %02x %02x ",
|
||||
insn_ptr[i],
|
||||
insn_ptr[i + 1],
|
||||
insn_ptr[i + 2],
|
||||
insn_ptr[i + 3]);
|
||||
}
|
||||
/* Make compacted instructions hex value output vertically aligned
|
||||
* with uncompacted instructions hex value
|
||||
*/
|
||||
fprintf(out, "%*c", blank_spaces, ' ');
|
||||
}
|
||||
|
||||
brw_uncompact_instruction(isa, &uncompacted, compacted);
|
||||
insn = &uncompacted;
|
||||
} else {
|
||||
if (dump_hex) {
|
||||
unsigned char * insn_ptr = ((unsigned char *)&insn[0]);
|
||||
for (int i = 0 ; i < 16; i = i + 4) {
|
||||
fprintf(out, "%02x %02x %02x %02x ",
|
||||
insn_ptr[i],
|
||||
insn_ptr[i + 1],
|
||||
insn_ptr[i + 2],
|
||||
insn_ptr[i + 3]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
brw_disassemble_inst(out, isa, insn, compacted, offset, root_label);
|
||||
|
||||
if (compacted) {
|
||||
offset += sizeof(brw_compact_inst);
|
||||
} else {
|
||||
offset += sizeof(brw_inst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const struct opcode_desc opcode_descs[] = {
|
||||
/* IR, HW, name, nsrc, ndst, gfx_vers */
|
||||
{ BRW_OPCODE_ILLEGAL, 0, "illegal", 0, 0, GFX_ALL },
|
||||
{ BRW_OPCODE_SYNC, 1, "sync", 1, 0, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_MOV, 1, "mov", 1, 1, GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_MOV, 97, "mov", 1, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_SEL, 2, "sel", 2, 1, GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_SEL, 98, "sel", 2, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_MOVI, 3, "movi", 2, 1, GFX_GE(GFX45) & GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_MOVI, 99, "movi", 2, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_NOT, 4, "not", 1, 1, GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_NOT, 100, "not", 1, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_AND, 5, "and", 2, 1, GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_AND, 101, "and", 2, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_OR, 6, "or", 2, 1, GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_OR, 102, "or", 2, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_XOR, 7, "xor", 2, 1, GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_XOR, 103, "xor", 2, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_SHR, 8, "shr", 2, 1, GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_SHR, 104, "shr", 2, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_SHL, 9, "shl", 2, 1, GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_SHL, 105, "shl", 2, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_DIM, 10, "dim", 1, 1, GFX75 },
|
||||
{ BRW_OPCODE_SMOV, 10, "smov", 0, 0, GFX_GE(GFX8) & GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_SMOV, 106, "smov", 0, 0, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_ASR, 12, "asr", 2, 1, GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_ASR, 108, "asr", 2, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_ROR, 14, "ror", 2, 1, GFX11 },
|
||||
{ BRW_OPCODE_ROR, 110, "ror", 2, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_ROL, 15, "rol", 2, 1, GFX11 },
|
||||
{ BRW_OPCODE_ROL, 111, "rol", 2, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_CMP, 16, "cmp", 2, 1, GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_CMP, 112, "cmp", 2, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_CMPN, 17, "cmpn", 2, 1, GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_CMPN, 113, "cmpn", 2, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_CSEL, 18, "csel", 3, 1, GFX_GE(GFX8) & GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_CSEL, 114, "csel", 3, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_F32TO16, 19, "f32to16", 1, 1, GFX7 | GFX75 },
|
||||
{ BRW_OPCODE_F16TO32, 20, "f16to32", 1, 1, GFX7 | GFX75 },
|
||||
{ BRW_OPCODE_BFREV, 23, "bfrev", 1, 1, GFX_GE(GFX7) & GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_BFREV, 119, "bfrev", 1, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_BFE, 24, "bfe", 3, 1, GFX_GE(GFX7) & GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_BFE, 120, "bfe", 3, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_BFI1, 25, "bfi1", 2, 1, GFX_GE(GFX7) & GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_BFI1, 121, "bfi1", 2, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_BFI2, 26, "bfi2", 3, 1, GFX_GE(GFX7) & GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_BFI2, 122, "bfi2", 3, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_JMPI, 32, "jmpi", 0, 0, GFX_ALL },
|
||||
{ BRW_OPCODE_BRD, 33, "brd", 0, 0, GFX_GE(GFX7) },
|
||||
{ BRW_OPCODE_IF, 34, "if", 0, 0, GFX_ALL },
|
||||
{ BRW_OPCODE_IFF, 35, "iff", 0, 0, GFX_LE(GFX5) },
|
||||
{ BRW_OPCODE_BRC, 35, "brc", 0, 0, GFX_GE(GFX7) },
|
||||
{ BRW_OPCODE_ELSE, 36, "else", 0, 0, GFX_ALL },
|
||||
{ BRW_OPCODE_ENDIF, 37, "endif", 0, 0, GFX_ALL },
|
||||
{ BRW_OPCODE_DO, 38, "do", 0, 0, GFX_LE(GFX5) },
|
||||
{ BRW_OPCODE_CASE, 38, "case", 0, 0, GFX6 },
|
||||
{ BRW_OPCODE_WHILE, 39, "while", 0, 0, GFX_ALL },
|
||||
{ BRW_OPCODE_BREAK, 40, "break", 0, 0, GFX_ALL },
|
||||
{ BRW_OPCODE_CONTINUE, 41, "cont", 0, 0, GFX_ALL },
|
||||
{ BRW_OPCODE_HALT, 42, "halt", 0, 0, GFX_ALL },
|
||||
{ BRW_OPCODE_CALLA, 43, "calla", 0, 0, GFX_GE(GFX75) },
|
||||
{ BRW_OPCODE_MSAVE, 44, "msave", 0, 0, GFX_LE(GFX5) },
|
||||
{ BRW_OPCODE_CALL, 44, "call", 0, 0, GFX_GE(GFX6) },
|
||||
{ BRW_OPCODE_MREST, 45, "mrest", 0, 0, GFX_LE(GFX5) },
|
||||
{ BRW_OPCODE_RET, 45, "ret", 0, 0, GFX_GE(GFX6) },
|
||||
{ BRW_OPCODE_PUSH, 46, "push", 0, 0, GFX_LE(GFX5) },
|
||||
{ BRW_OPCODE_FORK, 46, "fork", 0, 0, GFX6 },
|
||||
{ BRW_OPCODE_GOTO, 46, "goto", 0, 0, GFX_GE(GFX8) },
|
||||
{ BRW_OPCODE_POP, 47, "pop", 2, 0, GFX_LE(GFX5) },
|
||||
{ BRW_OPCODE_WAIT, 48, "wait", 0, 1, GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_SEND, 49, "send", 1, 1, GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_SENDC, 50, "sendc", 1, 1, GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_SEND, 49, "send", 2, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_SENDC, 50, "sendc", 2, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_SENDS, 51, "sends", 2, 1, GFX_GE(GFX9) & GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_SENDSC, 52, "sendsc", 2, 1, GFX_GE(GFX9) & GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_MATH, 56, "math", 2, 1, GFX_GE(GFX6) },
|
||||
{ BRW_OPCODE_ADD, 64, "add", 2, 1, GFX_ALL },
|
||||
{ BRW_OPCODE_MUL, 65, "mul", 2, 1, GFX_ALL },
|
||||
{ BRW_OPCODE_AVG, 66, "avg", 2, 1, GFX_ALL },
|
||||
{ BRW_OPCODE_FRC, 67, "frc", 1, 1, GFX_ALL },
|
||||
{ BRW_OPCODE_RNDU, 68, "rndu", 1, 1, GFX_ALL },
|
||||
{ BRW_OPCODE_RNDD, 69, "rndd", 1, 1, GFX_ALL },
|
||||
{ BRW_OPCODE_RNDE, 70, "rnde", 1, 1, GFX_ALL },
|
||||
{ BRW_OPCODE_RNDZ, 71, "rndz", 1, 1, GFX_ALL },
|
||||
{ BRW_OPCODE_MAC, 72, "mac", 2, 1, GFX_ALL },
|
||||
{ BRW_OPCODE_MACH, 73, "mach", 2, 1, GFX_ALL },
|
||||
{ BRW_OPCODE_LZD, 74, "lzd", 1, 1, GFX_ALL },
|
||||
{ BRW_OPCODE_FBH, 75, "fbh", 1, 1, GFX_GE(GFX7) },
|
||||
{ BRW_OPCODE_FBL, 76, "fbl", 1, 1, GFX_GE(GFX7) },
|
||||
{ BRW_OPCODE_CBIT, 77, "cbit", 1, 1, GFX_GE(GFX7) },
|
||||
{ BRW_OPCODE_ADDC, 78, "addc", 2, 1, GFX_GE(GFX7) },
|
||||
{ BRW_OPCODE_SUBB, 79, "subb", 2, 1, GFX_GE(GFX7) },
|
||||
{ BRW_OPCODE_SAD2, 80, "sad2", 2, 1, GFX_ALL },
|
||||
{ BRW_OPCODE_SADA2, 81, "sada2", 2, 1, GFX_ALL },
|
||||
{ BRW_OPCODE_ADD3, 82, "add3", 3, 1, GFX_GE(GFX125) },
|
||||
{ BRW_OPCODE_DP4, 84, "dp4", 2, 1, GFX_LT(GFX11) },
|
||||
{ BRW_OPCODE_DPH, 85, "dph", 2, 1, GFX_LT(GFX11) },
|
||||
{ BRW_OPCODE_DP3, 86, "dp3", 2, 1, GFX_LT(GFX11) },
|
||||
{ BRW_OPCODE_DP2, 87, "dp2", 2, 1, GFX_LT(GFX11) },
|
||||
{ BRW_OPCODE_DP4A, 88, "dp4a", 3, 1, GFX_GE(GFX12) },
|
||||
{ BRW_OPCODE_LINE, 89, "line", 2, 1, GFX_LE(GFX10) },
|
||||
{ BRW_OPCODE_DPAS, 89, "dpas", 3, 1, GFX_GE(GFX125) },
|
||||
{ BRW_OPCODE_PLN, 90, "pln", 2, 1, GFX_GE(GFX45) & GFX_LE(GFX10) },
|
||||
{ BRW_OPCODE_MAD, 91, "mad", 3, 1, GFX_GE(GFX6) },
|
||||
{ BRW_OPCODE_LRP, 92, "lrp", 3, 1, GFX_GE(GFX6) & GFX_LE(GFX10) },
|
||||
{ BRW_OPCODE_MADM, 93, "madm", 3, 1, GFX_GE(GFX8) },
|
||||
{ BRW_OPCODE_NENOP, 125, "nenop", 0, 0, GFX45 },
|
||||
{ BRW_OPCODE_NOP, 126, "nop", 0, 0, GFX_LT(GFX12) },
|
||||
{ BRW_OPCODE_NOP, 96, "nop", 0, 0, GFX_GE(GFX12) }
|
||||
};
|
||||
|
||||
void
|
||||
brw_init_isa_info(struct brw_isa_info *isa,
|
||||
const struct intel_device_info *devinfo)
|
||||
{
|
||||
isa->devinfo = devinfo;
|
||||
|
||||
enum gfx_ver ver = gfx_ver_from_devinfo(devinfo);
|
||||
|
||||
memset(isa->ir_to_descs, 0, sizeof(isa->ir_to_descs));
|
||||
memset(isa->hw_to_descs, 0, sizeof(isa->hw_to_descs));
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(opcode_descs); i++) {
|
||||
if (opcode_descs[i].gfx_vers & ver) {
|
||||
const unsigned e = opcode_descs[i].ir;
|
||||
const unsigned h = opcode_descs[i].hw;
|
||||
assert(e < ARRAY_SIZE(isa->ir_to_descs) && !isa->ir_to_descs[e]);
|
||||
assert(h < ARRAY_SIZE(isa->hw_to_descs) && !isa->hw_to_descs[h]);
|
||||
isa->ir_to_descs[e] = &opcode_descs[i];
|
||||
isa->hw_to_descs[h] = &opcode_descs[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the matching opcode_desc for the specified IR opcode and hardware
|
||||
* generation, or NULL if the opcode is not supported by the device.
|
||||
*/
|
||||
const struct opcode_desc *
|
||||
brw_opcode_desc(const struct brw_isa_info *isa, enum opcode op)
|
||||
{
|
||||
return op < ARRAY_SIZE(isa->ir_to_descs) ? isa->ir_to_descs[op] : NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the matching opcode_desc for the specified HW opcode and hardware
|
||||
* generation, or NULL if the opcode is not supported by the device.
|
||||
*/
|
||||
const struct opcode_desc *
|
||||
brw_opcode_desc_from_hw(const struct brw_isa_info *isa, unsigned hw)
|
||||
{
|
||||
return hw < ARRAY_SIZE(isa->hw_to_descs) ? isa->hw_to_descs[hw] : NULL;
|
||||
}
|
||||
|
||||
unsigned
|
||||
brw_num_sources_from_inst(const struct brw_isa_info *isa,
|
||||
const brw_inst *inst)
|
||||
{
|
||||
const struct intel_device_info *devinfo = isa->devinfo;
|
||||
const struct opcode_desc *desc =
|
||||
brw_opcode_desc(isa, brw_inst_opcode(isa, inst));
|
||||
unsigned math_function;
|
||||
|
||||
if (brw_inst_opcode(isa, inst) == BRW_OPCODE_MATH) {
|
||||
math_function = brw_inst_math_function(devinfo, inst);
|
||||
} else if (devinfo->ver < 6 &&
|
||||
brw_inst_opcode(isa, inst) == BRW_OPCODE_SEND) {
|
||||
if (brw_inst_sfid(devinfo, inst) == BRW_SFID_MATH) {
|
||||
/* src1 must be a descriptor (including the information to determine
|
||||
* that the SEND is doing an extended math operation), but src0 can
|
||||
* actually be null since it serves as the source of the implicit GRF
|
||||
* to MRF move.
|
||||
*
|
||||
* If we stop using that functionality, we'll have to revisit this.
|
||||
*/
|
||||
return 2;
|
||||
} else {
|
||||
/* Send instructions are allowed to have null sources since they use
|
||||
* the base_mrf field to specify which message register source.
|
||||
*/
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
assert(desc->nsrc < 4);
|
||||
return desc->nsrc;
|
||||
}
|
||||
|
||||
switch (math_function) {
|
||||
case BRW_MATH_FUNCTION_INV:
|
||||
case BRW_MATH_FUNCTION_LOG:
|
||||
case BRW_MATH_FUNCTION_EXP:
|
||||
case BRW_MATH_FUNCTION_SQRT:
|
||||
case BRW_MATH_FUNCTION_RSQ:
|
||||
case BRW_MATH_FUNCTION_SIN:
|
||||
case BRW_MATH_FUNCTION_COS:
|
||||
case BRW_MATH_FUNCTION_SINCOS:
|
||||
case GFX8_MATH_FUNCTION_INVM:
|
||||
case GFX8_MATH_FUNCTION_RSQRTM:
|
||||
return 1;
|
||||
case BRW_MATH_FUNCTION_FDIV:
|
||||
case BRW_MATH_FUNCTION_POW:
|
||||
case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
|
||||
case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
|
||||
case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
|
||||
return 2;
|
||||
default:
|
||||
unreachable("not reached");
|
||||
}
|
||||
}
|
||||
2089
src/intel/compiler/elk/brw_eu.h
Normal file
2089
src/intel/compiler/elk/brw_eu.h
Normal file
File diff suppressed because it is too large
Load diff
3081
src/intel/compiler/elk/brw_eu_compact.c
Normal file
3081
src/intel/compiler/elk/brw_eu_compact.c
Normal file
File diff suppressed because it is too large
Load diff
2218
src/intel/compiler/elk/brw_eu_defines.h
Normal file
2218
src/intel/compiler/elk/brw_eu_defines.h
Normal file
File diff suppressed because it is too large
Load diff
3770
src/intel/compiler/elk/brw_eu_emit.c
Normal file
3770
src/intel/compiler/elk/brw_eu_emit.c
Normal file
File diff suppressed because it is too large
Load diff
119
src/intel/compiler/elk/brw_eu_util.c
Normal file
119
src/intel/compiler/elk/brw_eu_util.c
Normal file
|
|
@ -0,0 +1,119 @@
|
|||
/*
|
||||
Copyright (C) Intel Corp. 2006. All Rights Reserved.
|
||||
Intel funded Tungsten Graphics to
|
||||
develop this 3D driver.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice (including the
|
||||
next paragraph) shall be included in all copies or substantial
|
||||
portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
**********************************************************************/
|
||||
/*
|
||||
* Authors:
|
||||
* Keith Whitwell <keithw@vmware.com>
|
||||
*/
|
||||
|
||||
|
||||
#include "brw_eu_defines.h"
|
||||
#include "brw_eu.h"
|
||||
|
||||
|
||||
void brw_math_invert( struct brw_codegen *p,
|
||||
struct brw_reg dst,
|
||||
struct brw_reg src)
|
||||
{
|
||||
gfx4_math(p,
|
||||
dst,
|
||||
BRW_MATH_FUNCTION_INV,
|
||||
0,
|
||||
src,
|
||||
BRW_MATH_PRECISION_FULL);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void brw_copy4(struct brw_codegen *p,
|
||||
struct brw_reg dst,
|
||||
struct brw_reg src,
|
||||
unsigned count)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
dst = vec4(dst);
|
||||
src = vec4(src);
|
||||
|
||||
for (i = 0; i < count; i++)
|
||||
{
|
||||
unsigned delta = i*32;
|
||||
brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta));
|
||||
brw_MOV(p, byte_offset(dst, delta+16), byte_offset(src, delta+16));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void brw_copy8(struct brw_codegen *p,
|
||||
struct brw_reg dst,
|
||||
struct brw_reg src,
|
||||
unsigned count)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
dst = vec8(dst);
|
||||
src = vec8(src);
|
||||
|
||||
for (i = 0; i < count; i++)
|
||||
{
|
||||
unsigned delta = i*32;
|
||||
brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void brw_copy_indirect_to_indirect(struct brw_codegen *p,
|
||||
struct brw_indirect dst_ptr,
|
||||
struct brw_indirect src_ptr,
|
||||
unsigned count)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < count; i++)
|
||||
{
|
||||
unsigned delta = i*32;
|
||||
brw_MOV(p, deref_4f(dst_ptr, delta), deref_4f(src_ptr, delta));
|
||||
brw_MOV(p, deref_4f(dst_ptr, delta+16), deref_4f(src_ptr, delta+16));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void brw_copy_from_indirect(struct brw_codegen *p,
|
||||
struct brw_reg dst,
|
||||
struct brw_indirect ptr,
|
||||
unsigned count)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
dst = vec4(dst);
|
||||
|
||||
for (i = 0; i < count; i++)
|
||||
{
|
||||
unsigned delta = i*32;
|
||||
brw_MOV(p, byte_offset(dst, delta), deref_4f(ptr, delta));
|
||||
brw_MOV(p, byte_offset(dst, delta+16), deref_4f(ptr, delta+16));
|
||||
}
|
||||
}
|
||||
2827
src/intel/compiler/elk/brw_eu_validate.c
Normal file
2827
src/intel/compiler/elk/brw_eu_validate.c
Normal file
File diff suppressed because it is too large
Load diff
8561
src/intel/compiler/elk/brw_fs.cpp
Normal file
8561
src/intel/compiler/elk/brw_fs.cpp
Normal file
File diff suppressed because it is too large
Load diff
637
src/intel/compiler/elk/brw_fs.h
Normal file
637
src/intel/compiler/elk/brw_fs.h
Normal file
|
|
@ -0,0 +1,637 @@
|
|||
/*
|
||||
* Copyright © 2010 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* Authors:
|
||||
* Eric Anholt <eric@anholt.net>
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef BRW_FS_H
|
||||
#define BRW_FS_H
|
||||
|
||||
#include "brw_shader.h"
|
||||
#include "brw_ir_fs.h"
|
||||
#include "brw_fs_live_variables.h"
|
||||
#include "brw_ir_performance.h"
|
||||
#include "compiler/nir/nir.h"
|
||||
|
||||
struct bblock_t;
|
||||
namespace {
|
||||
struct acp_entry;
|
||||
}
|
||||
|
||||
class fs_visitor;
|
||||
|
||||
namespace brw {
|
||||
/**
|
||||
* Register pressure analysis of a shader. Estimates how many registers
|
||||
* are live at any point of the program in GRF units.
|
||||
*/
|
||||
struct register_pressure {
|
||||
register_pressure(const fs_visitor *v);
|
||||
~register_pressure();
|
||||
|
||||
analysis_dependency_class
|
||||
dependency_class() const
|
||||
{
|
||||
return (DEPENDENCY_INSTRUCTION_IDENTITY |
|
||||
DEPENDENCY_INSTRUCTION_DATA_FLOW |
|
||||
DEPENDENCY_VARIABLES);
|
||||
}
|
||||
|
||||
bool
|
||||
validate(const fs_visitor *) const
|
||||
{
|
||||
/* FINISHME */
|
||||
return true;
|
||||
}
|
||||
|
||||
unsigned *regs_live_at_ip;
|
||||
};
|
||||
}
|
||||
|
||||
struct brw_gs_compile;
|
||||
|
||||
namespace brw {
|
||||
class fs_builder;
|
||||
}
|
||||
|
||||
struct shader_stats {
|
||||
const char *scheduler_mode;
|
||||
unsigned promoted_constants;
|
||||
unsigned spill_count;
|
||||
unsigned fill_count;
|
||||
unsigned max_register_pressure;
|
||||
};
|
||||
|
||||
/** Register numbers for thread payload fields. */
|
||||
struct thread_payload {
|
||||
/** The number of thread payload registers the hardware will supply. */
|
||||
uint8_t num_regs;
|
||||
|
||||
virtual ~thread_payload() = default;
|
||||
|
||||
protected:
|
||||
thread_payload() : num_regs() {}
|
||||
};
|
||||
|
||||
struct vs_thread_payload : public thread_payload {
|
||||
vs_thread_payload(const fs_visitor &v);
|
||||
|
||||
fs_reg urb_handles;
|
||||
};
|
||||
|
||||
struct tcs_thread_payload : public thread_payload {
|
||||
tcs_thread_payload(const fs_visitor &v);
|
||||
|
||||
fs_reg patch_urb_output;
|
||||
fs_reg primitive_id;
|
||||
fs_reg icp_handle_start;
|
||||
};
|
||||
|
||||
struct tes_thread_payload : public thread_payload {
|
||||
tes_thread_payload(const fs_visitor &v);
|
||||
|
||||
fs_reg patch_urb_input;
|
||||
fs_reg primitive_id;
|
||||
fs_reg coords[3];
|
||||
fs_reg urb_output;
|
||||
};
|
||||
|
||||
struct gs_thread_payload : public thread_payload {
|
||||
gs_thread_payload(fs_visitor &v);
|
||||
|
||||
fs_reg urb_handles;
|
||||
fs_reg primitive_id;
|
||||
fs_reg instance_id;
|
||||
fs_reg icp_handle_start;
|
||||
};
|
||||
|
||||
struct fs_thread_payload : public thread_payload {
|
||||
fs_thread_payload(const fs_visitor &v,
|
||||
bool &source_depth_to_render_target,
|
||||
bool &runtime_check_aads_emit);
|
||||
|
||||
uint8_t subspan_coord_reg[2];
|
||||
uint8_t source_depth_reg[2];
|
||||
uint8_t source_w_reg[2];
|
||||
uint8_t aa_dest_stencil_reg[2];
|
||||
uint8_t dest_depth_reg[2];
|
||||
uint8_t sample_pos_reg[2];
|
||||
uint8_t sample_mask_in_reg[2];
|
||||
uint8_t depth_w_coef_reg;
|
||||
uint8_t barycentric_coord_reg[BRW_BARYCENTRIC_MODE_COUNT][2];
|
||||
};
|
||||
|
||||
struct cs_thread_payload : public thread_payload {
|
||||
cs_thread_payload(const fs_visitor &v);
|
||||
|
||||
void load_subgroup_id(const brw::fs_builder &bld, fs_reg &dest) const;
|
||||
|
||||
fs_reg local_invocation_id[3];
|
||||
|
||||
protected:
|
||||
fs_reg subgroup_id_;
|
||||
};
|
||||
|
||||
struct task_mesh_thread_payload : public cs_thread_payload {
|
||||
task_mesh_thread_payload(fs_visitor &v);
|
||||
|
||||
fs_reg extended_parameter_0;
|
||||
fs_reg local_index;
|
||||
fs_reg inline_parameter;
|
||||
|
||||
fs_reg urb_output;
|
||||
|
||||
/* URB to read Task memory inputs. Only valid for MESH stage. */
|
||||
fs_reg task_urb_input;
|
||||
};
|
||||
|
||||
struct bs_thread_payload : public thread_payload {
|
||||
bs_thread_payload(const fs_visitor &v);
|
||||
|
||||
fs_reg global_arg_ptr;
|
||||
fs_reg local_arg_ptr;
|
||||
|
||||
void load_shader_type(const brw::fs_builder &bld, fs_reg &dest) const;
|
||||
};
|
||||
|
||||
class fs_instruction_scheduler;
|
||||
|
||||
/**
 * The fragment shader front-end.
 *
 * Translates either GLSL IR or Mesa IR (for ARB_fragment_program) into FS IR.
 */
class fs_visitor : public backend_shader
{
public:
   /* One constructor per broad shader-key flavor: generic stages, fragment
    * (with multipolygon dispatch), and geometry.
    */
   fs_visitor(const struct brw_compiler *compiler,
              const struct brw_compile_params *params,
              const brw_base_prog_key *key,
              struct brw_stage_prog_data *prog_data,
              const nir_shader *shader,
              unsigned dispatch_width,
              bool needs_register_pressure,
              bool debug_enabled);
   fs_visitor(const struct brw_compiler *compiler,
              const struct brw_compile_params *params,
              const brw_wm_prog_key *key,
              struct brw_wm_prog_data *prog_data,
              const nir_shader *shader,
              unsigned dispatch_width,
              unsigned num_polygons,
              bool needs_register_pressure,
              bool debug_enabled);
   fs_visitor(const struct brw_compiler *compiler,
              const struct brw_compile_params *params,
              struct brw_gs_compile *gs_compile,
              struct brw_gs_prog_data *prog_data,
              const nir_shader *shader,
              bool needs_register_pressure,
              bool debug_enabled);
   void init();
   ~fs_visitor();

   fs_reg vgrf(const glsl_type *const type);
   void import_uniforms(fs_visitor *v);

   void VARYING_PULL_CONSTANT_LOAD(const brw::fs_builder &bld,
                                   const fs_reg &dst,
                                   const fs_reg &surface,
                                   const fs_reg &surface_handle,
                                   const fs_reg &varying_offset,
                                   uint32_t const_offset,
                                   uint8_t alignment,
                                   unsigned components);
   void DEP_RESOLVE_MOV(const brw::fs_builder &bld, int grf);

   /* Per-stage compilation entry points. */
   bool run_fs(bool allow_spilling, bool do_rep_send);
   bool run_vs();
   bool run_tcs();
   bool run_tes();
   bool run_gs();
   bool run_cs(bool allow_spilling);
   bool run_bs(bool allow_spilling);
   bool run_task(bool allow_spilling);
   bool run_mesh(bool allow_spilling);
   void optimize();
   void allocate_registers(bool allow_spilling);
   uint32_t compute_max_register_pressure();
   bool fixup_sends_duplicate_payload();
   void fixup_3src_null_dest();
   void emit_dummy_memory_fence_before_eot();
   void emit_dummy_mov_instruction();
   bool fixup_nomask_control_flow();
   void assign_curb_setup();
   void assign_urb_setup();
   void convert_attr_sources_to_hw_regs(fs_inst *inst);
   void assign_vs_urb_setup();
   void assign_tcs_urb_setup();
   void assign_tes_urb_setup();
   void assign_gs_urb_setup();
   bool assign_regs(bool allow_spilling, bool spill_all);
   void assign_regs_trivial();
   void calculate_payload_ranges(unsigned payload_node_count,
                                 int *payload_last_use_ip) const;
   bool split_virtual_grfs();
   bool compact_virtual_grfs();
   void assign_constant_locations();
   bool get_pull_locs(const fs_reg &src, unsigned *out_surf_index,
                      unsigned *out_pull_index);
   bool lower_constant_loads();
   virtual void invalidate_analysis(brw::analysis_dependency_class c);

#ifndef NDEBUG
   void validate();
#else
   void validate() {}
#endif

   /* Optimization passes -- the bool-returning ones presumably report
    * whether any progress was made (confirm in the definitions).
    */
   bool opt_algebraic();
   bool opt_redundant_halt();
   bool opt_cse();
   bool opt_cse_local(const brw::fs_live_variables &live, bblock_t *block, int &ip);

   bool opt_copy_propagation();
   bool opt_bank_conflicts();
   bool opt_split_sends();
   bool register_coalesce();
   bool compute_to_mrf();
   bool eliminate_find_live_channel();
   bool dead_code_eliminate();
   bool remove_duplicate_mrf_writes();
   bool remove_extra_rounding_modes();

   fs_instruction_scheduler *prepare_scheduler(void *mem_ctx);
   void schedule_instructions_pre_ra(fs_instruction_scheduler *sched,
                                     instruction_scheduler_mode mode);
   void schedule_instructions_post_ra();

   void insert_gfx4_send_dependency_workarounds();
   void insert_gfx4_pre_send_dependency_workarounds(bblock_t *block,
                                                    fs_inst *inst);
   void insert_gfx4_post_send_dependency_workarounds(bblock_t *block,
                                                     fs_inst *inst);
   void vfail(const char *msg, va_list args);
   void fail(const char *msg, ...);
   void limit_dispatch_width(unsigned n, const char *msg);
   /* Lowering passes that rewrite IR constructs into hardware-supported
    * forms.
    */
   bool lower_uniform_pull_constant_loads();
   bool lower_load_payload();
   bool lower_pack();
   bool lower_regioning();
   bool lower_logical_sends();
   bool lower_integer_multiplication();
   bool lower_minmax();
   bool lower_simd_width();
   bool lower_barycentrics();
   bool lower_derivatives();
   bool lower_find_live_channel();
   bool lower_scoreboard();
   bool lower_sub_sat();
   bool opt_combine_constants();

   void emit_repclear_shader();
   void emit_interpolation_setup_gfx4();
   void emit_interpolation_setup_gfx6();
   bool opt_peephole_sel();
   bool opt_saturate_propagation();
   bool opt_cmod_propagation();
   bool opt_zero_samples();

   void set_tcs_invocation_id();

   void emit_alpha_test();
   fs_inst *emit_single_fb_write(const brw::fs_builder &bld,
                                 fs_reg color1, fs_reg color2,
                                 fs_reg src0_alpha, unsigned components);
   void do_emit_fb_writes(int nr_color_regions, bool replicate_alpha);
   void emit_fb_writes();
   void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg());
   void emit_gs_control_data_bits(const fs_reg &vertex_count);
   void emit_gs_thread_end();
   bool mark_last_urb_write_with_eot();
   void emit_tcs_thread_end();
   void emit_urb_fence();
   void emit_cs_terminate();

   fs_reg interp_reg(const brw::fs_builder &bld, unsigned location,
                     unsigned channel, unsigned comp);
   fs_reg per_primitive_reg(const brw::fs_builder &bld,
                            int location, unsigned comp);

   virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const;
   virtual void dump_instructions_to_file(FILE *file) const;

   const brw_base_prog_key *const key;
   const struct brw_sampler_prog_key_data *key_tex;

   struct brw_gs_compile *gs_compile;

   struct brw_stage_prog_data *prog_data;

   /* Lazily-computed analyses, invalidated via invalidate_analysis(). */
   brw_analysis<brw::fs_live_variables, backend_shader> live_analysis;
   brw_analysis<brw::register_pressure, fs_visitor> regpressure_analysis;
   brw_analysis<brw::performance, fs_visitor> performance_analysis;

   /** Number of uniform variable components visited. */
   unsigned uniforms;

   /** Byte-offset for the next available spot in the scratch space buffer. */
   unsigned last_scratch;

   /**
    * Array mapping UNIFORM register numbers to the push parameter index,
    * or -1 if this uniform register isn't being uploaded as a push constant.
    */
   int *push_constant_loc;

   fs_reg frag_depth;
   fs_reg frag_stencil;
   fs_reg sample_mask;
   fs_reg outputs[VARYING_SLOT_MAX];
   fs_reg dual_src_output;
   int first_non_payload_grf;
   /** Either BRW_MAX_GRF or GFX7_MRF_HACK_START */
   unsigned max_grf;

   /* Set by fail()/vfail() when compilation cannot proceed. */
   bool failed;
   char *fail_msg;

   /* Stage-specific payload; the accessors below downcast it after
    * asserting that the current stage matches.
    */
   thread_payload *payload_;

   thread_payload &payload() {
      return *this->payload_;
   }

   vs_thread_payload &vs_payload() {
      assert(stage == MESA_SHADER_VERTEX);
      return *static_cast<vs_thread_payload *>(this->payload_);
   }

   tcs_thread_payload &tcs_payload() {
      assert(stage == MESA_SHADER_TESS_CTRL);
      return *static_cast<tcs_thread_payload *>(this->payload_);
   }

   tes_thread_payload &tes_payload() {
      assert(stage == MESA_SHADER_TESS_EVAL);
      return *static_cast<tes_thread_payload *>(this->payload_);
   }

   gs_thread_payload &gs_payload() {
      assert(stage == MESA_SHADER_GEOMETRY);
      return *static_cast<gs_thread_payload *>(this->payload_);
   }

   fs_thread_payload &fs_payload() {
      assert(stage == MESA_SHADER_FRAGMENT);
      return *static_cast<fs_thread_payload *>(this->payload_);
   };

   cs_thread_payload &cs_payload() {
      assert(gl_shader_stage_uses_workgroup(stage));
      return *static_cast<cs_thread_payload *>(this->payload_);
   }

   task_mesh_thread_payload &task_mesh_payload() {
      assert(stage == MESA_SHADER_TASK || stage == MESA_SHADER_MESH);
      return *static_cast<task_mesh_thread_payload *>(this->payload_);
   }

   bs_thread_payload &bs_payload() {
      assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);
      return *static_cast<bs_thread_payload *>(this->payload_);
   }

   bool source_depth_to_render_target;
   bool runtime_check_aads_emit;

   fs_reg pixel_x;
   fs_reg pixel_y;
   fs_reg pixel_z;
   fs_reg wpos_w;
   fs_reg pixel_w;
   fs_reg delta_xy[BRW_BARYCENTRIC_MODE_COUNT];
   fs_reg final_gs_vertex_count;
   fs_reg control_data_bits;
   fs_reg invocation_id;

   unsigned grf_used;
   bool spilled_any_registers;
   bool needs_register_pressure;

   const unsigned dispatch_width; /**< 8, 16 or 32 */
   const unsigned max_polygons;
   unsigned max_dispatch_width;

   /* The API selected subgroup size */
   unsigned api_subgroup_size; /**< 0, 8, 16, 32 */

   struct shader_stats shader_stats;

   void lower_mul_dword_inst(fs_inst *inst, bblock_t *block);
   void lower_mul_qword_inst(fs_inst *inst, bblock_t *block);
   void lower_mulh_inst(fs_inst *inst, bblock_t *block);

   unsigned workgroup_size() const;

   void debug_optimizer(const nir_shader *nir,
                        const char *pass_name,
                        int iteration, int pass_num) const;
};
|
||||
|
||||
/**
|
||||
* Return the flag register used in fragment shaders to keep track of live
|
||||
* samples. On Gfx7+ we use f1.0-f1.1 to allow discard jumps in SIMD32
|
||||
* dispatch mode, while earlier generations are constrained to f0.1, which
|
||||
* limits the dispatch width to SIMD16 for fragment shaders that use discard.
|
||||
*/
|
||||
static inline unsigned
|
||||
sample_mask_flag_subreg(const fs_visitor &s)
|
||||
{
|
||||
assert(s.stage == MESA_SHADER_FRAGMENT);
|
||||
return s.devinfo->ver >= 7 ? 2 : 1;
|
||||
}
|
||||
|
||||
/**
 * The fragment shader code generator.
 *
 * Translates FS IR to actual i965 assembly code.
 */
class fs_generator
{
public:
   fs_generator(const struct brw_compiler *compiler,
                const struct brw_compile_params *params,
                struct brw_stage_prog_data *prog_data,
                bool runtime_check_aads_emit,
                gl_shader_stage stage);
   ~fs_generator();

   void enable_debug(const char *shader_name);
   /* Main entry point: walk \p cfg and emit machine code.  Return value
    * semantics are defined in the implementation (not visible here).
    */
   int generate_code(const cfg_t *cfg, int dispatch_width,
                     struct shader_stats shader_stats,
                     const brw::performance &perf,
                     struct brw_compile_stats *stats,
                     unsigned max_polygons = 0);
   void add_const_data(void *data, unsigned size);
   void add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt);
   const unsigned *get_assembly();

private:
   /* Per-opcode emission helpers used by generate_code(). */
   void fire_fb_write(fs_inst *inst,
                      struct brw_reg payload,
                      struct brw_reg implied_header,
                      GLuint nr);
   void generate_send(fs_inst *inst,
                      struct brw_reg dst,
                      struct brw_reg desc,
                      struct brw_reg ex_desc,
                      struct brw_reg payload,
                      struct brw_reg payload2);
   void generate_fb_write(fs_inst *inst, struct brw_reg payload);
   void generate_fb_read(fs_inst *inst, struct brw_reg dst,
                         struct brw_reg payload);
   void generate_cs_terminate(fs_inst *inst, struct brw_reg payload);
   void generate_barrier(fs_inst *inst, struct brw_reg src);
   bool generate_linterp(fs_inst *inst, struct brw_reg dst,
                         struct brw_reg *src);
   void generate_tex(fs_inst *inst, struct brw_reg dst,
                     struct brw_reg surface_index,
                     struct brw_reg sampler_index);
   void generate_ddx(const fs_inst *inst,
                     struct brw_reg dst, struct brw_reg src);
   void generate_ddy(const fs_inst *inst,
                     struct brw_reg dst, struct brw_reg src);
   void generate_scratch_write(fs_inst *inst, struct brw_reg src);
   void generate_scratch_read(fs_inst *inst, struct brw_reg dst);
   void generate_scratch_read_gfx7(fs_inst *inst, struct brw_reg dst);
   void generate_scratch_header(fs_inst *inst, struct brw_reg dst);
   void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
                                            struct brw_reg index,
                                            struct brw_reg offset);
   void generate_varying_pull_constant_load_gfx4(fs_inst *inst,
                                                 struct brw_reg dst,
                                                 struct brw_reg index);

   void generate_set_sample_id(fs_inst *inst,
                               struct brw_reg dst,
                               struct brw_reg src0,
                               struct brw_reg src1);

   void generate_halt(fs_inst *inst);

   void generate_mov_indirect(fs_inst *inst,
                              struct brw_reg dst,
                              struct brw_reg reg,
                              struct brw_reg indirect_byte_offset);

   void generate_shuffle(fs_inst *inst,
                         struct brw_reg dst,
                         struct brw_reg src,
                         struct brw_reg idx);

   void generate_quad_swizzle(const fs_inst *inst,
                              struct brw_reg dst, struct brw_reg src,
                              unsigned swiz);

   bool patch_halt_jumps();

   const struct brw_compiler *compiler;
   const struct brw_compile_params *params;

   const struct intel_device_info *devinfo;

   /* Low-level assembler state the generated instructions are emitted into. */
   struct brw_codegen *p;
   struct brw_stage_prog_data * const prog_data;

   unsigned dispatch_width; /**< 8, 16 or 32 */

   /* HALT instructions recorded for later fixup by patch_halt_jumps(). */
   exec_list discard_halt_patches;
   bool runtime_check_aads_emit;
   bool debug_flag;
   const char *shader_name;
   gl_shader_stage stage;
   void *mem_ctx;
};
|
||||
|
||||
namespace brw {
   /* Fetch \p n consecutive payload registers described by \p regs as a
    * single value of the given type.
    */
   fs_reg
   fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
                     brw_reg_type type = BRW_REGISTER_TYPE_F,
                     unsigned n = 1);

   fs_reg
   fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2]);

   /** Uniform holding the dynamic per-draw MSAA configuration flags. */
   inline fs_reg
   dynamic_msaa_flags(const struct brw_wm_prog_data *wm_prog_data)
   {
      return fs_reg(UNIFORM, wm_prog_data->msaa_flags_param,
                    BRW_REGISTER_TYPE_UD);
   }

   void
   check_dynamic_msaa_flag(const fs_builder &bld,
                           const struct brw_wm_prog_data *wm_prog_data,
                           enum intel_msaa_flags flag);

   bool
   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i);
}
|
||||
|
||||
/* Free-standing helpers shared across the FS backend; definitions live in
 * the corresponding .cpp files.
 */
void shuffle_from_32bit_read(const brw::fs_builder &bld,
                             const fs_reg &dst,
                             const fs_reg &src,
                             uint32_t first_component,
                             uint32_t components);

/* Immediate constructors for types without a native immediate form. */
fs_reg setup_imm_df(const brw::fs_builder &bld,
                    double v);

fs_reg setup_imm_b(const brw::fs_builder &bld,
                   int8_t v);

fs_reg setup_imm_ub(const brw::fs_builder &bld,
                    uint8_t v);

enum brw_barycentric_mode brw_barycentric_mode(nir_intrinsic_instr *intr);

uint32_t brw_fb_write_msg_control(const fs_inst *inst,
                                  const struct brw_wm_prog_data *prog_data);

void brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data);

bool brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width);

fs_reg brw_sample_mask_reg(const brw::fs_builder &bld);
void brw_emit_predicate_on_sample_mask(const brw::fs_builder &bld, fs_inst *inst);

int brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
                                    const brw_stage_prog_data *prog_data);

bool brw_lower_dpas(fs_visitor &v);

/** NIR -> FS IR translation entry point. */
void nir_to_brw(fs_visitor *s);

#endif /* BRW_FS_H */
|
||||
955
src/intel/compiler/elk/brw_fs_bank_conflicts.cpp
Normal file
955
src/intel/compiler/elk/brw_fs_bank_conflicts.cpp
Normal file
|
|
@ -0,0 +1,955 @@
|
|||
/*
|
||||
* Copyright © 2017 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/** @file brw_fs_bank_conflicts.cpp
|
||||
*
|
||||
* This file contains a GRF bank conflict mitigation pass. The pass is
|
||||
* intended to be run after register allocation and works by rearranging the
|
||||
* layout of the GRF space (without altering the semantics of the program) in
|
||||
* a way that minimizes the number of GRF bank conflicts incurred by ternary
|
||||
* instructions.
|
||||
*
|
||||
* Unfortunately there is close to no information about bank conflicts in the
|
||||
* hardware spec, but experimentally on Gfx7-Gfx9 ternary instructions seem to
|
||||
* incur an average bank conflict penalty of one cycle per SIMD8 op whenever
|
||||
* the second and third source are stored in the same GRF bank (\sa bank_of()
|
||||
* for the exact bank layout) which cannot be fetched during the same cycle by
|
||||
* the EU, unless the EU logic manages to optimize out the read cycle of a
|
||||
* duplicate source register (\sa is_conflict_optimized_out()).
|
||||
*
|
||||
* The asymptotic run-time of the algorithm is dominated by the
|
||||
* shader_conflict_weight_matrix() computation below, which is O(n) on the
|
||||
* number of instructions in the program, however for small and medium-sized
|
||||
* programs the run-time is likely to be dominated by
|
||||
* optimize_reg_permutation() which is O(m^3) on the number of GRF atoms of
|
||||
* the program (\sa partitioning), which is bounded (since the program uses a
|
||||
* bounded number of registers post-regalloc) and of the order of 100. For
|
||||
* that reason optimize_reg_permutation() is vectorized in order to keep the
|
||||
* cubic term within reasonable bounds for m close to its theoretical maximum.
|
||||
*/
|
||||
|
||||
#include "brw_fs.h"
|
||||
#include "brw_cfg.h"
|
||||
|
||||
#ifdef __SSE2__
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
/**
 * Thin layer around vector intrinsics so they can be easily replaced with
 * e.g. the fall-back scalar path, an implementation with different vector
 * width or using different SIMD architectures (AVX-512?!).
 *
 * This implementation operates on pairs of independent SSE2 integer vectors à
 * la SIMD16 for somewhat improved throughput.  SSE2 is supported by virtually
 * all platforms that care about bank conflicts, so this path should almost
 * always be available in practice.
 */
namespace {
   /**
    * SIMD integer vector data type.
    */
   struct vector_type {
      __m128i v[2];
   };

   /**
    * Scalar data type matching the representation of a single component of \p
    * vector_type.
    */
   typedef int16_t scalar_type;

   /**
    * Maximum integer value representable as a \p scalar_type.
    */
   const scalar_type max_scalar = INT16_MAX;

   /**
    * Number of components of a \p vector_type.
    */
   const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type);

   /**
    * Set the i-th component of vector \p v to \p x.
    */
   void
   set(vector_type &v, unsigned i, scalar_type x)
   {
      assert(i < vector_width);
      /* memcpy rather than a direct store avoids aliasing issues when
       * poking into the __m128i representation.
       */
      memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x));
   }

   /**
    * Get the i-th component of vector \p v.
    */
   scalar_type
   get(const vector_type &v, unsigned i)
   {
      assert(i < vector_width);
      scalar_type x;
      memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x));
      return x;
   }

   /**
    * Add two vectors with saturation.
    */
   vector_type
   adds(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_adds_epi16(v.v[0], w.v[0]),
         _mm_adds_epi16(v.v[1], w.v[1])
      }};
      return u;
   }

   /**
    * Subtract two vectors with saturation.
    */
   vector_type
   subs(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_subs_epi16(v.v[0], w.v[0]),
         _mm_subs_epi16(v.v[1], w.v[1])
      }};
      return u;
   }

   /**
    * Compute the bitwise conjunction of two vectors.
    */
   vector_type
   mask(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_and_si128(v.v[0], w.v[0]),
         _mm_and_si128(v.v[1], w.v[1])
      }};
      return u;
   }

   /**
    * Reduce the components of a vector using saturating addition.
    */
   scalar_type
   sums(const vector_type &v)
   {
      /* Tree reduction: fold the two __m128i halves together, then halve the
       * active width step by step (0x4e swaps the 64-bit halves, 0xb1 swaps
       * adjacent 32-bit words, the final shufflelo swaps adjacent 16-bit
       * words) until component 0 holds the saturated sum of all 16 lanes.
       */
      const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]);
      const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e));
      const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1));
      const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1));
      return _mm_extract_epi16(v1, 0);
   }
}
|
||||
|
||||
#else
|
||||
|
||||
/**
 * Thin layer around vector intrinsics so they can be easily replaced with
 * e.g. the fall-back scalar path, an implementation with different vector
 * width or using different SIMD architectures (AVX-512?!).
 *
 * This implementation operates on scalar values and doesn't rely on
 * any vector extensions.  This is mainly intended for debugging and
 * to keep this file building on exotic platforms.
 */
namespace {
   /**
    * SIMD integer vector data type.  Degenerate single-component "vector"
    * in this fall-back path.
    */
   typedef int16_t vector_type;

   /**
    * Scalar data type matching the representation of a single component of \p
    * vector_type.
    */
   typedef int16_t scalar_type;

   /**
    * Maximum integer value representable as a \p scalar_type.
    */
   const scalar_type max_scalar = INT16_MAX;

   /**
    * Number of components of a \p vector_type.
    */
   const unsigned vector_width = 1;

   /**
    * Set the i-th component of vector \p v to \p x.
    */
   void
   set(vector_type &v, unsigned i, scalar_type x)
   {
      assert(i < vector_width);
      v = x;
   }

   /**
    * Get the i-th component of vector \p v.
    */
   scalar_type
   get(const vector_type &v, unsigned i)
   {
      assert(i < vector_width);
      return v;
   }

   /**
    * Add two vectors with saturation.  The intermediate math is done in
    * full int precision and clamped back to the int16_t range, matching
    * the saturating SSE2 path.
    */
   vector_type
   adds(vector_type v, vector_type w)
   {
      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w));
   }

   /**
    * Subtract two vectors with saturation.
    */
   vector_type
   subs(vector_type v, vector_type w)
   {
      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w));
   }

   /**
    * Compute the bitwise conjunction of two vectors.
    */
   vector_type
   mask(vector_type v, vector_type w)
   {
      return v & w;
   }

   /**
    * Reduce the components of a vector using saturating addition.
    * Trivial for a one-component vector.
    */
   scalar_type
   sums(vector_type v)
   {
      return v;
   }
}
|
||||
|
||||
#endif
|
||||
|
||||
/**
 * Swap \p x and \p y.  Arguments must be assignable lvalues; each is
 * evaluated more than once, so side-effecting expressions are unsafe here.
 * Uses the GCC/Clang __typeof extension.
 */
#define SWAP(x, y) do {                          \
      __typeof(y) _swap_tmp = y;                 \
      y = x;                                     \
      x = _swap_tmp;                             \
   } while (0)
|
||||
|
||||
namespace {
   /**
    * Variable-length vector type intended to represent cycle-count costs for
    * arbitrary atom-to-bank assignments.  It's indexed by a pair of integers
    * (i, p), where i is an atom index and p in {0, 1} indicates the parity of
    * the conflict (respectively, whether the cost is incurred whenever the
    * atoms are assigned the same bank b or opposite-parity banks b and b^1).
    * \sa shader_conflict_weight_matrix()
    */
   struct weight_vector_type {
      weight_vector_type() : v(NULL), size(0) {}

      weight_vector_type(unsigned n) : v(alloc(n)), size(n) {}

      /* NOTE(review): alloc() can return NULL on OOM, in which case the
       * copy constructor's memcpy below would dereference NULL -- upstream
       * behavior, kept as-is.
       */
      weight_vector_type(const weight_vector_type &u) :
         v(alloc(u.size)), size(u.size)
      {
         memcpy(v, u.v,
                DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type));
      }

      ~weight_vector_type()
      {
         free(v);
      }

      /* Copy-and-swap assignment: \p u is taken by value, swapped with
       * *this, and the old contents are released by u's destructor.
       */
      weight_vector_type &
      operator=(weight_vector_type u)
      {
         SWAP(v, u.v);
         SWAP(size, u.size);
         return *this;
      }

      vector_type *v;
      unsigned size;

   private:
      /* Allocate zero-initialized, suitably aligned storage for \p n
       * weight components rounded up to whole vectors.
       */
      static vector_type *
      alloc(unsigned n)
      {
         const unsigned align = MAX2(sizeof(void *), __alignof__(vector_type));
         const unsigned size = DIV_ROUND_UP(n, vector_width) * sizeof(vector_type);
         void *p;
         if (posix_memalign(&p, align, size))
            return NULL;
         memset(p, 0, size);
         return reinterpret_cast<vector_type *>(p);
      }
   };

   /**
    * Set the (i, p)-th component of weight vector \p v to \p x.
    */
   void
   set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x)
   {
      set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
   }

   /**
    * Get the (i, p)-th component of weight vector \p v.
    */
   scalar_type
   get(const weight_vector_type &v, unsigned i, unsigned p)
   {
      return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width);
   }

   /**
    * Swap the (i, p)-th and (j, q)-th components of weight vector \p v.
    */
   void
   swap(weight_vector_type &v,
        unsigned i, unsigned p,
        unsigned j, unsigned q)
   {
      const scalar_type tmp = get(v, i, p);
      set(v, i, p, get(v, j, q));
      set(v, j, q, tmp);
   }
}
|
||||
|
||||
namespace {
|
||||
   /**
    * Object that represents the partitioning of an arbitrary register space
    * into indivisible units (referred to as atoms below) that can potentially
    * be rearranged independently from other registers.  The partitioning is
    * inferred from a number of contiguity requirements specified using
    * require_contiguous().  This allows efficient look-up of the atom index a
    * given register address belongs to, or conversely the range of register
    * addresses that belong to a given atom.
    */
   struct partitioning {
      /**
       * Create a (for the moment unrestricted) partitioning of a register
       * file of size \p n.  The units are arbitrary.  Initially every
       * register is its own atom.
       */
      partitioning(unsigned n) :
         max_reg(n),
         offsets(new unsigned[n + num_terminator_atoms]),
         atoms(new unsigned[n + num_terminator_atoms])
      {
         for (unsigned i = 0; i < n + num_terminator_atoms; i++) {
            offsets[i] = i;
            atoms[i] = i;
         }
      }

      partitioning(const partitioning &p) :
         max_reg(p.max_reg),
         offsets(new unsigned[p.num_atoms() + num_terminator_atoms]),
         atoms(new unsigned[p.max_reg + num_terminator_atoms])
      {
         memcpy(offsets, p.offsets,
                sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms));
         memcpy(atoms, p.atoms,
                sizeof(unsigned) * (p.max_reg + num_terminator_atoms));
      }

      ~partitioning()
      {
         delete[] offsets;
         delete[] atoms;
      }

      /* Copy-and-swap assignment; \p p's destructor frees the old arrays. */
      partitioning &
      operator=(partitioning p)
      {
         SWAP(max_reg, p.max_reg);
         SWAP(offsets, p.offsets);
         SWAP(atoms, p.atoms);
         return *this;
      }

      /**
       * Require register range [reg, reg + n[ to be considered part of the
       * same atom.
       */
      void
      require_contiguous(unsigned reg, unsigned n)
      {
         unsigned r = atoms[reg];

         /* Renumber atoms[reg...] = { r... } and their offsets[r...] for the
          * case that the specified contiguity requirement leads to the fusion
          * (yay) of one or more existing atoms.
          */
         for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) {
            if (offsets[atoms[reg1]] < reg + n) {
               /* reg1 falls inside the required range: merge into atom r. */
               atoms[reg1] = r;
            } else {
               /* Past the range: shift the remaining atoms down to keep
                * atom indices dense.
                */
               if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]])
                  r++;

               offsets[r] = offsets[atoms[reg1]];
               atoms[reg1] = r;
            }
         }
      }

      /**
       * Get the atom index register address \p reg belongs to.
       */
      unsigned
      atom_of_reg(unsigned reg) const
      {
         return atoms[reg];
      }

      /**
       * Get the base register address that belongs to atom \p r.
       */
      unsigned
      reg_of_atom(unsigned r) const
      {
         return offsets[r];
      }

      /**
       * Get the size of atom \p r in register address units.
       */
      unsigned
      size_of_atom(unsigned r) const
      {
         assert(r < num_atoms());
         return reg_of_atom(r + 1) - reg_of_atom(r);
      }

      /**
       * Get the number of atoms the whole register space is partitioned into.
       */
      unsigned
      num_atoms() const
      {
         return atoms[max_reg];
      }

   private:
      /**
       * Number of trailing atoms inserted for convenience so among other
       * things we don't need to special-case the last element in
       * size_of_atom().
       */
      static const unsigned num_terminator_atoms = 1;
      unsigned max_reg;
      unsigned *offsets;
      unsigned *atoms;
   };
|
||||
|
||||
/**
|
||||
* Only GRF sources (whether they have been register-allocated or not) can
|
||||
* possibly incur bank conflicts.
|
||||
*/
|
||||
bool
|
||||
is_grf(const fs_reg &r)
|
||||
{
|
||||
return r.file == VGRF || r.file == FIXED_GRF;
|
||||
}
|
||||
|
||||
/**
|
||||
* Register offset of \p r in GRF units. Useful because the representation
|
||||
* of GRFs post-register allocation is somewhat inconsistent and depends on
|
||||
* whether the register already had a fixed GRF offset prior to register
|
||||
* allocation or whether it was part of a VGRF allocation.
|
||||
*/
|
||||
unsigned
|
||||
reg_of(const fs_reg &r)
|
||||
{
|
||||
assert(is_grf(r));
|
||||
if (r.file == VGRF)
|
||||
return r.nr + r.offset / REG_SIZE;
|
||||
else
|
||||
return reg_offset(r) / REG_SIZE;
|
||||
}
|
||||
|
||||
   /**
    * Calculate the finest partitioning of the GRF space compatible with the
    * register contiguity requirements derived from all instructions part of
    * the program.
    */
   partitioning
   shader_reg_partitioning(const fs_visitor *v)
   {
      partitioning p(BRW_MAX_GRF);

      /* Every multi-register destination or source forces its registers
       * into a single atom, since they must stay contiguous.
       */
      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (is_grf(inst->dst))
            p.require_contiguous(reg_of(inst->dst), regs_written(inst));

         for (int i = 0; i < inst->sources; i++) {
            if (is_grf(inst->src[i]))
               p.require_contiguous(reg_of(inst->src[i]), regs_read(inst, i));
         }
      }

      return p;
   }
|
||||
|
||||
/**
|
||||
* Return the set of GRF atoms that should be left untouched at their
|
||||
* original location to avoid violating hardware or software assumptions.
|
||||
*/
|
||||
bool *
|
||||
shader_reg_constraints(const fs_visitor *v, const partitioning &p)
|
||||
{
|
||||
bool *constrained = new bool[p.num_atoms()]();
|
||||
|
||||
/* These are read implicitly by some send-message instructions without
|
||||
* any indication at the IR level. Assume they are unsafe to move
|
||||
* around.
|
||||
*/
|
||||
for (unsigned reg = 0; reg < 2; reg++)
|
||||
constrained[p.atom_of_reg(reg)] = true;
|
||||
|
||||
/* At Intel Broadwell PRM, vol 07, section "Instruction Set Reference",
|
||||
* subsection "EUISA Instructions", Send Message (page 990):
|
||||
*
|
||||
* "r127 must not be used for return address when there is a src and
|
||||
* dest overlap in send instruction."
|
||||
*
|
||||
* Register allocation ensures that, so don't move 127 around to avoid
|
||||
* breaking that property.
|
||||
*/
|
||||
if (v->devinfo->ver >= 8)
|
||||
constrained[p.atom_of_reg(127)] = true;
|
||||
|
||||
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
|
||||
/* Assume that anything referenced via fixed GRFs is baked into the
|
||||
* hardware's fixed-function logic and may be unsafe to move around.
|
||||
* Also take into account the source GRF restrictions of EOT
|
||||
* send-message instructions.
|
||||
*/
|
||||
if (inst->dst.file == FIXED_GRF)
|
||||
constrained[p.atom_of_reg(reg_of(inst->dst))] = true;
|
||||
|
||||
for (int i = 0; i < inst->sources; i++) {
|
||||
if (inst->src[i].file == FIXED_GRF ||
|
||||
(is_grf(inst->src[i]) && inst->eot))
|
||||
constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
|
||||
}
|
||||
|
||||
/* Preserve the original allocation of VGRFs used by the barycentric
|
||||
* source of the LINTERP instruction on Gfx6, since pair-aligned
|
||||
* barycentrics allow the PLN instruction to be used.
|
||||
*/
|
||||
if (v->devinfo->has_pln && v->devinfo->ver <= 6 &&
|
||||
inst->opcode == FS_OPCODE_LINTERP)
|
||||
constrained[p.atom_of_reg(reg_of(inst->src[0]))] = true;
|
||||
|
||||
/* The location of the Gfx7 MRF hack registers is hard-coded in the
|
||||
* rest of the compiler back-end. Don't attempt to move them around.
|
||||
*/
|
||||
if (v->devinfo->ver >= 7) {
|
||||
assert(inst->dst.file != MRF);
|
||||
|
||||
for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
|
||||
const unsigned reg = GFX7_MRF_HACK_START + inst->base_mrf + i;
|
||||
constrained[p.atom_of_reg(reg)] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return constrained;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether the hardware will be able to prevent a bank conflict by
|
||||
* optimizing out the read cycle of a source register. The formula was
|
||||
* found experimentally.
|
||||
*/
|
||||
bool
|
||||
is_conflict_optimized_out(const intel_device_info *devinfo,
|
||||
const fs_inst *inst)
|
||||
{
|
||||
return devinfo->ver >= 9 &&
|
||||
((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
|
||||
reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
|
||||
reg_of(inst->src[1]) == reg_of(inst->src[2]));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a matrix that allows reasonably efficient computation of the
|
||||
* cycle-count cost of bank conflicts incurred throughout the whole program
|
||||
* for any given atom-to-bank assignment.
|
||||
*
|
||||
* More precisely, if C_r_s_p is the result of this function, the total
|
||||
* cost of all bank conflicts involving any given atom r can be readily
|
||||
* recovered as follows:
|
||||
*
|
||||
* S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
|
||||
*
|
||||
* where d_i_j is the Kronecker delta, and B_r indicates the bank
|
||||
* assignment of r. \sa delta_conflicts() for a vectorized implementation
|
||||
* of the expression above.
|
||||
*
|
||||
* FINISHME: Teach this about the Gfx10+ bank conflict rules, which are
|
||||
* somewhat more relaxed than on previous generations. In the
|
||||
* meantime optimizing based on Gfx9 weights is likely to be more
|
||||
* helpful than not optimizing at all.
|
||||
*/
|
||||
weight_vector_type *
|
||||
shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
|
||||
{
|
||||
weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
|
||||
for (unsigned r = 0; r < p.num_atoms(); r++)
|
||||
conflicts[r] = weight_vector_type(2 * p.num_atoms());
|
||||
|
||||
/* Crude approximation of the number of times the current basic block
|
||||
* will be executed at run-time.
|
||||
*/
|
||||
unsigned block_scale = 1;
|
||||
|
||||
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
|
||||
if (inst->opcode == BRW_OPCODE_DO) {
|
||||
block_scale *= 10;
|
||||
|
||||
} else if (inst->opcode == BRW_OPCODE_WHILE) {
|
||||
block_scale /= 10;
|
||||
|
||||
} else if (inst->is_3src(v->compiler) &&
|
||||
is_grf(inst->src[1]) && is_grf(inst->src[2])) {
|
||||
const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
|
||||
const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));
|
||||
|
||||
/* Estimate of the cycle-count cost of incurring a bank conflict
|
||||
* for this instruction. This is only true on the average, for a
|
||||
* sequence of back-to-back ternary instructions, since the EU
|
||||
* front-end only seems to be able to issue a new instruction at
|
||||
* an even cycle. The cost of a bank conflict incurred by an
|
||||
* isolated ternary instruction may be higher.
|
||||
*/
|
||||
const unsigned exec_size = inst->dst.component_size(inst->exec_size);
|
||||
const unsigned cycle_scale = block_scale * DIV_ROUND_UP(exec_size,
|
||||
REG_SIZE);
|
||||
|
||||
/* Neglect same-atom conflicts (since they're either trivial or
|
||||
* impossible to avoid without splitting the atom), and conflicts
|
||||
* known to be optimized out by the hardware.
|
||||
*/
|
||||
if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
|
||||
/* Calculate the parity of the sources relative to the start of
|
||||
* their respective atoms. If their parity is the same (and
|
||||
* none of the atoms straddle the 2KB mark), the instruction
|
||||
* will incur a conflict iff both atoms are assigned the same
|
||||
* bank b. If their parity is opposite, the instruction will
|
||||
* incur a conflict iff they are assigned opposite banks (b and
|
||||
* b^1).
|
||||
*/
|
||||
const bool p_r = 1 & (reg_of(inst->src[1]) - p.reg_of_atom(r));
|
||||
const bool p_s = 1 & (reg_of(inst->src[2]) - p.reg_of_atom(s));
|
||||
const unsigned p = p_r ^ p_s;
|
||||
|
||||
/* Calculate the updated cost of a hypothetical conflict
|
||||
* between atoms r and s. Note that the weight matrix is
|
||||
* symmetric with respect to indices r and s by construction.
|
||||
*/
|
||||
const scalar_type w = MIN2(unsigned(max_scalar),
|
||||
get(conflicts[r], s, p) + cycle_scale);
|
||||
set(conflicts[r], s, p, w);
|
||||
set(conflicts[s], r, p, w);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return conflicts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the set of GRF atoms that could potentially lead to bank
|
||||
* conflicts if laid out unfavorably in the GRF space according to
|
||||
* the specified \p conflicts matrix (\sa
|
||||
* shader_conflict_weight_matrix()).
|
||||
*/
|
||||
bool *
|
||||
have_any_conflicts(const partitioning &p,
|
||||
const weight_vector_type *conflicts)
|
||||
{
|
||||
bool *any_conflicts = new bool[p.num_atoms()]();
|
||||
|
||||
for (unsigned r = 0; r < p.num_atoms(); r++) {
|
||||
const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width);
|
||||
for (unsigned s = 0; s < m; s++)
|
||||
any_conflicts[r] |= sums(conflicts[r].v[s]);
|
||||
}
|
||||
|
||||
return any_conflicts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate the difference between two S(B) cost estimates as defined
|
||||
* above (\sa shader_conflict_weight_matrix()). This represents the
|
||||
* (partial) cycle-count benefit from moving an atom r from bank p to n.
|
||||
* The respective bank assignments Bp and Bn are encoded as the \p
|
||||
* bank_mask_p and \p bank_mask_n bitmasks for efficient computation,
|
||||
* according to the formula:
|
||||
*
|
||||
* bank_mask(B)_s_p = -d_(p^B_r)_(B_s)
|
||||
*
|
||||
* Notice the similarity with the delta function in the S(B) expression
|
||||
* above, and how bank_mask(B) can be precomputed for every possible
|
||||
* selection of r since bank_mask(B) only depends on it via B_r that may
|
||||
* only assume one of four different values, so the caller can keep every
|
||||
* possible bank_mask(B) vector in memory without much hassle (\sa
|
||||
* bank_characteristics()).
|
||||
*/
|
||||
int
|
||||
delta_conflicts(const weight_vector_type &bank_mask_p,
|
||||
const weight_vector_type &bank_mask_n,
|
||||
const weight_vector_type &conflicts)
|
||||
{
|
||||
const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width);
|
||||
vector_type s_p = {}, s_n = {};
|
||||
|
||||
for (unsigned r = 0; r < m; r++) {
|
||||
s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r]));
|
||||
s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r]));
|
||||
}
|
||||
|
||||
return sums(subs(s_p, s_n));
|
||||
}
|
||||
|
||||
/**
|
||||
* Register atom permutation, represented as the start GRF offset each atom
|
||||
* is mapped into.
|
||||
*/
|
||||
struct permutation {
|
||||
permutation() : v(NULL), size(0) {}
|
||||
|
||||
permutation(unsigned n) :
|
||||
v(new unsigned[n]()), size(n) {}
|
||||
|
||||
permutation(const permutation &p) :
|
||||
v(new unsigned[p.size]), size(p.size)
|
||||
{
|
||||
memcpy(v, p.v, p.size * sizeof(unsigned));
|
||||
}
|
||||
|
||||
~permutation()
|
||||
{
|
||||
delete[] v;
|
||||
}
|
||||
|
||||
permutation &
|
||||
operator=(permutation p)
|
||||
{
|
||||
SWAP(v, p.v);
|
||||
SWAP(size, p.size);
|
||||
return *this;
|
||||
}
|
||||
|
||||
unsigned *v;
|
||||
unsigned size;
|
||||
};
|
||||
|
||||
/**
|
||||
* Return an identity permutation of GRF atoms.
|
||||
*/
|
||||
permutation
|
||||
identity_reg_permutation(const partitioning &p)
|
||||
{
|
||||
permutation map(p.num_atoms());
|
||||
|
||||
for (unsigned r = 0; r < map.size; r++)
|
||||
map.v[r] = p.reg_of_atom(r);
|
||||
|
||||
return map;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the bank index of GRF address \p reg, numbered according to the
|
||||
* table:
|
||||
* Even Odd
|
||||
* Lo 0 1
|
||||
* Hi 2 3
|
||||
*/
|
||||
unsigned
|
||||
bank_of(unsigned reg)
|
||||
{
|
||||
return (reg & 0x40) >> 5 | (reg & 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return bitmasks suitable for use as bank mask arguments for the
|
||||
* delta_conflicts() computation. Note that this is just the (negative)
|
||||
* characteristic function of each bank, if you regard it as a set
|
||||
* containing all atoms assigned to it according to the \p map array.
|
||||
*/
|
||||
weight_vector_type *
|
||||
bank_characteristics(const permutation &map)
|
||||
{
|
||||
weight_vector_type *banks = new weight_vector_type[4];
|
||||
|
||||
for (unsigned b = 0; b < 4; b++) {
|
||||
banks[b] = weight_vector_type(2 * map.size);
|
||||
|
||||
for (unsigned j = 0; j < map.size; j++) {
|
||||
for (unsigned p = 0; p < 2; p++)
|
||||
set(banks[b], j, p,
|
||||
(b ^ p) == bank_of(map.v[j]) ? -1 : 0);
|
||||
}
|
||||
}
|
||||
|
||||
return banks;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return an improved permutation of GRF atoms based on \p map attempting
|
||||
* to reduce the total cycle-count cost of bank conflicts greedily.
|
||||
*
|
||||
* Note that this doesn't attempt to merge multiple atoms into one, which
|
||||
* may allow it to do a better job in some cases -- It simply reorders
|
||||
* existing atoms in the GRF space without affecting their identity.
|
||||
*/
|
||||
permutation
|
||||
optimize_reg_permutation(const partitioning &p,
|
||||
const bool *constrained,
|
||||
const weight_vector_type *conflicts,
|
||||
permutation map)
|
||||
{
|
||||
const bool *any_conflicts = have_any_conflicts(p, conflicts);
|
||||
weight_vector_type *banks = bank_characteristics(map);
|
||||
|
||||
for (unsigned r = 0; r < map.size; r++) {
|
||||
const unsigned bank_r = bank_of(map.v[r]);
|
||||
|
||||
if (!constrained[r]) {
|
||||
unsigned best_s = r;
|
||||
int best_benefit = 0;
|
||||
|
||||
for (unsigned s = 0; s < map.size; s++) {
|
||||
const unsigned bank_s = bank_of(map.v[s]);
|
||||
|
||||
if (bank_r != bank_s && !constrained[s] &&
|
||||
p.size_of_atom(r) == p.size_of_atom(s) &&
|
||||
(any_conflicts[r] || any_conflicts[s])) {
|
||||
const int benefit =
|
||||
delta_conflicts(banks[bank_r], banks[bank_s], conflicts[r]) +
|
||||
delta_conflicts(banks[bank_s], banks[bank_r], conflicts[s]);
|
||||
|
||||
if (benefit > best_benefit) {
|
||||
best_s = s;
|
||||
best_benefit = benefit;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (best_s != r) {
|
||||
for (unsigned b = 0; b < 4; b++) {
|
||||
for (unsigned p = 0; p < 2; p++)
|
||||
swap(banks[b], r, p, best_s, p);
|
||||
}
|
||||
|
||||
SWAP(map.v[r], map.v[best_s]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
delete[] banks;
|
||||
delete[] any_conflicts;
|
||||
return map;
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply the GRF atom permutation given by \p map to register \p r and
|
||||
* return the result.
|
||||
*/
|
||||
fs_reg
|
||||
transform(const partitioning &p, const permutation &map, fs_reg r)
|
||||
{
|
||||
if (r.file == VGRF) {
|
||||
const unsigned reg = reg_of(r);
|
||||
const unsigned s = p.atom_of_reg(reg);
|
||||
r.nr = map.v[s] + reg - p.reg_of_atom(s);
|
||||
r.offset = r.offset % REG_SIZE;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
fs_visitor::opt_bank_conflicts()
|
||||
{
|
||||
assert(grf_used || !"Must be called after register allocation");
|
||||
|
||||
/* TODO: Re-work this pass for Gfx20+. */
|
||||
if (devinfo->ver >= 20)
|
||||
return false;
|
||||
|
||||
/* No ternary instructions -- No bank conflicts. */
|
||||
if (devinfo->ver < 6)
|
||||
return false;
|
||||
|
||||
const partitioning p = shader_reg_partitioning(this);
|
||||
const bool *constrained = shader_reg_constraints(this, p);
|
||||
const weight_vector_type *conflicts =
|
||||
shader_conflict_weight_matrix(this, p);
|
||||
const permutation map =
|
||||
optimize_reg_permutation(p, constrained, conflicts,
|
||||
identity_reg_permutation(p));
|
||||
|
||||
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
||||
inst->dst = transform(p, map, inst->dst);
|
||||
|
||||
for (int i = 0; i < inst->sources; i++)
|
||||
inst->src[i] = transform(p, map, inst->src[i]);
|
||||
}
|
||||
|
||||
delete[] conflicts;
|
||||
delete[] constrained;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether the instruction incurs GRF bank conflict cycles.
|
||||
*
|
||||
* Note that this is only accurate after register allocation because otherwise
|
||||
* we don't know which bank each VGRF is going to end up aligned to.
|
||||
*/
|
||||
bool
|
||||
has_bank_conflict(const struct brw_isa_info *isa, const fs_inst *inst)
|
||||
{
|
||||
return is_3src(isa, inst->opcode) &&
|
||||
is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
|
||||
bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
|
||||
!is_conflict_optimized_out(isa->devinfo, inst);
|
||||
}
|
||||
965
src/intel/compiler/elk/brw_fs_builder.h
Normal file
965
src/intel/compiler/elk/brw_fs_builder.h
Normal file
|
|
@ -0,0 +1,965 @@
|
|||
/* -*- c++ -*- */
|
||||
/*
|
||||
* Copyright © 2010-2015 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_FS_BUILDER_H
|
||||
#define BRW_FS_BUILDER_H
|
||||
|
||||
#include "brw_ir_fs.h"
|
||||
#include "brw_shader.h"
|
||||
#include "brw_eu.h"
|
||||
#include "brw_fs.h"
|
||||
|
||||
namespace brw {
|
||||
/**
|
||||
* Toolbox to assemble an FS IR program out of individual instructions.
|
||||
*
|
||||
* This object is meant to have an interface consistent with
|
||||
* brw::vec4_builder. They cannot be fully interchangeable because
|
||||
* brw::fs_builder generates scalar code while brw::vec4_builder generates
|
||||
* vector code.
|
||||
*/
|
||||
class fs_builder {
|
||||
public:
|
||||
/** Type used in this IR to represent a source of an instruction. */
|
||||
typedef fs_reg src_reg;
|
||||
|
||||
/** Type used in this IR to represent the destination of an instruction. */
|
||||
typedef fs_reg dst_reg;
|
||||
|
||||
/** Type used in this IR to represent an instruction. */
|
||||
typedef fs_inst instruction;
|
||||
|
||||
/**
|
||||
* Construct an fs_builder that inserts instructions into \p shader.
|
||||
* \p dispatch_width gives the native execution width of the program.
|
||||
*/
|
||||
fs_builder(fs_visitor *shader,
|
||||
unsigned dispatch_width) :
|
||||
shader(shader), block(NULL), cursor(NULL),
|
||||
_dispatch_width(dispatch_width),
|
||||
_group(0),
|
||||
force_writemask_all(false),
|
||||
annotation()
|
||||
{
|
||||
}
|
||||
|
||||
explicit fs_builder(fs_visitor *s) : fs_builder(s, s->dispatch_width) {}
|
||||
|
||||
/**
|
||||
* Construct an fs_builder that inserts instructions into \p shader
|
||||
* before instruction \p inst in basic block \p block. The default
|
||||
* execution controls and debug annotation are initialized from the
|
||||
* instruction passed as argument.
|
||||
*/
|
||||
fs_builder(fs_visitor *shader, bblock_t *block, fs_inst *inst) :
|
||||
shader(shader), block(block), cursor(inst),
|
||||
_dispatch_width(inst->exec_size),
|
||||
_group(inst->group),
|
||||
force_writemask_all(inst->force_writemask_all)
|
||||
{
|
||||
annotation.str = inst->annotation;
|
||||
annotation.ir = inst->ir;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct an fs_builder that inserts instructions before \p cursor in
|
||||
* basic block \p block, inheriting other code generation parameters
|
||||
* from this.
|
||||
*/
|
||||
fs_builder
|
||||
at(bblock_t *block, exec_node *cursor) const
|
||||
{
|
||||
fs_builder bld = *this;
|
||||
bld.block = block;
|
||||
bld.cursor = cursor;
|
||||
return bld;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct an fs_builder appending instructions at the end of the
|
||||
* instruction list of the shader, inheriting other code generation
|
||||
* parameters from this.
|
||||
*/
|
||||
fs_builder
|
||||
at_end() const
|
||||
{
|
||||
return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a builder specifying the default SIMD width and group of
|
||||
* channel enable signals, inheriting other code generation parameters
|
||||
* from this.
|
||||
*
|
||||
* \p n gives the default SIMD width, \p i gives the slot group used for
|
||||
* predication and control flow masking in multiples of \p n channels.
|
||||
*/
|
||||
fs_builder
|
||||
group(unsigned n, unsigned i) const
|
||||
{
|
||||
fs_builder bld = *this;
|
||||
|
||||
if (n <= dispatch_width() && i < dispatch_width() / n) {
|
||||
bld._group += i * n;
|
||||
} else {
|
||||
/* The requested channel group isn't a subset of the channel group
|
||||
* of this builder, which means that the resulting instructions
|
||||
* would use (potentially undefined) channel enable signals not
|
||||
* specified by the parent builder. That's only valid if the
|
||||
* instruction doesn't have per-channel semantics, in which case
|
||||
* we should clear off the default group index in order to prevent
|
||||
* emitting instructions with channel group not aligned to their
|
||||
* own execution size.
|
||||
*/
|
||||
assert(force_writemask_all);
|
||||
bld._group = 0;
|
||||
}
|
||||
|
||||
bld._dispatch_width = n;
|
||||
return bld;
|
||||
}
|
||||
|
||||
/**
|
||||
* Alias for group() with width equal to eight.
|
||||
*/
|
||||
fs_builder
|
||||
quarter(unsigned i) const
|
||||
{
|
||||
return group(8, i);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a builder with per-channel control flow execution masking
|
||||
* disabled if \p b is true. If control flow execution masking is
|
||||
* already disabled this has no effect.
|
||||
*/
|
||||
fs_builder
|
||||
exec_all(bool b = true) const
|
||||
{
|
||||
fs_builder bld = *this;
|
||||
if (b)
|
||||
bld.force_writemask_all = true;
|
||||
return bld;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a builder with the given debug annotation info.
|
||||
*/
|
||||
fs_builder
|
||||
annotate(const char *str, const void *ir = NULL) const
|
||||
{
|
||||
fs_builder bld = *this;
|
||||
bld.annotation.str = str;
|
||||
bld.annotation.ir = ir;
|
||||
return bld;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the SIMD width in use.
|
||||
*/
|
||||
unsigned
|
||||
dispatch_width() const
|
||||
{
|
||||
return _dispatch_width;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the channel group in use.
|
||||
*/
|
||||
unsigned
|
||||
group() const
|
||||
{
|
||||
return _group;
|
||||
}
|
||||
|
||||
/**
|
||||
* Allocate a virtual register of natural vector size (one for this IR)
|
||||
* and SIMD width. \p n gives the amount of space to allocate in
|
||||
* dispatch_width units (which is just enough space for one logical
|
||||
* component in this IR).
|
||||
*/
|
||||
dst_reg
|
||||
vgrf(enum brw_reg_type type, unsigned n = 1) const
|
||||
{
|
||||
const unsigned unit = reg_unit(shader->devinfo);
|
||||
assert(dispatch_width() <= 32);
|
||||
|
||||
if (n > 0)
|
||||
return dst_reg(VGRF, shader->alloc.allocate(
|
||||
DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
|
||||
unit * REG_SIZE) * unit),
|
||||
type);
|
||||
else
|
||||
return retype(null_reg_ud(), type);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a null register of floating type.
|
||||
*/
|
||||
dst_reg
|
||||
null_reg_f() const
|
||||
{
|
||||
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
|
||||
}
|
||||
|
||||
dst_reg
|
||||
null_reg_df() const
|
||||
{
|
||||
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a null register of signed integer type.
|
||||
*/
|
||||
dst_reg
|
||||
null_reg_d() const
|
||||
{
|
||||
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a null register of unsigned integer type.
|
||||
*/
|
||||
dst_reg
|
||||
null_reg_ud() const
|
||||
{
|
||||
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
|
||||
}
|
||||
|
||||
/**
|
||||
* Insert an instruction into the program.
|
||||
*/
|
||||
instruction *
|
||||
emit(const instruction &inst) const
|
||||
{
|
||||
return emit(new(shader->mem_ctx) instruction(inst));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and insert a nullary control instruction into the program.
|
||||
*/
|
||||
instruction *
|
||||
emit(enum opcode opcode) const
|
||||
{
|
||||
return emit(instruction(opcode, dispatch_width()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and insert a nullary instruction into the program.
|
||||
*/
|
||||
instruction *
|
||||
emit(enum opcode opcode, const dst_reg &dst) const
|
||||
{
|
||||
return emit(instruction(opcode, dispatch_width(), dst));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and insert a unary instruction into the program.
|
||||
*/
|
||||
instruction *
|
||||
emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
|
||||
{
|
||||
switch (opcode) {
|
||||
case SHADER_OPCODE_RCP:
|
||||
case SHADER_OPCODE_RSQ:
|
||||
case SHADER_OPCODE_SQRT:
|
||||
case SHADER_OPCODE_EXP2:
|
||||
case SHADER_OPCODE_LOG2:
|
||||
case SHADER_OPCODE_SIN:
|
||||
case SHADER_OPCODE_COS:
|
||||
return emit(instruction(opcode, dispatch_width(), dst,
|
||||
fix_math_operand(src0)));
|
||||
|
||||
default:
|
||||
return emit(instruction(opcode, dispatch_width(), dst, src0));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and insert a binary instruction into the program.
|
||||
*/
|
||||
instruction *
|
||||
emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
|
||||
const src_reg &src1) const
|
||||
{
|
||||
switch (opcode) {
|
||||
case SHADER_OPCODE_POW:
|
||||
case SHADER_OPCODE_INT_QUOTIENT:
|
||||
case SHADER_OPCODE_INT_REMAINDER:
|
||||
return emit(instruction(opcode, dispatch_width(), dst,
|
||||
fix_math_operand(src0),
|
||||
fix_math_operand(src1)));
|
||||
|
||||
default:
|
||||
return emit(instruction(opcode, dispatch_width(), dst,
|
||||
src0, src1));
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and insert a ternary instruction into the program.
|
||||
*/
|
||||
instruction *
|
||||
emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
|
||||
const src_reg &src1, const src_reg &src2) const
|
||||
{
|
||||
switch (opcode) {
|
||||
case BRW_OPCODE_BFE:
|
||||
case BRW_OPCODE_BFI2:
|
||||
case BRW_OPCODE_MAD:
|
||||
case BRW_OPCODE_LRP:
|
||||
return emit(instruction(opcode, dispatch_width(), dst,
|
||||
fix_3src_operand(src0),
|
||||
fix_3src_operand(src1),
|
||||
fix_3src_operand(src2)));
|
||||
|
||||
default:
|
||||
return emit(instruction(opcode, dispatch_width(), dst,
|
||||
src0, src1, src2));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and insert an instruction with a variable number of sources
|
||||
* into the program.
|
||||
*/
|
||||
instruction *
|
||||
emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
|
||||
unsigned n) const
|
||||
{
|
||||
/* Use the emit() methods for specific operand counts to ensure that
|
||||
* opcode-specific operand fixups occur.
|
||||
*/
|
||||
if (n == 2) {
|
||||
return emit(opcode, dst, srcs[0], srcs[1]);
|
||||
} else if (n == 3) {
|
||||
return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
|
||||
} else {
|
||||
return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Insert a preallocated instruction into the program.
|
||||
*/
|
||||
instruction *
|
||||
emit(instruction *inst) const
|
||||
{
|
||||
assert(inst->exec_size <= 32);
|
||||
assert(inst->exec_size == dispatch_width() ||
|
||||
force_writemask_all);
|
||||
|
||||
inst->group = _group;
|
||||
inst->force_writemask_all = force_writemask_all;
|
||||
inst->annotation = annotation.str;
|
||||
inst->ir = annotation.ir;
|
||||
|
||||
if (block)
|
||||
static_cast<instruction *>(cursor)->insert_before(block, inst);
|
||||
else
|
||||
cursor->insert_before(inst);
|
||||
|
||||
return inst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Select \p src0 if the comparison of both sources with the given
|
||||
* conditional mod evaluates to true, otherwise select \p src1.
|
||||
*
|
||||
* Generally useful to get the minimum or maximum of two values.
|
||||
*/
|
||||
instruction *
|
||||
emit_minmax(const dst_reg &dst, const src_reg &src0,
|
||||
const src_reg &src1, brw_conditional_mod mod) const
|
||||
{
|
||||
assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
|
||||
|
||||
/* In some cases we can't have bytes as operand for src1, so use the
|
||||
* same type for both operand.
|
||||
*/
|
||||
return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
|
||||
fix_unsigned_negate(src1)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy any live channel from \p src to the first channel of the result.
|
||||
*/
|
||||
src_reg
|
||||
emit_uniformize(const src_reg &src) const
|
||||
{
|
||||
/* FIXME: We use a vector chan_index and dst to allow constant and
|
||||
* copy propagration to move result all the way into the consuming
|
||||
* instruction (typically a surface index or sampler index for a
|
||||
* send). This uses 1 or 3 extra hw registers in 16 or 32 wide
|
||||
* dispatch. Once we teach const/copy propagation about scalars we
|
||||
* should go back to scalar destinations here.
|
||||
*/
|
||||
const fs_builder ubld = exec_all();
|
||||
const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
|
||||
const dst_reg dst = vgrf(src.type);
|
||||
|
||||
ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
|
||||
ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
|
||||
|
||||
return src_reg(component(dst, 0));
|
||||
}
|
||||
|
||||
src_reg
|
||||
move_to_vgrf(const src_reg &src, unsigned num_components) const
|
||||
{
|
||||
src_reg *const src_comps = new src_reg[num_components];
|
||||
for (unsigned i = 0; i < num_components; i++)
|
||||
src_comps[i] = offset(src, dispatch_width(), i);
|
||||
|
||||
const dst_reg dst = vgrf(src.type, num_components);
|
||||
LOAD_PAYLOAD(dst, src_comps, num_components, 0);
|
||||
|
||||
delete[] src_comps;
|
||||
|
||||
return src_reg(dst);
|
||||
}
|
||||
|
||||
void
|
||||
emit_scan_step(enum opcode opcode, brw_conditional_mod mod,
|
||||
const dst_reg &tmp,
|
||||
unsigned left_offset, unsigned left_stride,
|
||||
unsigned right_offset, unsigned right_stride) const
|
||||
{
|
||||
dst_reg left, right;
|
||||
left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
|
||||
right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
|
||||
if ((tmp.type == BRW_REGISTER_TYPE_Q ||
|
||||
tmp.type == BRW_REGISTER_TYPE_UQ) &&
|
||||
!shader->devinfo->has_64bit_int) {
|
||||
switch (opcode) {
|
||||
case BRW_OPCODE_MUL:
|
||||
/* This will get lowered by integer MUL lowering */
|
||||
set_condmod(mod, emit(opcode, right, left, right));
|
||||
break;
|
||||
|
||||
case BRW_OPCODE_SEL: {
|
||||
/* In order for the comparisons to work out right, we need our
|
||||
* comparisons to be strict.
|
||||
*/
|
||||
assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
|
||||
if (mod == BRW_CONDITIONAL_GE)
|
||||
mod = BRW_CONDITIONAL_G;
|
||||
|
||||
/* We treat the bottom 32 bits as unsigned regardless of
|
||||
* whether or not the integer as a whole is signed.
|
||||
*/
|
||||
dst_reg right_low = subscript(right, BRW_REGISTER_TYPE_UD, 0);
|
||||
dst_reg left_low = subscript(left, BRW_REGISTER_TYPE_UD, 0);
|
||||
|
||||
/* The upper bits get the same sign as the 64-bit type */
|
||||
brw_reg_type type32 = brw_reg_type_from_bit_size(32, tmp.type);
|
||||
dst_reg right_high = subscript(right, type32, 1);
|
||||
dst_reg left_high = subscript(left, type32, 1);
|
||||
|
||||
/* Build up our comparison:
|
||||
*
|
||||
* l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
|
||||
*/
|
||||
CMP(null_reg_ud(), retype(left_low, BRW_REGISTER_TYPE_UD),
|
||||
retype(right_low, BRW_REGISTER_TYPE_UD), mod);
|
||||
set_predicate(BRW_PREDICATE_NORMAL,
|
||||
CMP(null_reg_ud(), left_high, right_high,
|
||||
BRW_CONDITIONAL_EQ));
|
||||
set_predicate_inv(BRW_PREDICATE_NORMAL, true,
|
||||
CMP(null_reg_ud(), left_high, right_high, mod));
|
||||
|
||||
/* We could use selects here or we could use predicated MOVs
|
||||
* because the destination and second source (if it were a SEL)
|
||||
* are the same.
|
||||
*/
|
||||
set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low));
|
||||
set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high));
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
unreachable("Unsupported 64-bit scan op");
|
||||
}
|
||||
} else {
|
||||
set_condmod(mod, emit(opcode, right, left, right));
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Emit a scan of \p tmp in place, combining channels with the binary
 * operation \p opcode (via emit_scan_step), restarting the combination
 * every \p cluster_size channels.  \p mod is forwarded to each step.
 */
void
emit_scan(enum opcode opcode, const dst_reg &tmp,
          unsigned cluster_size, brw_conditional_mod mod) const
{
   assert(dispatch_width() >= 8);

   /* The instruction splitting code isn't advanced enough to split
    * these so we need to handle that ourselves.
    */
   if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
      /* Scan each half independently, then — if a cluster spans both
       * halves — fold the last channel of the left half into the right
       * half with one extra step.
       */
      const unsigned half_width = dispatch_width() / 2;
      const fs_builder ubld = exec_all().group(half_width, 0);
      dst_reg left = tmp;
      dst_reg right = horiz_offset(tmp, half_width);
      ubld.emit_scan(opcode, left, cluster_size, mod);
      ubld.emit_scan(opcode, right, cluster_size, mod);
      if (cluster_size > half_width) {
         ubld.emit_scan_step(opcode, mod, tmp,
                             half_width - 1, 0, half_width, 1);
      }
      return;
   }

   if (cluster_size > 1) {
      /* Combine each even channel into the following odd channel. */
      const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
      ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
   }

   if (cluster_size > 2) {
      if (type_sz(tmp.type) <= 4) {
         const fs_builder ubld =
            exec_all().group(dispatch_width() / 4, 0);
         ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
         ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
      } else {
         /* For 64-bit types, we have to do things differently because
          * the code above would land us with destination strides that
          * the hardware can't handle.  Fortunately, we'll only be
          * 8-wide in that case and it's the same number of
          * instructions.
          */
         const fs_builder ubld = exec_all().group(2, 0);
         for (unsigned i = 0; i < dispatch_width(); i += 4)
            ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
      }
   }

   /* Double the combined span each iteration until the cluster (or the
    * full dispatch width) is covered, folding the last channel of each
    * finished span into the following spans.
    */
   for (unsigned i = 4;
        i < MIN2(cluster_size, dispatch_width());
        i *= 2) {
      const fs_builder ubld = exec_all().group(i, 0);
      ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);

      if (dispatch_width() > i * 2)
         ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);

      if (dispatch_width() > i * 4) {
         ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
         ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
      }
   }
}
|
||||
|
||||
/**
 * Emit a SHADER_OPCODE_UNDEF covering exactly the bytes written by
 * \p old_inst's destination (which must be a VGRF).
 */
instruction *
emit_undef_for_dst(const instruction *old_inst) const
{
   assert(old_inst->dst.file == VGRF);

   instruction *undef = emit(SHADER_OPCODE_UNDEF,
                             retype(old_inst->dst, BRW_REGISTER_TYPE_UD));
   undef->size_written = old_inst->size_written;
   return undef;
}
|
||||
|
||||
/**
 * Assorted arithmetic ops.
 * @{
 */
/* Each ALUn(op) macro defines a thin emit() wrapper method named after
 * the opcode, taking n sources (e.g. ALU2(ADD) defines
 * ADD(dst, src0, src1)).  The ALU2_ACC variant additionally marks the
 * emitted instruction as writing the accumulator.
 */
#define ALU1(op)                                        \
   instruction *                                        \
   op(const dst_reg &dst, const src_reg &src0) const    \
   {                                                    \
      return emit(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
   {                                                                    \
      return emit(BRW_OPCODE_##op, dst, src0, src1);                    \
   }

#define ALU2_ACC(op)                                                    \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
   {                                                                    \
      instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);       \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,     \
      const src_reg &src2) const                                        \
   {                                                                    \
      return emit(BRW_OPCODE_##op, dst, src0, src1, src2);              \
   }

   ALU2(ADD)
   ALU3(ADD3)
   ALU2_ACC(ADDC)
   ALU2(AND)
   ALU2(ASR)
   ALU2(AVG)
   ALU3(BFE)
   ALU2(BFI1)
   ALU3(BFI2)
   ALU1(BFREV)
   ALU1(CBIT)
   ALU1(DIM)
   ALU2(DP2)
   ALU2(DP3)
   ALU2(DP4)
   ALU2(DPH)
   ALU1(FBH)
   ALU1(FBL)
   ALU1(FRC)
   ALU3(DP4A)
   ALU2(LINE)
   ALU1(LZD)
   ALU2(MAC)
   ALU2_ACC(MACH)
   ALU3(MAD)
   ALU1(MOV)
   ALU2(MUL)
   ALU1(NOT)
   ALU2(OR)
   ALU2(PLN)
   ALU1(RNDD)
   ALU1(RNDE)
   ALU1(RNDU)
   ALU1(RNDZ)
   ALU2(ROL)
   ALU2(ROR)
   ALU2(SAD2)
   ALU2_ACC(SADA2)
   ALU2(SEL)
   ALU2(SHL)
   ALU2(SHR)
   ALU2_ACC(SUBB)
   ALU2(XOR)

/* The helper macros are local to this list; drop them immediately. */
#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
|
||||
|
||||
/**
 * Convert a 32-bit float source into a half-float destination.
 *
 * Gfx8+ handles this with a plain MOV; Gfx7 needs the dedicated
 * F32TO16 opcode with a W-typed destination.
 */
instruction *
F32TO16(const dst_reg &dst, const src_reg &src) const
{
   assert(dst.type == BRW_REGISTER_TYPE_HF);
   assert(src.type == BRW_REGISTER_TYPE_F);

   if (shader->devinfo->ver >= 8)
      return MOV(dst, src);

   assert(shader->devinfo->ver == 7);
   return emit(BRW_OPCODE_F32TO16,
               retype(dst, BRW_REGISTER_TYPE_W), src);
}
|
||||
|
||||
/**
 * Convert a half-float source into a 32-bit float destination.
 *
 * Gfx8+ handles this with a plain MOV; Gfx7 needs the dedicated
 * F16TO32 opcode with a W-typed source.
 */
instruction *
F16TO32(const dst_reg &dst, const src_reg &src) const
{
   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src.type == BRW_REGISTER_TYPE_HF);

   if (shader->devinfo->ver >= 8)
      return MOV(dst, src);

   assert(shader->devinfo->ver == 7);
   return emit(BRW_OPCODE_F16TO32,
               dst, retype(src, BRW_REGISTER_TYPE_W));
}
/** @} */
|
||||
|
||||
/**
|
||||
* CMP: Sets the low bit of the destination channels with the result
|
||||
* of the comparison, while the upper bits are undefined, and updates
|
||||
* the flag register with the packed 16 bits of the result.
|
||||
*/
|
||||
instruction *
|
||||
CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
|
||||
brw_conditional_mod condition) const
|
||||
{
|
||||
/* Take the instruction:
|
||||
*
|
||||
* CMP null<d> src0<f> src1<f>
|
||||
*
|
||||
* Original gfx4 does type conversion to the destination type
|
||||
* before comparison, producing garbage results for floating
|
||||
* point comparisons.
|
||||
*
|
||||
* The destination type doesn't matter on newer generations,
|
||||
* so we set the type to match src0 so we can compact the
|
||||
* instruction.
|
||||
*/
|
||||
return set_condmod(condition,
|
||||
emit(BRW_OPCODE_CMP, retype(dst, src0.type),
|
||||
fix_unsigned_negate(src0),
|
||||
fix_unsigned_negate(src1)));
|
||||
}
|
||||
|
||||
/**
|
||||
* CMPN: Behaves like CMP, but produces true if src1 is NaN.
|
||||
*/
|
||||
instruction *
|
||||
CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
|
||||
brw_conditional_mod condition) const
|
||||
{
|
||||
/* Take the instruction:
|
||||
*
|
||||
* CMP null<d> src0<f> src1<f>
|
||||
*
|
||||
* Original gfx4 does type conversion to the destination type
|
||||
* before comparison, producing garbage results for floating
|
||||
* point comparisons.
|
||||
*
|
||||
* The destination type doesn't matter on newer generations,
|
||||
* so we set the type to match src0 so we can compact the
|
||||
* instruction.
|
||||
*/
|
||||
return set_condmod(condition,
|
||||
emit(BRW_OPCODE_CMPN, retype(dst, src0.type),
|
||||
fix_unsigned_negate(src0),
|
||||
fix_unsigned_negate(src1)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Gfx4 predicated IF.
|
||||
*/
|
||||
instruction *
|
||||
IF(brw_predicate predicate) const
|
||||
{
|
||||
return set_predicate(predicate, emit(BRW_OPCODE_IF));
|
||||
}
|
||||
|
||||
/**
|
||||
* CSEL: dst = src2 <op> 0.0f ? src0 : src1
|
||||
*/
|
||||
instruction *
|
||||
CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
|
||||
const src_reg &src2, brw_conditional_mod condition) const
|
||||
{
|
||||
/* CSEL only operates on floats, so we can't do integer </<=/>=/>
|
||||
* comparisons. Zero/non-zero (== and !=) comparisons almost work.
|
||||
* 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
|
||||
*/
|
||||
assert(src2.type == BRW_REGISTER_TYPE_F);
|
||||
|
||||
return set_condmod(condition,
|
||||
emit(BRW_OPCODE_CSEL,
|
||||
retype(dst, BRW_REGISTER_TYPE_F),
|
||||
retype(src0, BRW_REGISTER_TYPE_F),
|
||||
retype(src1, BRW_REGISTER_TYPE_F),
|
||||
src2));
|
||||
}
|
||||
|
||||
/**
|
||||
* Emit a linear interpolation instruction.
|
||||
*/
|
||||
instruction *
|
||||
LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
|
||||
const src_reg &a) const
|
||||
{
|
||||
if (shader->devinfo->ver >= 6 && shader->devinfo->ver <= 10) {
|
||||
/* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
|
||||
* we need to reorder the operands.
|
||||
*/
|
||||
return emit(BRW_OPCODE_LRP, dst, a, y, x);
|
||||
|
||||
} else {
|
||||
/* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
|
||||
const dst_reg y_times_a = vgrf(dst.type);
|
||||
const dst_reg one_minus_a = vgrf(dst.type);
|
||||
const dst_reg x_times_one_minus_a = vgrf(dst.type);
|
||||
|
||||
MUL(y_times_a, y, a);
|
||||
ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
|
||||
MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
|
||||
return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Collect a number of registers in a contiguous range of registers.
|
||||
*/
|
||||
instruction *
|
||||
LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
|
||||
unsigned sources, unsigned header_size) const
|
||||
{
|
||||
instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
|
||||
inst->header_size = header_size;
|
||||
inst->size_written = header_size * REG_SIZE;
|
||||
for (unsigned i = header_size; i < sources; i++) {
|
||||
inst->size_written += dispatch_width() * type_sz(src[i].type) *
|
||||
dst.stride;
|
||||
}
|
||||
|
||||
return inst;
|
||||
}
|
||||
|
||||
/**
 * Mark the remainder of the VGRF \p dst (from its offset to the end of
 * the allocation) as undefined.
 */
instruction *
UNDEF(const dst_reg &dst) const
{
   assert(dst.file == VGRF);
   assert(dst.offset % REG_SIZE == 0);

   instruction *undef = emit(SHADER_OPCODE_UNDEF,
                             retype(dst, BRW_REGISTER_TYPE_UD));
   undef->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;
   return undef;
}
|
||||
|
||||
/**
 * Emit a DPAS (dot-product accumulate systolic) instruction with the
 * given systolic depth \p sdepth (must be 8) and repeat count
 * \p rcount (1, 2, 4 or 8).
 */
instruction *
DPAS(const dst_reg &dst, const src_reg &src0, const src_reg &src1, const src_reg &src2,
     unsigned sdepth, unsigned rcount) const
{
   assert(_dispatch_width == 8);
   assert(sdepth == 8);
   assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8);

   instruction *inst = emit(BRW_OPCODE_DPAS, dst, src0, src1, src2);
   inst->sdepth = sdepth;
   inst->rcount = rcount;

   /* A half-float destination writes half as many bytes per repeat. */
   inst->size_written = (dst.type == BRW_REGISTER_TYPE_HF)
                        ? rcount * REG_SIZE / 2
                        : rcount * REG_SIZE;

   return inst;
}
|
||||
|
||||
/* Shader being built; gives access to the device info and the VGRF
 * allocator (see e.g. F32TO16 and UNDEF above).
 */
fs_visitor *shader;

/* Zero-operand control-flow emitters, one per opcode. */
fs_inst *BREAK() { return emit(BRW_OPCODE_BREAK); }
fs_inst *DO() { return emit(BRW_OPCODE_DO); }
fs_inst *ENDIF() { return emit(BRW_OPCODE_ENDIF); }
fs_inst *NOP() { return emit(BRW_OPCODE_NOP); }
fs_inst *WHILE() { return emit(BRW_OPCODE_WHILE); }
fs_inst *CONTINUE() { return emit(BRW_OPCODE_CONTINUE); }
|
||||
|
||||
private:
|
||||
/**
|
||||
* Workaround for negation of UD registers. See comment in
|
||||
* fs_generator::generate_code() for more details.
|
||||
*/
|
||||
src_reg
|
||||
fix_unsigned_negate(const src_reg &src) const
|
||||
{
|
||||
if (src.type == BRW_REGISTER_TYPE_UD &&
|
||||
src.negate) {
|
||||
dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
|
||||
MOV(temp, src);
|
||||
return src_reg(temp);
|
||||
} else {
|
||||
return src;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Workaround for source register modes not supported by the ternary
 * instruction encoding.
 *
 * Returns \p src unchanged when the 3-src encoding can represent it;
 * otherwise copies the value into a fresh VGRF and returns that.
 */
src_reg
fix_3src_operand(const src_reg &src) const
{
   switch (src.file) {
   case FIXED_GRF:
      /* FINISHME: Could handle scalar region, other stride=1 regions */
      if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
          src.width != BRW_WIDTH_8 ||
          src.hstride != BRW_HORIZONTAL_STRIDE_1)
         break;
      /* An <8,8,1> FIXED_GRF region is representable, same as the
       * register files below.
       */
      FALLTHROUGH;
   case ATTR:
   case VGRF:
   case UNIFORM:
   case IMM:
      return src;
   default:
      break;
   }

   /* Not directly representable: go through a temporary. */
   dst_reg expanded = vgrf(src.type);
   MOV(expanded, src);
   return expanded;
}
|
||||
|
||||
/**
 * Workaround for source register modes not supported by the math
 * instruction.
 *
 * Returns \p src, or a copy of it in a fresh VGRF when the hardware
 * can't consume it directly.
 */
src_reg
fix_math_operand(const src_reg &src) const
{
   /* Can't do hstride == 0 args on gfx6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gfx6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    *
    * Gfx7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if ((shader->devinfo->ver == 6 &&
        (src.file == IMM || src.file == UNIFORM ||
         src.abs || src.negate)) ||
       (shader->devinfo->ver == 7 && src.file == IMM)) {
      const dst_reg tmp = vgrf(src.type);
      MOV(tmp, src);
      return tmp;
   } else {
      return src;
   }
}
|
||||
|
||||
/* Insertion point for emitted instructions — presumably the containing
 * basic block and the list node to insert after; TODO confirm against
 * the emit() implementation (not visible here).
 */
bblock_t *block;
exec_node *cursor;

/* Execution controls stamped onto emitted instructions: SIMD width
 * (checked by DPAS above), channel group offset, and whether the
 * execution mask is ignored.
 */
unsigned _dispatch_width;
unsigned _group;
bool force_writemask_all;

/** Debug annotation info. */
struct {
   const char *str;
   const void *ir;
} annotation;
};
}
|
||||
|
||||
/* Convenience overload: forwards to the width-based offset() using the
 * builder's dispatch width as the channel count.
 */
static inline fs_reg
offset(const fs_reg &reg, const brw::fs_builder &bld, unsigned delta)
{
   return offset(reg, bld.dispatch_width(), delta);
}
|
||||
|
||||
#endif
|
||||
568
src/intel/compiler/elk/brw_fs_cmod_propagation.cpp
Normal file
568
src/intel/compiler/elk/brw_fs_cmod_propagation.cpp
Normal file
|
|
@ -0,0 +1,568 @@
|
|||
/*
|
||||
* Copyright © 2014 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_fs.h"
|
||||
#include "brw_cfg.h"
|
||||
#include "brw_eu.h"
|
||||
|
||||
/** @file brw_fs_cmod_propagation.cpp
|
||||
*
|
||||
* Implements a pass that propagates the conditional modifier from a CMP x 0.0
|
||||
* instruction into the instruction that generated x. For instance, in this
|
||||
* sequence
|
||||
*
|
||||
* add(8) g70<1>F g69<8,8,1>F 4096F
|
||||
* cmp.ge.f0(8) null g70<8,8,1>F 0F
|
||||
*
|
||||
* we can do the comparison as part of the ADD instruction directly:
|
||||
*
|
||||
* add.ge.f0(8) g70<1>F g69<8,8,1>F 4096F
|
||||
*
|
||||
* If there had been a use of the flag register and another CMP using g70
|
||||
*
|
||||
* add.ge.f0(8) g70<1>F g69<8,8,1>F 4096F
|
||||
* (+f0) sel(8) g71<F> g72<8,8,1>F g73<8,8,1>F
|
||||
* cmp.ge.f0(8) null g70<8,8,1>F 0F
|
||||
*
|
||||
* we can recognize that the CMP is generating the flag value that already
|
||||
* exists and therefore remove the instruction.
|
||||
*/
|
||||
|
||||
using namespace brw;
|
||||
|
||||
/**
 * Try to fold a `cmp.cond null, a, b` (with nonzero b) into an earlier
 * ADD in the same block whose result equals a - b (or b - a, in which
 * case the condition is swapped).  On success the CMP is removed and
 * true is returned.  The backward scan aborts if another instruction
 * writes the flag first, and refuses to overwrite a live flag value.
 */
static bool
cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block,
                          fs_inst *inst)
{
   /* Whether the flag written by inst is read between scan_inst and
    * inst; if so, scan_inst's existing cmod (if any) must match.
    */
   bool read_flag = false;
   const unsigned flags_written = inst->flags_written(devinfo);

   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
      if (scan_inst->opcode == BRW_OPCODE_ADD &&
          !scan_inst->is_partial_write() &&
          scan_inst->exec_size == inst->exec_size) {
         bool negate;

         /* A CMP is basically a subtraction.  The result of the
          * subtraction must be the same as the result of the addition.
          * This means that one of the operands must be negated.  So (a +
          * b) vs (a == -b) or (a + -b) vs (a == b).
          */
         if ((inst->src[0].equals(scan_inst->src[0]) &&
              inst->src[1].negative_equals(scan_inst->src[1])) ||
             (inst->src[0].equals(scan_inst->src[1]) &&
              inst->src[1].negative_equals(scan_inst->src[0]))) {
            negate = false;
         } else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
                     inst->src[1].equals(scan_inst->src[1])) ||
                    (inst->src[0].negative_equals(scan_inst->src[1]) &&
                     inst->src[1].equals(scan_inst->src[0]))) {
            negate = true;
         } else {
            goto not_match;
         }

         /* If the scan instruction writes a different flag register than the
          * instruction we're trying to propagate from, bail.
          *
          * FINISHME: The second part of the condition may be too strong.
          * Perhaps (scan_inst->flags_written() & flags_written) !=
          * flags_written?
          */
         if (scan_inst->flags_written(devinfo) != 0 &&
             scan_inst->flags_written(devinfo) != flags_written)
            goto not_match;

         /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
          *
          *    * Note that the [post condition signal] bits generated at
          *      the output of a compute are before the .sat.
          *
          * Paragraph about post_zero does not mention saturation, but
          * testing it on actual GPUs shows that conditional modifiers
          * are applied after saturation.
          *
          *    * post_zero bit: This bit reflects whether the final
          *      result is zero after all the clamping, normalizing,
          *      or format conversion logic.
          *
          * For signed types we don't care about saturation: it won't
          * change the result of conditional modifier.
          *
          * For floating and unsigned types there two special cases,
          * when we can remove inst even if scan_inst is saturated: G
          * and LE.  Since conditional modifiers are just comparisons
          * against zero, saturating positive values to the upper
          * limit never changes the result of comparison.
          *
          * For negative values:
          * (sat(x) >  0) == (x >  0) --- false
          * (sat(x) <= 0) == (x <= 0) --- true
          */
         const enum brw_conditional_mod cond =
            negate ? brw_swap_cmod(inst->conditional_mod)
                   : inst->conditional_mod;

         if (scan_inst->saturate &&
             (brw_reg_type_is_floating_point(scan_inst->dst.type) ||
              brw_reg_type_is_unsigned_integer(scan_inst->dst.type)) &&
             (cond != BRW_CONDITIONAL_G &&
              cond != BRW_CONDITIONAL_LE))
            goto not_match;

         /* Otherwise, try propagating the conditional. */
         if (scan_inst->can_do_cmod() &&
             ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
              scan_inst->conditional_mod == cond)) {
            scan_inst->conditional_mod = cond;
            scan_inst->flag_subreg = inst->flag_subreg;
            inst->remove(block, true);
            return true;
         }
         break;
      }

   not_match:
      /* Another write to the same flag clobbers what inst would
       * compute; stop scanning.
       */
      if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
         break;

      read_flag = read_flag ||
                  (scan_inst->flags_read(devinfo) & flags_written) != 0;
   }

   return false;
}
|
||||
|
||||
/**
 * Propagate conditional modifiers from NOT instructions
 *
 * Attempt to convert sequences like
 *
 *    or(8)          g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
 *    ...
 *    not.nz.f0(8)   null            g78<8,8,1>UD
 *
 * into
 *
 *    or.z.f0(8)     g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
 *
 * i.e. fold the NOT away by inverting the condition on the instruction
 * that produced its source.  Only Z/NZ conditions can be inverted this
 * way, and only OR/AND producers are handled.
 */
static bool
cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block,
                   fs_inst *inst)
{
   /* NOT inverts all bits, so the inverted condition on the producer
    * gives the same flag result.
    */
   const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
   bool read_flag = false;
   const unsigned flags_written = inst->flags_written(devinfo);

   if (cond != BRW_CONDITIONAL_Z && cond != BRW_CONDITIONAL_NZ)
      return false;

   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
      if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                          inst->src[0], inst->size_read(0))) {
         if (scan_inst->opcode != BRW_OPCODE_OR &&
             scan_inst->opcode != BRW_OPCODE_AND)
            break;

         if (scan_inst->is_partial_write() ||
             scan_inst->dst.offset != inst->src[0].offset ||
             scan_inst->exec_size != inst->exec_size)
            break;

         /* If the scan instruction writes a different flag register than the
          * instruction we're trying to propagate from, bail.
          *
          * FINISHME: The second part of the condition may be too strong.
          * Perhaps (scan_inst->flags_written() & flags_written) !=
          * flags_written?
          */
         if (scan_inst->flags_written(devinfo) != 0 &&
             scan_inst->flags_written(devinfo) != flags_written)
            break;

         if (scan_inst->can_do_cmod() &&
             ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
              scan_inst->conditional_mod == cond)) {
            scan_inst->conditional_mod = cond;
            scan_inst->flag_subreg = inst->flag_subreg;
            inst->remove(block, true);
            return true;
         }
         break;
      }

      /* Stop at any intervening write to the same flag. */
      if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
         break;

      read_flag = read_flag ||
                  (scan_inst->flags_read(devinfo) & flags_written) != 0;
   }

   return false;
}
|
||||
|
||||
/**
 * Run conditional-modifier propagation on a single basic block.
 *
 * Looks for flag-writing AND/CMP/MOV/NOT instructions with a null
 * destination and tries to either fold their conditional modifier into
 * the instruction that produced src[0], or remove them outright when
 * the flag value already exists.  Returns true if any instruction was
 * removed.
 */
static bool
opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
{
   bool progress = false;
   UNUSED int ip = block->end_ip + 1;

   foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
      ip--;

      /* Candidate filter: a null-dst, unpredicated AND/CMP/MOV/NOT
       * whose first source comes from a register file we can track.
       */
      if ((inst->opcode != BRW_OPCODE_AND &&
           inst->opcode != BRW_OPCODE_CMP &&
           inst->opcode != BRW_OPCODE_MOV &&
           inst->opcode != BRW_OPCODE_NOT) ||
          inst->predicate != BRW_PREDICATE_NONE ||
          !inst->dst.is_null() ||
          (inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
           inst->src[0].file != UNIFORM))
         continue;

      /* An ABS source modifier can only be handled when processing a compare
       * with a value other than zero.
       */
      if (inst->src[0].abs &&
          (inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero()))
         continue;

      /* Only an AND.NZ can be propagated.  Many AND.Z instructions are
       * generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code).
       * Propagating those would require inverting the condition on the CMP.
       * This changes both the flag value and the register destination of the
       * CMP.  That result may be used elsewhere, so we can't change its value
       * on a whim.
       */
      if (inst->opcode == BRW_OPCODE_AND &&
          !(inst->src[1].is_one() &&
            inst->conditional_mod == BRW_CONDITIONAL_NZ &&
            !inst->src[0].negate))
         continue;

      /* A CMP with a second source of zero can match with anything.  A CMP
       * with a second source that is not zero can only match with an ADD
       * instruction.
       *
       * Only apply this optimization to float-point sources.  It can fail for
       * integers.  For inputs a = 0x80000000, b = 4, int(0x80000000) < 4, but
       * int(0x80000000) - 4 overflows and results in 0x7ffffffc.  that's not
       * less than zero, so the flags get set differently than for (a < b).
       */
      if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
         if (brw_reg_type_is_floating_point(inst->src[0].type) &&
             cmod_propagate_cmp_to_add(devinfo, block, inst))
            progress = true;

         continue;
      }

      if (inst->opcode == BRW_OPCODE_NOT) {
         progress = cmod_propagate_not(devinfo, block, inst) || progress;
         continue;
      }

      bool read_flag = false;
      const unsigned flags_written = inst->flags_written(devinfo);
      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->src[0], inst->size_read(0))) {
            /* If the scan instruction writes a different flag register than
             * the instruction we're trying to propagate from, bail.
             *
             * FINISHME: The second part of the condition may be too strong.
             * Perhaps (scan_inst->flags_written() & flags_written) !=
             * flags_written?
             */
            if (scan_inst->flags_written(devinfo) != 0 &&
                scan_inst->flags_written(devinfo) != flags_written)
               break;

            if (scan_inst->is_partial_write() ||
                scan_inst->dst.offset != inst->src[0].offset ||
                scan_inst->exec_size != inst->exec_size)
               break;

            /* If the write mask is different we can't propagate. */
            if (scan_inst->force_writemask_all != inst->force_writemask_all)
               break;

            /* CMP's result is the same regardless of dest type. */
            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
                scan_inst->opcode == BRW_OPCODE_CMP &&
                brw_reg_type_is_integer(inst->dst.type)) {
               inst->remove(block, true);
               progress = true;
               break;
            }

            /* If the AND wasn't handled by the previous case, it isn't safe
             * to remove it.
             */
            if (inst->opcode == BRW_OPCODE_AND)
               break;

            if (inst->opcode == BRW_OPCODE_MOV) {
               if (brw_reg_type_is_floating_point(scan_inst->dst.type)) {
                  /* If the destination type of scan_inst is floating-point,
                   * then:
                   *
                   * - The source of the MOV instruction must be the same
                   *   type.
                   *
                   * - The destination of the MOV instruction must be float
                   *   point with a size at least as large as the destination
                   *   of inst.  Size-reducing f2f conversions could cause
                   *   non-zero values to become zero, etc.
                   */
                  if (scan_inst->dst.type != inst->src[0].type)
                     break;

                  if (!brw_reg_type_is_floating_point(inst->dst.type))
                     break;

                  if (type_sz(scan_inst->dst.type) > type_sz(inst->dst.type))
                     break;
               } else {
                  /* If the destination type of scan_inst is integer, then:
                   *
                   * - The source of the MOV instruction must be integer with
                   *   the same size.
                   *
                   * - If the conditional modifier is Z or NZ, then the
                   *   destination type of inst must either be floating point
                   *   (of any size) or integer with a size at least as large
                   *   as the destination of inst.
                   *
                   * - If the conditional modifier is neither Z nor NZ, then the
                   *   destination type of inst must either be floating point
                   *   (of any size) or integer with a size at least as large
                   *   as the destination of inst and the same signedness.
                   */
                  if (!brw_reg_type_is_integer(inst->src[0].type) ||
                      type_sz(scan_inst->dst.type) != type_sz(inst->src[0].type))
                     break;

                  if (brw_reg_type_is_integer(inst->dst.type)) {
                     if (type_sz(inst->dst.type) < type_sz(scan_inst->dst.type))
                        break;

                     if (inst->conditional_mod != BRW_CONDITIONAL_Z &&
                         inst->conditional_mod != BRW_CONDITIONAL_NZ &&
                         brw_reg_type_is_unsigned_integer(inst->dst.type) !=
                         brw_reg_type_is_unsigned_integer(scan_inst->dst.type))
                        break;
                  }
               }
            } else {
               /* Not safe to use inequality operators if the types are
                * different.
                */
               if (scan_inst->dst.type != inst->src[0].type &&
                   inst->conditional_mod != BRW_CONDITIONAL_Z &&
                   inst->conditional_mod != BRW_CONDITIONAL_NZ)
                  break;

               /* Comparisons operate differently for ints and floats */
               if (scan_inst->dst.type != inst->dst.type) {
                  /* Comparison result may be altered if the bit-size changes
                   * since that affects range, denorms, etc
                   */
                  if (type_sz(scan_inst->dst.type) != type_sz(inst->dst.type))
                     break;

                  if (brw_reg_type_is_floating_point(scan_inst->dst.type) !=
                      brw_reg_type_is_floating_point(inst->dst.type))
                     break;
               }
            }

            /* Knowing following:
             * - CMP writes to flag register the result of
             *   applying cmod to the `src0 - src1`.
             *   After that it stores the same value to dst.
             *   Other instructions first store their result to
             *   dst, and then store cmod(dst) to the flag
             *   register.
             * - inst is either CMP or MOV
             * - inst->dst is null
             * - inst->src[0] overlaps with scan_inst->dst
             * - inst->src[1] is zero
             * - scan_inst wrote to a flag register
             *
             * There can be three possible paths:
             *
             * - scan_inst is CMP:
             *
             *   Considering that src0 is either 0x0 (false),
             *   or 0xffffffff (true), and src1 is 0x0:
             *
             *   - If inst's cmod is NZ, we can always remove
             *     scan_inst: NZ is invariant for false and true.  This
             *     holds even if src0 is NaN: .nz is the only cmod,
             *     that returns true for NaN.
             *
             *   - .g is invariant if src0 has a UD type
             *
             *   - .l is invariant if src0 has a D type
             *
             * - scan_inst and inst have the same cmod:
             *
             *   If scan_inst is anything than CMP, it already
             *   wrote the appropriate value to the flag register.
             *
             * - else:
             *
             *   We can change cmod of scan_inst to that of inst,
             *   and remove inst.  It is valid as long as we make
             *   sure that no instruction uses the flag register
             *   between scan_inst and inst.
             */
            if (!inst->src[0].negate &&
                scan_inst->flags_written(devinfo)) {
               if (scan_inst->opcode == BRW_OPCODE_CMP) {
                  if ((inst->conditional_mod == BRW_CONDITIONAL_NZ) ||
                      (inst->conditional_mod == BRW_CONDITIONAL_G &&
                       inst->src[0].type == BRW_REGISTER_TYPE_UD) ||
                      (inst->conditional_mod == BRW_CONDITIONAL_L &&
                       inst->src[0].type == BRW_REGISTER_TYPE_D)) {
                     inst->remove(block, true);
                     progress = true;
                     break;
                  }
               } else if (scan_inst->conditional_mod == inst->conditional_mod) {
                  /* On Gfx4 and Gfx5 sel.cond will dirty the flags, but the
                   * flags value is not based on the result stored in the
                   * destination.  On all other platforms sel.cond will not
                   * write the flags, so execution will not get to this point.
                   */
                  if (scan_inst->opcode == BRW_OPCODE_SEL) {
                     assert(devinfo->ver <= 5);
                  } else {
                     inst->remove(block, true);
                     progress = true;
                  }

                  break;
               } else if (!read_flag && scan_inst->can_do_cmod()) {
                  scan_inst->conditional_mod = inst->conditional_mod;
                  scan_inst->flag_subreg = inst->flag_subreg;
                  inst->remove(block, true);
                  progress = true;
                  break;
               }
            }

            /* The conditional mod of the CMP/CMPN instructions behaves
             * specially because the flag output is not calculated from the
             * result of the instruction, but the other way around, which
             * means that even if the condmod to propagate and the condmod
             * from the CMP instruction are the same they will in general give
             * different results because they are evaluated based on different
             * inputs.
             */
            if (scan_inst->opcode == BRW_OPCODE_CMP ||
                scan_inst->opcode == BRW_OPCODE_CMPN)
               break;

            /* From the Sky Lake PRM, Vol 2a, "Multiply":
             *
             *    "When multiplying integer data types, if one of the sources
             *    is a DW, the resulting full precision data is stored in
             *    the accumulator.  However, if the destination data type is
             *    either W or DW, the low bits of the result are written to
             *    the destination register and the remaining high bits are
             *    discarded.  This results in undefined Overflow and Sign
             *    flags.  Therefore, conditional modifiers and saturation
             *    (.sat) cannot be used in this case."
             *
             * We just disallow cmod propagation on all integer multiplies.
             */
            if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
                scan_inst->opcode == BRW_OPCODE_MUL)
               break;

            enum brw_conditional_mod cond =
               inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
                                   : inst->conditional_mod;

            /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
             *
             *    * Note that the [post condition signal] bits generated at
             *      the output of a compute are before the .sat.
             *
             * Paragraph about post_zero does not mention saturation, but
             * testing it on actual GPUs shows that conditional modifiers are
             * applied after saturation.
             *
             *    * post_zero bit: This bit reflects whether the final
             *      result is zero after all the clamping, normalizing,
             *      or format conversion logic.
             *
             * For this reason, no additional restrictions are necessary on
             * instructions with saturate.
             */

            /* Otherwise, try propagating the conditional. */
            if (scan_inst->can_do_cmod() &&
                ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
                 scan_inst->conditional_mod == cond)) {
               scan_inst->conditional_mod = cond;
               scan_inst->flag_subreg = inst->flag_subreg;
               inst->remove(block, true);
               progress = true;
            }
            break;
         }

         /* Stop scanning at any intervening write to the same flag. */
         if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
            break;

         read_flag = read_flag ||
                     (scan_inst->flags_read(devinfo) & flags_written) != 0;
      }
   }

   /* There is progress if and only if instructions were removed. */
   assert(progress == (block->end_ip_delta != 0));

   return progress;
}
|
||||
|
||||
bool
|
||||
fs_visitor::opt_cmod_propagation()
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
foreach_block_reverse(block, cfg) {
|
||||
progress = opt_cmod_propagation_local(devinfo, block) || progress;
|
||||
}
|
||||
|
||||
if (progress) {
|
||||
cfg->adjust_block_ips();
|
||||
|
||||
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
}
|
||||
|
||||
return progress;
|
||||
}
|
||||
1858
src/intel/compiler/elk/brw_fs_combine_constants.cpp
Normal file
1858
src/intel/compiler/elk/brw_fs_combine_constants.cpp
Normal file
File diff suppressed because it is too large
Load diff
1468
src/intel/compiler/elk/brw_fs_copy_propagation.cpp
Normal file
1468
src/intel/compiler/elk/brw_fs_copy_propagation.cpp
Normal file
File diff suppressed because it is too large
Load diff
396
src/intel/compiler/elk/brw_fs_cse.cpp
Normal file
396
src/intel/compiler/elk/brw_fs_cse.cpp
Normal file
|
|
@ -0,0 +1,396 @@
|
|||
/*
|
||||
* Copyright © 2012 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_fs.h"
|
||||
#include "brw_fs_builder.h"
|
||||
#include "brw_cfg.h"
|
||||
|
||||
/** @file brw_fs_cse.cpp
|
||||
*
|
||||
* Support for local common subexpression elimination.
|
||||
*
|
||||
* See Muchnick's Advanced Compiler Design and Implementation, section
|
||||
* 13.1 (p378).
|
||||
*/
|
||||
|
||||
using namespace brw;
|
||||
|
||||
namespace {
/* An entry in the "available expressions" table (AEB): a previously seen
 * value-generating instruction together with the temporary (if one has been
 * created yet) holding its result for reuse.
 */
struct aeb_entry : public exec_node {
   /** The instruction that generates the expression value. */
   fs_inst *generator;

   /** The temporary where the value is stored. */
   fs_reg tmp;
};
}
||||
|
||||
/* Return true when \p inst computes a pure value that is a candidate for
 * common subexpression elimination: re-executing it (or reusing its result)
 * has no observable side effects.
 */
static bool
is_expression(const fs_visitor *v, const fs_inst *const inst)
{
   switch (inst->opcode) {
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_SEL:
   case BRW_OPCODE_NOT:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_XOR:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_CMP:
   case BRW_OPCODE_CMPN:
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_MUL:
   case SHADER_OPCODE_MULH:
   case BRW_OPCODE_FRC:
   case BRW_OPCODE_RNDU:
   case BRW_OPCODE_RNDD:
   case BRW_OPCODE_RNDE:
   case BRW_OPCODE_RNDZ:
   case BRW_OPCODE_LINE:
   case BRW_OPCODE_PLN:
   case BRW_OPCODE_MAD:
   case BRW_OPCODE_LRP:
   case FS_OPCODE_FB_READ_LOGICAL:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
   case FS_OPCODE_LINTERP:
   case SHADER_OPCODE_FIND_LIVE_CHANNEL:
   case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
   case FS_OPCODE_LOAD_LIVE_CHANNELS:
   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_MOV_INDIRECT:
   case SHADER_OPCODE_TEX_LOGICAL:
   case SHADER_OPCODE_TXD_LOGICAL:
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXL_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TXF_UMS_LOGICAL:
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
   case SHADER_OPCODE_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case FS_OPCODE_PACK:
      return true;
   /* Math instructions: only treated as CSE-able when the message length is
    * small (mlen < 2) — presumably larger messages carry extra payload that
    * makes reuse unsafe; TODO confirm against the generator.
    */
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return inst->mlen < 2;
   case SHADER_OPCODE_LOAD_PAYLOAD:
      return !is_coalescing_payload(v->alloc, inst);
   default:
      /* Any other send-from-GRF is fair game as long as it has no side
       * effects and is not volatile.
       */
      return inst->is_send_from_grf() && !inst->has_side_effects() &&
         !inst->is_volatile();
   }
}
|
||||
|
||||
/* Compare the source operands of two instructions for CSE purposes.
 *
 * MAD and commutative opcodes are matched with their commutable operand
 * pairs swapped as well.  For float MUL, sources that differ only in sign
 * are also considered a match: in that case *negate is set so the caller
 * can negate the reused value.  *negate is only meaningful when this
 * function takes the float-MUL path.
 */
static bool
operands_match(const fs_inst *a, const fs_inst *b, bool *negate)
{
   fs_reg *xs = a->src;
   fs_reg *ys = b->src;

   if (a->opcode == BRW_OPCODE_MAD) {
      /* src[0] is the addend; src[1]/src[2] multiply and may commute. */
      return xs[0].equals(ys[0]) &&
             ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) ||
              (xs[2].equals(ys[1]) && xs[1].equals(ys[2])));
   } else if (a->opcode == BRW_OPCODE_MUL && a->dst.type == BRW_REGISTER_TYPE_F) {
      /* Capture the effective signs (for immediates the sign lives in the
       * value itself rather than the negate flag).
       */
      bool xs0_negate = xs[0].negate;
      bool xs1_negate = xs[1].file == IMM ? xs[1].f < 0.0f
                                          : xs[1].negate;
      bool ys0_negate = ys[0].negate;
      bool ys1_negate = ys[1].file == IMM ? ys[1].f < 0.0f
                                          : ys[1].negate;
      float xs1_imm = xs[1].f;
      float ys1_imm = ys[1].f;

      /* Temporarily strip the signs so equals() compares magnitudes only.
       * The original state is restored below before returning.
       */
      xs[0].negate = false;
      xs[1].negate = false;
      ys[0].negate = false;
      ys[1].negate = false;
      xs[1].f = fabsf(xs[1].f);
      ys[1].f = fabsf(ys[1].f);

      bool ret = (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
                 (xs[1].equals(ys[0]) && xs[0].equals(ys[1]));

      /* Restore the operands exactly as they were. */
      xs[0].negate = xs0_negate;
      xs[1].negate = xs[1].file == IMM ? false : xs1_negate;
      ys[0].negate = ys0_negate;
      ys[1].negate = ys[1].file == IMM ? false : ys1_negate;
      xs[1].f = xs1_imm;
      ys[1].f = ys1_imm;

      /* The products differ in sign iff exactly one of them has an odd
       * number of negated factors.
       */
      *negate = (xs0_negate != xs1_negate) != (ys0_negate != ys1_negate);
      /* A negated reuse would interact with saturation, so refuse it. */
      if (*negate && (a->saturate || b->saturate))
         return false;
      return ret;
   } else if (!a->is_commutative()) {
      bool match = true;
      for (int i = 0; i < a->sources; i++) {
         if (!xs[i].equals(ys[i])) {
            match = false;
            break;
         }
      }
      return match;
   } else {
      return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
             (xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
   }
}
|
||||
|
||||
/* Decide whether two instructions compute the same value: the opcode, all
 * execution controls, flag state, destination type, and message/descriptor
 * fields must agree, and the operands must match (possibly commuted — see
 * operands_match, which also sets *negate for sign-only differences).
 */
static bool
instructions_match(fs_inst *a, fs_inst *b, bool *negate)
{
   return a->opcode == b->opcode &&
          a->force_writemask_all == b->force_writemask_all &&
          a->exec_size == b->exec_size &&
          a->group == b->group &&
          a->saturate == b->saturate &&
          a->predicate == b->predicate &&
          a->predicate_inverse == b->predicate_inverse &&
          a->conditional_mod == b->conditional_mod &&
          a->flag_subreg == b->flag_subreg &&
          a->dst.type == b->dst.type &&
          a->offset == b->offset &&
          a->mlen == b->mlen &&
          a->ex_mlen == b->ex_mlen &&
          a->sfid == b->sfid &&
          a->desc == b->desc &&
          a->size_written == b->size_written &&
          a->base_mrf == b->base_mrf &&
          a->check_tdr == b->check_tdr &&
          a->send_has_side_effects == b->send_has_side_effects &&
          a->eot == b->eot &&
          a->header_size == b->header_size &&
          a->shadow_compare == b->shadow_compare &&
          a->pi_noperspective == b->pi_noperspective &&
          a->target == b->target &&
          a->sources == b->sources &&
          operands_match(a, b, negate);
}
|
||||
|
||||
/* Emit a copy of \p src into inst->dst at the builder's insertion point.
 *
 * A plain MOV only covers a single register-sized destination, so wide
 * results (and LOAD_PAYLOAD results, whose sources may have mixed types and
 * a header) are copied with a LOAD_PAYLOAD built from register-sized slices
 * of \p src.  \p negate is only honored on the simple-MOV path — the callers
 * only pass negate = true for float MUL results, which take that path.
 */
static void
create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
{
   unsigned written = regs_written(inst);
   unsigned dst_width =
      DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
   fs_inst *copy;

   if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
      assert(src.file == VGRF);
      fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg,
                                     inst->sources);
      /* Header sources advance by whole physical registers... */
      for (int i = 0; i < inst->header_size; i++) {
         payload[i] = src;
         src.offset += REG_SIZE;
      }
      /* ...while the remaining sources take the original source types and
       * advance by one logical component each.
       */
      for (int i = inst->header_size; i < inst->sources; i++) {
         src.type = inst->src[i].type;
         payload[i] = src;
         src = offset(src, bld, 1);
      }
      copy = bld.LOAD_PAYLOAD(inst->dst, payload, inst->sources,
                              inst->header_size);
   } else if (written != dst_width) {
      /* Result wider than one SIMD-width worth of destination: split into
       * dst_width-sized pieces and reassemble with a headerless payload.
       */
      assert(src.file == VGRF);
      assert(written % dst_width == 0);
      const int sources = written / dst_width;
      fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources);
      for (int i = 0; i < sources; i++) {
         payload[i] = src;
         src = offset(src, bld, 1);
      }
      copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, 0);
   } else {
      copy = bld.MOV(inst->dst, src);
      copy->group = inst->group;
      copy->force_writemask_all = inst->force_writemask_all;
      copy->src[0].negate = negate;
   }
   assert(regs_written(copy) == written);
}
|
||||
|
||||
/* Perform local (single basic block) common subexpression elimination.
 *
 * Walks the block keeping a list of "available expressions" (AEB).  When an
 * instruction matches an earlier generator, the generator's result is moved
 * into a fresh temporary (created lazily on the second sighting) and the
 * duplicate is replaced by a copy from that temporary.  After processing
 * each instruction, AEB entries invalidated by it are discarded.
 *
 * \p ip is the global instruction pointer, advanced across calls so live
 * ranges from \p live can be consulted.  Returns true if any instruction
 * was removed.
 */
bool
fs_visitor::opt_cse_local(const fs_live_variables &live, bblock_t *block, int &ip)
{
   bool progress = false;
   exec_list aeb;

   /* All AEB entries are allocated out of this context and freed at once. */
   void *cse_ctx = ralloc_context(NULL);

   foreach_inst_in_block(fs_inst, inst, block) {
      /* Skip some cases. */
      if (is_expression(this, inst) && !inst->is_partial_write() &&
          ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
           inst->dst.is_null()))
      {
         bool found = false;
         bool negate = false;

         /* NOTE: foreach_in_list_use_after leaves `entry` in scope after the
          * loop; the else-branch below relies on it pointing at the match.
          */
         foreach_in_list_use_after(aeb_entry, entry, &aeb) {
            /* Match current instruction's expression against those in AEB. */
            if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) &&
                instructions_match(inst, entry->generator, &negate)) {
               found = true;
               progress = true;
               break;
            }
         }

         if (!found) {
            /* Plain MOVs are not worth CSE'ing, except immediate VF loads. */
            if (inst->opcode != BRW_OPCODE_MOV ||
                (inst->opcode == BRW_OPCODE_MOV &&
                 inst->src[0].file == IMM &&
                 inst->src[0].type == BRW_REGISTER_TYPE_VF)) {
               /* Our first sighting of this expression.  Create an entry. */
               aeb_entry *entry = ralloc(cse_ctx, aeb_entry);
               entry->tmp = reg_undef;
               entry->generator = inst;
               aeb.push_tail(entry);
            }
         } else {
            /* This is at least our second sighting of this expression.
             * If we don't have a temporary already, make one.
             */
            bool no_existing_temp = entry->tmp.file == BAD_FILE;
            if (no_existing_temp && !entry->generator->dst.is_null()) {
               /* Insert the copy right after the generator, and retarget
                * the generator to write the new temporary.
                */
               const fs_builder ibld = fs_builder(this, block, entry->generator)
                                       .at(block, entry->generator->next);
               int written = regs_written(entry->generator);

               entry->tmp = fs_reg(VGRF, alloc.allocate(written),
                                   entry->generator->dst.type);

               create_copy_instr(ibld, entry->generator, entry->tmp, false);

               entry->generator->dst = entry->tmp;
            }

            /* dest <- temp */
            if (!inst->dst.is_null()) {
               assert(inst->size_written == entry->generator->size_written);
               assert(inst->dst.type == entry->tmp.type);
               const fs_builder ibld(this, block, inst);

               create_copy_instr(ibld, inst, entry->tmp, negate);
            }

            /* Set our iterator so that next time through the loop inst->next
             * will get the instruction in the basic block after the one we've
             * removed.
             */
            fs_inst *prev = (fs_inst *)inst->prev;

            inst->remove(block);
            inst = prev;
         }
      }

      /* Discard jumps aren't represented in the CFG unfortunately, so we need
       * to make sure that they behave as a CSE barrier, since we lack global
       * dataflow information.  This is particularly likely to cause problems
       * with instructions dependent on the current execution mask like
       * SHADER_OPCODE_FIND_LIVE_CHANNEL.
       */
      if (inst->opcode == BRW_OPCODE_HALT ||
          inst->opcode == SHADER_OPCODE_HALT_TARGET)
         aeb.make_empty();

      foreach_in_list_safe(aeb_entry, entry, &aeb) {
         /* Kill all AEB entries that write a different value to or read from
          * the flag register if we just wrote it.
          */
         if (inst->flags_written(devinfo)) {
            bool negate; /* dummy */
            if (entry->generator->flags_read(devinfo) ||
                (entry->generator->flags_written(devinfo) &&
                 !instructions_match(inst, entry->generator, &negate))) {
               entry->remove();
               ralloc_free(entry);
               continue;
            }
         }

         for (int i = 0; i < entry->generator->sources; i++) {
            fs_reg *src_reg = &entry->generator->src[i];

            /* Kill all AEB entries that use the destination we just
             * overwrote.
             */
            if (regions_overlap(inst->dst, inst->size_written,
                                entry->generator->src[i],
                                entry->generator->size_read(i))) {
               entry->remove();
               ralloc_free(entry);
               break;
            }

            /* Kill any AEB entries using registers that don't get reused any
             * more -- a sure sign they'll fail operands_match().
             */
            if (src_reg->file == VGRF && live.vgrf_end[src_reg->nr] < ip) {
               entry->remove();
               ralloc_free(entry);
               break;
            }
         }
      }

      ip++;
   }

   ralloc_free(cse_ctx);

   return progress;
}
|
||||
|
||||
bool
|
||||
fs_visitor::opt_cse()
|
||||
{
|
||||
const fs_live_variables &live = live_analysis.require();
|
||||
bool progress = false;
|
||||
int ip = 0;
|
||||
|
||||
foreach_block (block, cfg) {
|
||||
progress = opt_cse_local(live, block, ip) || progress;
|
||||
}
|
||||
|
||||
if (progress)
|
||||
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
||||
|
||||
return progress;
|
||||
}
|
||||
152
src/intel/compiler/elk/brw_fs_dead_code_eliminate.cpp
Normal file
152
src/intel/compiler/elk/brw_fs_dead_code_eliminate.cpp
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
/*
|
||||
* Copyright © 2014 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_fs.h"
|
||||
#include "brw_fs_live_variables.h"
|
||||
#include "brw_cfg.h"
|
||||
|
||||
/** @file brw_fs_dead_code_eliminate.cpp
|
||||
*
|
||||
* Dataflow-aware dead code elimination.
|
||||
*
|
||||
* Walks the instruction list from the bottom, removing instructions that
|
||||
* have results that both aren't used in later blocks and haven't been read
|
||||
* yet in the tail end of this block.
|
||||
*/
|
||||
|
||||
using namespace brw;
|
||||
|
||||
/**
|
||||
* Is it safe to eliminate the instruction?
|
||||
*/
|
||||
static bool
|
||||
can_eliminate(const intel_device_info *devinfo, const fs_inst *inst,
|
||||
BITSET_WORD *flag_live)
|
||||
{
|
||||
return !inst->is_control_flow() &&
|
||||
!inst->has_side_effects() &&
|
||||
!(flag_live[0] & inst->flags_written(devinfo)) &&
|
||||
!inst->writes_accumulator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is it safe to omit the write, making the destination ARF null?
|
||||
*/
|
||||
static bool
|
||||
can_omit_write(const fs_inst *inst)
|
||||
{
|
||||
switch (inst->opcode) {
|
||||
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
|
||||
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
|
||||
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
|
||||
return true;
|
||||
default:
|
||||
/* We can eliminate the destination write for ordinary instructions,
|
||||
* but not most SENDs.
|
||||
*/
|
||||
if (inst->opcode < 128 && inst->mlen == 0)
|
||||
return true;
|
||||
|
||||
/* It might not be safe for other virtual opcodes. */
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/* Dataflow-aware dead code elimination.
 *
 * Walks each block bottom-up, maintaining the set of VGRF components and
 * flag bits live at the current point (seeded from the block's liveout).
 * Instructions whose results are dead get their destination nulled and,
 * if nothing else keeps them, are removed.  Returns true on any change.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;

   const fs_live_variables &live_vars = live_analysis.require();
   int num_vars = live_vars.num_vars;
   BITSET_WORD *live = rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(num_vars));
   BITSET_WORD *flag_live = rzalloc_array(NULL, BITSET_WORD, 1);

   foreach_block_reverse_safe(block, cfg) {
      /* Start from what is live out of the block. */
      memcpy(live, live_vars.block_data[block->num].liveout,
             sizeof(BITSET_WORD) * BITSET_WORDS(num_vars));
      memcpy(flag_live, live_vars.block_data[block->num].flag_liveout,
             sizeof(BITSET_WORD));

      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
         if (inst->dst.file == VGRF) {
            const unsigned var = live_vars.var_from_reg(inst->dst);
            bool result_live = false;

            /* The result is dead only if every written component is dead. */
            for (unsigned i = 0; i < regs_written(inst); i++)
               result_live |= BITSET_TEST(live, var + i);

            if (!result_live &&
                (can_omit_write(inst) || can_eliminate(devinfo, inst, flag_live))) {
               /* Redirect the write to the null register, preserving type
                * and stride.
                */
               inst->dst = fs_reg(spread(retype(brw_null_reg(), inst->dst.type),
                                         inst->dst.stride));
               progress = true;
            }
         }

         /* A null-destination instruction with no other effects becomes a
          * NOP here and is physically removed further down.
          */
         if (inst->dst.is_null() && can_eliminate(devinfo, inst, flag_live)) {
            inst->opcode = BRW_OPCODE_NOP;
            progress = true;
         }

         /* A complete write kills liveness of the written components. */
         if (inst->dst.file == VGRF) {
            if (!inst->is_partial_write()) {
               const unsigned var = live_vars.var_from_reg(inst->dst);
               for (unsigned i = 0; i < regs_written(inst); i++) {
                  BITSET_CLEAR(live, var + i);
               }
            }
         }

         /* Only an unpredicated, sufficiently wide write fully redefines
          * the flag bits; assumes exec_size >= 8 covers a whole flag
          * subregister — TODO confirm.
          */
         if (!inst->predicate && inst->exec_size >= 8)
            flag_live[0] &= ~inst->flags_written(devinfo);

         if (inst->opcode == BRW_OPCODE_NOP) {
            inst->remove(block, true);
            continue;
         }

         /* Reads make their source components live. */
         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file == VGRF) {
               int var = live_vars.var_from_reg(inst->src[i]);

               for (unsigned j = 0; j < regs_read(inst, i); j++) {
                  BITSET_SET(live, var + j);
               }
            }
         }

         flag_live[0] |= inst->flags_read(devinfo);
      }
   }

   cfg->adjust_block_ips();

   ralloc_free(live);
   ralloc_free(flag_live);

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}
|
||||
2544
src/intel/compiler/elk/brw_fs_generator.cpp
Normal file
2544
src/intel/compiler/elk/brw_fs_generator.cpp
Normal file
File diff suppressed because it is too large
Load diff
371
src/intel/compiler/elk/brw_fs_live_variables.cpp
Normal file
371
src/intel/compiler/elk/brw_fs_live_variables.cpp
Normal file
|
|
@ -0,0 +1,371 @@
|
|||
/*
|
||||
* Copyright © 2012 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* Authors:
|
||||
* Eric Anholt <eric@anholt.net>
|
||||
*
|
||||
*/
|
||||
|
||||
#include "brw_fs.h"
|
||||
#include "brw_fs_live_variables.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
#define MAX_INSTRUCTION (1 << 30)
|
||||
|
||||
/** @file brw_fs_live_variables.cpp
|
||||
*
|
||||
* Support for calculating liveness information about virtual GRFs.
|
||||
*
|
||||
* This produces a live interval for each whole virtual GRF. We could
|
||||
* choose to expose per-component live intervals for VGRFs of size > 1,
|
||||
* but we currently do not. It is easier for the consumers of this
|
||||
* information to work with whole VGRFs.
|
||||
*
|
||||
* However, we internally track use/def information at the per-GRF level for
|
||||
* greater accuracy. Large VGRFs may be accessed piecemeal over many
|
||||
* (possibly non-adjacent) instructions. In this case, examining a single
|
||||
* instruction is insufficient to decide whether a whole VGRF is ultimately
|
||||
* used or defined. Tracking individual components allows us to easily
|
||||
* assemble this information.
|
||||
*
|
||||
* See Muchnick's Advanced Compiler Design and Implementation, section
|
||||
* 14.1 (p444).
|
||||
*/
|
||||
|
||||
void
|
||||
fs_live_variables::setup_one_read(struct block_data *bd,
|
||||
int ip, const fs_reg ®)
|
||||
{
|
||||
int var = var_from_reg(reg);
|
||||
assert(var < num_vars);
|
||||
|
||||
start[var] = MIN2(start[var], ip);
|
||||
end[var] = MAX2(end[var], ip);
|
||||
|
||||
/* The use[] bitset marks when the block makes use of a variable (VGRF
|
||||
* channel) without having completely defined that variable within the
|
||||
* block.
|
||||
*/
|
||||
if (!BITSET_TEST(bd->def, var))
|
||||
BITSET_SET(bd->use, var);
|
||||
}
|
||||
|
||||
void
|
||||
fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst,
|
||||
int ip, const fs_reg ®)
|
||||
{
|
||||
int var = var_from_reg(reg);
|
||||
assert(var < num_vars);
|
||||
|
||||
start[var] = MIN2(start[var], ip);
|
||||
end[var] = MAX2(end[var], ip);
|
||||
|
||||
/* The def[] bitset marks when an initialization in a block completely
|
||||
* screens off previous updates of that variable (VGRF channel).
|
||||
*/
|
||||
if (inst->dst.file == VGRF) {
|
||||
if (!inst->is_partial_write() && !BITSET_TEST(bd->use, var))
|
||||
BITSET_SET(bd->def, var);
|
||||
|
||||
BITSET_SET(bd->defout, var);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Sets up the use[] and def[] bitsets.
 *
 * The basic-block-level live variable analysis needs to know which
 * variables get used before they're completely defined, and which
 * variables are completely defined before they're used.
 *
 * These are tracked at the per-component level, rather than whole VGRFs.
 */
void
fs_live_variables::setup_def_use()
{
   int ip = 0;

   foreach_block (block, cfg) {
      /* Sanity-check that block IPs are consistent with our running ip. */
      assert(ip == block->start_ip);
      if (block->num > 0)
         assert(cfg->blocks[block->num - 1]->end_ip == ip - 1);

      struct block_data *bd = &block_data[block->num];

      foreach_inst_in_block(fs_inst, inst, block) {
         /* Set use[] for this instruction */
         for (unsigned int i = 0; i < inst->sources; i++) {
            fs_reg reg = inst->src[i];

            if (reg.file != VGRF)
               continue;

            /* One read per register-sized slice of the source region. */
            for (unsigned j = 0; j < regs_read(inst, i); j++) {
               setup_one_read(bd, ip, reg);
               reg.offset += REG_SIZE;
            }
         }

         /* Flag reads are upward-exposed unless already defined here. */
         bd->flag_use[0] |= inst->flags_read(devinfo) & ~bd->flag_def[0];

         /* Set def[] for this instruction */
         if (inst->dst.file == VGRF) {
            fs_reg reg = inst->dst;
            for (unsigned j = 0; j < regs_written(inst); j++) {
               setup_one_write(bd, inst, ip, reg);
               reg.offset += REG_SIZE;
            }
         }

         /* Only an unpredicated, sufficiently wide write counts as a full
          * flag definition; assumes exec_size >= 8 covers a whole flag
          * subregister — TODO confirm.
          */
         if (!inst->predicate && inst->exec_size >= 8)
            bd->flag_def[0] |= inst->flags_written(devinfo) & ~bd->flag_use[0];

         ip++;
      }
   }
}
|
||||
|
||||
/**
 * The algorithm incrementally sets bits in liveout and livein,
 * propagating it through control flow.  It will eventually terminate
 * because it only ever adds bits, and stops when no bits are added in
 * a pass.
 */
void
fs_live_variables::compute_live_variables()
{
   bool cont = true;

   /* Propagate defin and defout down the CFG to calculate the union of live
    * variables potentially defined along any possible control flow path.
    */
   do {
      cont = false;

      foreach_block (block, cfg) {
         const struct block_data *bd = &block_data[block->num];

         foreach_list_typed(bblock_link, child_link, link, &block->children) {
            struct block_data *child_bd = &block_data[child_link->block->num];

            for (int i = 0; i < bitset_words; i++) {
               const BITSET_WORD new_def = bd->defout[i] & ~child_bd->defin[i];
               child_bd->defin[i] |= new_def;
               child_bd->defout[i] |= new_def;
               cont |= new_def;
            }
         }
      }
   } while (cont);

   /* Backward liveness fixed point: iterate blocks in reverse until no
    * livein bits change.  Note only livein changes set `cont`; liveout
    * updates are picked up when the predecessor recomputes its livein.
    */
   do {
      cont = false;

      foreach_block_reverse (block, cfg) {
         struct block_data *bd = &block_data[block->num];

         /* Update liveout */
         foreach_list_typed(bblock_link, child_link, link, &block->children) {
            struct block_data *child_bd = &block_data[child_link->block->num];

            for (int i = 0; i < bitset_words; i++) {
               BITSET_WORD new_liveout = (child_bd->livein[i] &
                                          ~bd->liveout[i]);
               new_liveout &= bd->defout[i]; /* Screen off uses with no reaching def */
               if (new_liveout)
                  bd->liveout[i] |= new_liveout;
            }
            /* Flag liveness is a single word and has no def screening. */
            BITSET_WORD new_liveout = (child_bd->flag_livein[0] &
                                       ~bd->flag_liveout[0]);
            if (new_liveout)
               bd->flag_liveout[0] |= new_liveout;
         }

         /* Update livein */
         for (int i = 0; i < bitset_words; i++) {
            BITSET_WORD new_livein = (bd->use[i] |
                                      (bd->liveout[i] &
                                       ~bd->def[i]));
            new_livein &= bd->defin[i]; /* Screen off uses with no reaching def */
            if (new_livein & ~bd->livein[i]) {
               bd->livein[i] |= new_livein;
               cont = true;
            }
         }
         BITSET_WORD new_livein = (bd->flag_use[0] |
                                   (bd->flag_liveout[0] &
                                    ~bd->flag_def[0]));
         if (new_livein & ~bd->flag_livein[0]) {
            bd->flag_livein[0] |= new_livein;
            cont = true;
         }
      }
   } while (cont);
}
|
||||
|
||||
/**
|
||||
* Extend the start/end ranges for each variable to account for the
|
||||
* new information calculated from control flow.
|
||||
*/
|
||||
void
|
||||
fs_live_variables::compute_start_end()
|
||||
{
|
||||
foreach_block (block, cfg) {
|
||||
struct block_data *bd = &block_data[block->num];
|
||||
unsigned i;
|
||||
|
||||
BITSET_FOREACH_SET(i, bd->livein, (unsigned)num_vars) {
|
||||
start[i] = MIN2(start[i], block->start_ip);
|
||||
end[i] = MAX2(end[i], block->start_ip);
|
||||
}
|
||||
|
||||
BITSET_FOREACH_SET(i, bd->liveout, (unsigned)num_vars) {
|
||||
start[i] = MIN2(start[i], block->end_ip);
|
||||
end[i] = MAX2(end[i], block->end_ip);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Build the complete liveness analysis for shader \p s: allocate the
 * per-variable and per-block data structures, run the def/use scan and the
 * dataflow fixed point, then fold per-component ranges into per-VGRF ones.
 * All allocations are parented to mem_ctx and freed in the destructor.
 */
fs_live_variables::fs_live_variables(const backend_shader *s)
   : devinfo(s->devinfo), cfg(s->cfg)
{
   mem_ctx = ralloc_context(NULL);
   linear_ctx *lin_ctx = linear_context(mem_ctx);

   /* Each VGRF contributes one variable per allocated register, so
    * var_from_vgrf[i] is the index of VGRF i's first component variable.
    */
   num_vgrfs = s->alloc.count;
   num_vars = 0;
   var_from_vgrf = linear_zalloc_array(lin_ctx, int, num_vgrfs);
   for (int i = 0; i < num_vgrfs; i++) {
      var_from_vgrf[i] = num_vars;
      num_vars += s->alloc.sizes[i];
   }

   /* Inverse mapping: component variable index back to owning VGRF. */
   vgrf_from_var = linear_zalloc_array(lin_ctx, int, num_vars);
   for (int i = 0; i < num_vgrfs; i++) {
      for (unsigned j = 0; j < s->alloc.sizes[i]; j++) {
         vgrf_from_var[var_from_vgrf[i] + j] = i;
      }
   }

   /* Initialize intervals to "empty" (start > end). */
   start = ralloc_array(mem_ctx, int, num_vars);
   end = linear_zalloc_array(lin_ctx, int, num_vars);
   for (int i = 0; i < num_vars; i++) {
      start[i] = MAX_INSTRUCTION;
      end[i] = -1;
   }

   vgrf_start = ralloc_array(mem_ctx, int, num_vgrfs);
   vgrf_end = ralloc_array(mem_ctx, int, num_vgrfs);
   for (int i = 0; i < num_vgrfs; i++) {
      vgrf_start[i] = MAX_INSTRUCTION;
      vgrf_end[i] = -1;
   }

   block_data = linear_zalloc_array(lin_ctx, struct block_data, cfg->num_blocks);

   bitset_words = BITSET_WORDS(num_vars);
   for (int i = 0; i < cfg->num_blocks; i++) {
      block_data[i].def = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      block_data[i].use = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      block_data[i].livein = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      block_data[i].liveout = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      block_data[i].defin = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      block_data[i].defout = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);

      block_data[i].flag_def[0] = 0;
      block_data[i].flag_use[0] = 0;
      block_data[i].flag_livein[0] = 0;
      block_data[i].flag_liveout[0] = 0;
   }

   setup_def_use();
   compute_live_variables();
   compute_start_end();

   /* Merge the per-component live ranges to whole VGRF live ranges. */
   for (int i = 0; i < num_vars; i++) {
      const unsigned vgrf = vgrf_from_var[i];
      vgrf_start[vgrf] = MIN2(vgrf_start[vgrf], start[i]);
      vgrf_end[vgrf] = MAX2(vgrf_end[vgrf], end[i]);
   }
}
|
||||
|
||||
fs_live_variables::~fs_live_variables()
|
||||
{
|
||||
ralloc_free(mem_ctx);
|
||||
}
|
||||
|
||||
static bool
|
||||
check_register_live_range(const fs_live_variables *live, int ip,
|
||||
const fs_reg ®, unsigned n)
|
||||
{
|
||||
const unsigned var = live->var_from_reg(reg);
|
||||
|
||||
if (var + n > unsigned(live->num_vars) ||
|
||||
live->vgrf_start[reg.nr] > ip || live->vgrf_end[reg.nr] < ip)
|
||||
return false;
|
||||
|
||||
for (unsigned j = 0; j < n; j++) {
|
||||
if (live->start[var + j] > ip || live->end[var + j] < ip)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
fs_live_variables::validate(const backend_shader *s) const
|
||||
{
|
||||
int ip = 0;
|
||||
|
||||
foreach_block_and_inst(block, fs_inst, inst, s->cfg) {
|
||||
for (unsigned i = 0; i < inst->sources; i++) {
|
||||
if (inst->src[i].file == VGRF &&
|
||||
!check_register_live_range(this, ip,
|
||||
inst->src[i], regs_read(inst, i)))
|
||||
return false;
|
||||
}
|
||||
|
||||
if (inst->dst.file == VGRF &&
|
||||
!check_register_live_range(this, ip, inst->dst, regs_written(inst)))
|
||||
return false;
|
||||
|
||||
ip++;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
fs_live_variables::vars_interfere(int a, int b) const
|
||||
{
|
||||
return !(end[b] <= start[a] ||
|
||||
end[a] <= start[b]);
|
||||
}
|
||||
|
||||
bool
|
||||
fs_live_variables::vgrfs_interfere(int a, int b) const
|
||||
{
|
||||
return !(vgrf_end[a] <= vgrf_start[b] ||
|
||||
vgrf_end[b] <= vgrf_start[a]);
|
||||
}
|
||||
148
src/intel/compiler/elk/brw_fs_live_variables.h
Normal file
148
src/intel/compiler/elk/brw_fs_live_variables.h
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
/*
|
||||
* Copyright © 2012 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* Authors:
|
||||
* Eric Anholt <eric@anholt.net>
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef BRW_FS_LIVE_VARIABLES_H
|
||||
#define BRW_FS_LIVE_VARIABLES_H
|
||||
|
||||
#include "brw_ir_analysis.h"
|
||||
#include "brw_ir_fs.h"
|
||||
#include "util/bitset.h"
|
||||
|
||||
struct cfg_t;
|
||||
struct backend_shader;
|
||||
|
||||
namespace brw {
|
||||
|
||||
class fs_live_variables {
|
||||
public:
|
||||
struct block_data {
|
||||
/**
|
||||
* Which variables are defined before being used in the block.
|
||||
*
|
||||
* Note that for our purposes, "defined" means unconditionally, completely
|
||||
* defined.
|
||||
*/
|
||||
BITSET_WORD *def;
|
||||
|
||||
/**
|
||||
* Which variables are used before being defined in the block.
|
||||
*/
|
||||
BITSET_WORD *use;
|
||||
|
||||
/** Which defs reach the entry point of the block. */
|
||||
BITSET_WORD *livein;
|
||||
|
||||
/** Which defs reach the exit point of the block. */
|
||||
BITSET_WORD *liveout;
|
||||
|
||||
/**
|
||||
* Variables such that the entry point of the block may be reached from any
|
||||
* of their definitions.
|
||||
*/
|
||||
BITSET_WORD *defin;
|
||||
|
||||
/**
|
||||
* Variables such that the exit point of the block may be reached from any
|
||||
* of their definitions.
|
||||
*/
|
||||
BITSET_WORD *defout;
|
||||
|
||||
BITSET_WORD flag_def[1];
|
||||
BITSET_WORD flag_use[1];
|
||||
BITSET_WORD flag_livein[1];
|
||||
BITSET_WORD flag_liveout[1];
|
||||
};
|
||||
|
||||
fs_live_variables(const backend_shader *s);
|
||||
~fs_live_variables();
|
||||
|
||||
bool validate(const backend_shader *s) const;
|
||||
|
||||
analysis_dependency_class
|
||||
dependency_class() const
|
||||
{
|
||||
return (DEPENDENCY_INSTRUCTION_IDENTITY |
|
||||
DEPENDENCY_INSTRUCTION_DATA_FLOW |
|
||||
DEPENDENCY_VARIABLES);
|
||||
}
|
||||
|
||||
bool vars_interfere(int a, int b) const;
|
||||
bool vgrfs_interfere(int a, int b) const;
|
||||
int var_from_reg(const fs_reg ®) const
|
||||
{
|
||||
return var_from_vgrf[reg.nr] + reg.offset / REG_SIZE;
|
||||
}
|
||||
|
||||
/** Map from virtual GRF number to index in block_data arrays. */
|
||||
int *var_from_vgrf;
|
||||
|
||||
/**
|
||||
* Map from any index in block_data to the virtual GRF containing it.
|
||||
*
|
||||
* For alloc.sizes of [1, 2, 3], vgrf_from_var would contain
|
||||
* [0, 1, 1, 2, 2, 2].
|
||||
*/
|
||||
int *vgrf_from_var;
|
||||
|
||||
int num_vars;
|
||||
int num_vgrfs;
|
||||
int bitset_words;
|
||||
|
||||
/** @{
|
||||
* Final computed live ranges for each var (each component of each virtual
|
||||
* GRF).
|
||||
*/
|
||||
int *start;
|
||||
int *end;
|
||||
/** @} */
|
||||
|
||||
/** @{
|
||||
* Final computed live ranges for each VGRF.
|
||||
*/
|
||||
int *vgrf_start;
|
||||
int *vgrf_end;
|
||||
/** @} */
|
||||
|
||||
/** Per-basic-block information on live variables */
|
||||
struct block_data *block_data;
|
||||
|
||||
protected:
|
||||
void setup_def_use();
|
||||
void setup_one_read(struct block_data *bd, int ip, const fs_reg ®);
|
||||
void setup_one_write(struct block_data *bd, fs_inst *inst, int ip,
|
||||
const fs_reg ®);
|
||||
void compute_live_variables();
|
||||
void compute_start_end();
|
||||
|
||||
const struct intel_device_info *devinfo;
|
||||
const cfg_t *cfg;
|
||||
void *mem_ctx;
|
||||
};
|
||||
|
||||
} /* namespace brw */
|
||||
|
||||
#endif /* BRW_FS_LIVE_VARIABLES_H */
|
||||
306
src/intel/compiler/elk/brw_fs_lower_dpas.cpp
Normal file
306
src/intel/compiler/elk/brw_fs_lower_dpas.cpp
Normal file
|
|
@ -0,0 +1,306 @@
|
|||
/*
|
||||
* Copyright 2023 Intel Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "brw_fs.h"
|
||||
#include "brw_fs_builder.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
static void
|
||||
f16_using_mac(const fs_builder &bld, fs_inst *inst)
|
||||
{
|
||||
/* We only intend to support configurations where the destination and
|
||||
* accumulator have the same type.
|
||||
*/
|
||||
if (!inst->src[0].is_null())
|
||||
assert(inst->dst.type == inst->src[0].type);
|
||||
|
||||
assert(inst->src[1].type == BRW_REGISTER_TYPE_HF);
|
||||
assert(inst->src[2].type == BRW_REGISTER_TYPE_HF);
|
||||
|
||||
const brw_reg_type src0_type = inst->dst.type;
|
||||
const brw_reg_type src1_type = BRW_REGISTER_TYPE_HF;
|
||||
const brw_reg_type src2_type = BRW_REGISTER_TYPE_HF;
|
||||
|
||||
const fs_reg dest = inst->dst;
|
||||
fs_reg src0 = inst->src[0];
|
||||
const fs_reg src1 = retype(inst->src[1], src1_type);
|
||||
const fs_reg src2 = retype(inst->src[2], src2_type);
|
||||
|
||||
const unsigned dest_stride =
|
||||
dest.type == BRW_REGISTER_TYPE_HF ? REG_SIZE / 2 : REG_SIZE;
|
||||
|
||||
for (unsigned r = 0; r < inst->rcount; r++) {
|
||||
fs_reg temp = bld.vgrf(BRW_REGISTER_TYPE_HF, 1);
|
||||
|
||||
for (unsigned subword = 0; subword < 2; subword++) {
|
||||
for (unsigned s = 0; s < inst->sdepth; s++) {
|
||||
/* The first multiply of the dot-product operation has to
|
||||
* explicitly write the accumulator register. The successive MAC
|
||||
* instructions will implicitly read *and* write the
|
||||
* accumulator. Those MAC instructions can also optionally
|
||||
* explicitly write some other register.
|
||||
*
|
||||
* FINISHME: The accumulator can actually hold 16 HF values. On
|
||||
* Gfx12 there are two accumulators. It should be possible to do
|
||||
* this in SIMD16 or even SIMD32. I was unable to get this to work
|
||||
* properly.
|
||||
*/
|
||||
if (s == 0 && subword == 0) {
|
||||
const unsigned acc_width = 8;
|
||||
fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD),
|
||||
inst->group % acc_width);
|
||||
|
||||
if (bld.shader->devinfo->verx10 >= 125) {
|
||||
acc = subscript(acc, BRW_REGISTER_TYPE_HF, subword);
|
||||
} else {
|
||||
acc = retype(acc, BRW_REGISTER_TYPE_HF);
|
||||
}
|
||||
|
||||
bld.MUL(acc,
|
||||
subscript(retype(byte_offset(src1, s * REG_SIZE),
|
||||
BRW_REGISTER_TYPE_UD),
|
||||
BRW_REGISTER_TYPE_HF, subword),
|
||||
component(retype(byte_offset(src2, r * REG_SIZE),
|
||||
BRW_REGISTER_TYPE_HF),
|
||||
s * 2 + subword))
|
||||
->writes_accumulator = true;
|
||||
|
||||
} else {
|
||||
fs_reg result;
|
||||
|
||||
/* As mentioned above, the MAC had an optional, explicit
|
||||
* destination register. Various optimization passes are not
|
||||
* clever enough to understand the intricacies of this
|
||||
* instruction, so only write the result register on the final
|
||||
* MAC in the sequence.
|
||||
*/
|
||||
if ((s + 1) == inst->sdepth && subword == 1)
|
||||
result = temp;
|
||||
else
|
||||
result = retype(bld.null_reg_ud(), BRW_REGISTER_TYPE_HF);
|
||||
|
||||
bld.MAC(result,
|
||||
subscript(retype(byte_offset(src1, s * REG_SIZE),
|
||||
BRW_REGISTER_TYPE_UD),
|
||||
BRW_REGISTER_TYPE_HF, subword),
|
||||
component(retype(byte_offset(src2, r * REG_SIZE),
|
||||
BRW_REGISTER_TYPE_HF),
|
||||
s * 2 + subword))
|
||||
->writes_accumulator = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!src0.is_null()) {
|
||||
if (src0_type != BRW_REGISTER_TYPE_HF) {
|
||||
fs_reg temp2 = bld.vgrf(src0_type, 1);
|
||||
|
||||
bld.MOV(temp2, temp);
|
||||
|
||||
bld.ADD(byte_offset(dest, r * dest_stride),
|
||||
temp2,
|
||||
byte_offset(src0, r * dest_stride));
|
||||
} else {
|
||||
bld.ADD(byte_offset(dest, r * dest_stride),
|
||||
temp,
|
||||
byte_offset(src0, r * dest_stride));
|
||||
}
|
||||
} else {
|
||||
bld.MOV(byte_offset(dest, r * dest_stride), temp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
int8_using_dp4a(const fs_builder &bld, fs_inst *inst)
|
||||
{
|
||||
/* We only intend to support configurations where the destination and
|
||||
* accumulator have the same type.
|
||||
*/
|
||||
if (!inst->src[0].is_null())
|
||||
assert(inst->dst.type == inst->src[0].type);
|
||||
|
||||
assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
|
||||
inst->src[1].type == BRW_REGISTER_TYPE_UB);
|
||||
assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
|
||||
inst->src[2].type == BRW_REGISTER_TYPE_UB);
|
||||
|
||||
const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
|
||||
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
|
||||
|
||||
const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
|
||||
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
|
||||
|
||||
fs_reg dest = inst->dst;
|
||||
fs_reg src0 = inst->src[0];
|
||||
const fs_reg src1 = retype(inst->src[1], src1_type);
|
||||
const fs_reg src2 = retype(inst->src[2], src2_type);
|
||||
|
||||
const unsigned dest_stride = REG_SIZE;
|
||||
|
||||
for (unsigned r = 0; r < inst->rcount; r++) {
|
||||
if (!src0.is_null()) {
|
||||
bld.MOV(dest, src0);
|
||||
src0 = byte_offset(src0, dest_stride);
|
||||
} else {
|
||||
bld.MOV(dest, retype(brw_imm_d(0), dest.type));
|
||||
}
|
||||
|
||||
for (unsigned s = 0; s < inst->sdepth; s++) {
|
||||
bld.DP4A(dest,
|
||||
dest,
|
||||
byte_offset(src1, s * REG_SIZE),
|
||||
component(byte_offset(src2, r * REG_SIZE), s))
|
||||
->saturate = inst->saturate;
|
||||
}
|
||||
|
||||
dest = byte_offset(dest, dest_stride);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
int8_using_mul_add(const fs_builder &bld, fs_inst *inst)
|
||||
{
|
||||
/* We only intend to support configurations where the destination and
|
||||
* accumulator have the same type.
|
||||
*/
|
||||
if (!inst->src[0].is_null())
|
||||
assert(inst->dst.type == inst->src[0].type);
|
||||
|
||||
assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
|
||||
inst->src[1].type == BRW_REGISTER_TYPE_UB);
|
||||
assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
|
||||
inst->src[2].type == BRW_REGISTER_TYPE_UB);
|
||||
|
||||
const brw_reg_type src0_type = inst->dst.type;
|
||||
|
||||
const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
|
||||
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
|
||||
|
||||
const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
|
||||
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
|
||||
|
||||
fs_reg dest = inst->dst;
|
||||
fs_reg src0 = inst->src[0];
|
||||
const fs_reg src1 = retype(inst->src[1], src1_type);
|
||||
const fs_reg src2 = retype(inst->src[2], src2_type);
|
||||
|
||||
const unsigned dest_stride = REG_SIZE;
|
||||
|
||||
for (unsigned r = 0; r < inst->rcount; r++) {
|
||||
if (!src0.is_null()) {
|
||||
bld.MOV(dest, src0);
|
||||
src0 = byte_offset(src0, dest_stride);
|
||||
} else {
|
||||
bld.MOV(dest, retype(brw_imm_d(0), dest.type));
|
||||
}
|
||||
|
||||
for (unsigned s = 0; s < inst->sdepth; s++) {
|
||||
fs_reg temp1 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
fs_reg temp2 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
fs_reg temp3 = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
|
||||
const brw_reg_type temp_type =
|
||||
(inst->src[1].type == BRW_REGISTER_TYPE_B ||
|
||||
inst->src[2].type == BRW_REGISTER_TYPE_B)
|
||||
? BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW;
|
||||
|
||||
/* Expand 8 dwords of packed bytes into 16 dwords of packed
|
||||
* words.
|
||||
*
|
||||
* FINISHME: Gfx9 should not need this work around. Gfx11
|
||||
* may be able to use integer MAD. Both platforms may be
|
||||
* able to use MAC.
|
||||
*/
|
||||
bld.group(32, 0).MOV(retype(temp3, temp_type),
|
||||
retype(byte_offset(src2, r * REG_SIZE),
|
||||
inst->src[2].type));
|
||||
|
||||
bld.MUL(subscript(temp1, temp_type, 0),
|
||||
subscript(retype(byte_offset(src1, s * REG_SIZE),
|
||||
BRW_REGISTER_TYPE_UD),
|
||||
inst->src[1].type, 0),
|
||||
subscript(component(retype(temp3,
|
||||
BRW_REGISTER_TYPE_UD),
|
||||
s * 2),
|
||||
temp_type, 0));
|
||||
|
||||
bld.MUL(subscript(temp1, temp_type, 1),
|
||||
subscript(retype(byte_offset(src1, s * REG_SIZE),
|
||||
BRW_REGISTER_TYPE_UD),
|
||||
inst->src[1].type, 1),
|
||||
subscript(component(retype(temp3,
|
||||
BRW_REGISTER_TYPE_UD),
|
||||
s * 2),
|
||||
temp_type, 1));
|
||||
|
||||
bld.MUL(subscript(temp2, temp_type, 0),
|
||||
subscript(retype(byte_offset(src1, s * REG_SIZE),
|
||||
BRW_REGISTER_TYPE_UD),
|
||||
inst->src[1].type, 2),
|
||||
subscript(component(retype(temp3,
|
||||
BRW_REGISTER_TYPE_UD),
|
||||
s * 2 + 1),
|
||||
temp_type, 0));
|
||||
|
||||
bld.MUL(subscript(temp2, temp_type, 1),
|
||||
subscript(retype(byte_offset(src1, s * REG_SIZE),
|
||||
BRW_REGISTER_TYPE_UD),
|
||||
inst->src[1].type, 3),
|
||||
subscript(component(retype(temp3,
|
||||
BRW_REGISTER_TYPE_UD),
|
||||
s * 2 + 1),
|
||||
temp_type, 1));
|
||||
|
||||
bld.ADD(subscript(temp1, src0_type, 0),
|
||||
subscript(temp1, temp_type, 0),
|
||||
subscript(temp1, temp_type, 1));
|
||||
|
||||
bld.ADD(subscript(temp2, src0_type, 0),
|
||||
subscript(temp2, temp_type, 0),
|
||||
subscript(temp2, temp_type, 1));
|
||||
|
||||
bld.ADD(retype(temp1, src0_type),
|
||||
retype(temp1, src0_type),
|
||||
retype(temp2, src0_type));
|
||||
|
||||
bld.ADD(dest, dest, retype(temp1, src0_type))
|
||||
->saturate = inst->saturate;
|
||||
}
|
||||
|
||||
dest = byte_offset(dest, dest_stride);
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
brw_lower_dpas(fs_visitor &v)
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
foreach_block_and_inst_safe(block, fs_inst, inst, v.cfg) {
|
||||
if (inst->opcode != BRW_OPCODE_DPAS)
|
||||
continue;
|
||||
|
||||
const fs_builder bld = fs_builder(&v, block, inst).group(8, 0).exec_all();
|
||||
|
||||
if (brw_reg_type_is_floating_point(inst->dst.type)) {
|
||||
f16_using_mac(bld, inst);
|
||||
} else {
|
||||
if (v.devinfo->ver >= 12) {
|
||||
int8_using_dp4a(bld, inst);
|
||||
} else {
|
||||
int8_using_mul_add(bld, inst);
|
||||
}
|
||||
}
|
||||
|
||||
inst->remove(block);
|
||||
progress = true;
|
||||
}
|
||||
|
||||
if (progress)
|
||||
v.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
|
||||
return progress;
|
||||
}
|
||||
92
src/intel/compiler/elk/brw_fs_lower_pack.cpp
Normal file
92
src/intel/compiler/elk/brw_fs_lower_pack.cpp
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
/*
|
||||
* Copyright © 2015 Connor Abbott
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "util/half_float.h"
|
||||
#include "brw_fs.h"
|
||||
#include "brw_cfg.h"
|
||||
#include "brw_fs_builder.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
bool
|
||||
fs_visitor::lower_pack()
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
||||
if (inst->opcode != FS_OPCODE_PACK &&
|
||||
inst->opcode != FS_OPCODE_PACK_HALF_2x16_SPLIT)
|
||||
continue;
|
||||
|
||||
assert(inst->dst.file == VGRF);
|
||||
assert(inst->saturate == false);
|
||||
fs_reg dst = inst->dst;
|
||||
|
||||
const fs_builder ibld(this, block, inst);
|
||||
/* The lowering generates 2 instructions for what was previously 1. This
|
||||
* can trick the IR to believe we're doing partial writes, but the
|
||||
* register is actually fully written. Mark it as undef to help the IR
|
||||
* reduce the liveness of the register.
|
||||
*/
|
||||
if (!inst->is_partial_write())
|
||||
ibld.emit_undef_for_dst(inst);
|
||||
|
||||
switch (inst->opcode) {
|
||||
case FS_OPCODE_PACK:
|
||||
for (unsigned i = 0; i < inst->sources; i++)
|
||||
ibld.MOV(subscript(dst, inst->src[i].type, i), inst->src[i]);
|
||||
break;
|
||||
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
|
||||
assert(dst.type == BRW_REGISTER_TYPE_UD);
|
||||
|
||||
for (unsigned i = 0; i < inst->sources; i++) {
|
||||
if (inst->src[i].file == IMM) {
|
||||
const uint32_t half = _mesa_float_to_half(inst->src[i].f);
|
||||
ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, i),
|
||||
brw_imm_uw(half));
|
||||
} else if (i == 1 && devinfo->ver < 9) {
|
||||
/* Pre-Skylake requires DWord aligned destinations */
|
||||
fs_reg tmp = ibld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
ibld.F32TO16(subscript(tmp, BRW_REGISTER_TYPE_HF, 0),
|
||||
inst->src[i]);
|
||||
ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, 1),
|
||||
subscript(tmp, BRW_REGISTER_TYPE_UW, 0));
|
||||
} else {
|
||||
ibld.F32TO16(subscript(dst, BRW_REGISTER_TYPE_HF, i),
|
||||
inst->src[i]);
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
unreachable("skipped above");
|
||||
}
|
||||
|
||||
inst->remove(block);
|
||||
progress = true;
|
||||
}
|
||||
|
||||
if (progress)
|
||||
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
|
||||
return progress;
|
||||
}
|
||||
677
src/intel/compiler/elk/brw_fs_lower_regioning.cpp
Normal file
677
src/intel/compiler/elk/brw_fs_lower_regioning.cpp
Normal file
|
|
@ -0,0 +1,677 @@
|
|||
/*
|
||||
* Copyright © 2018 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_fs.h"
|
||||
#include "brw_cfg.h"
|
||||
#include "brw_fs_builder.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
namespace {
|
||||
/* From the SKL PRM Vol 2a, "Move":
|
||||
*
|
||||
* "A mov with the same source and destination type, no source modifier,
|
||||
* and no saturation is a raw move. A packed byte destination region (B
|
||||
* or UB type with HorzStride == 1 and ExecSize > 1) can only be written
|
||||
* using raw move."
|
||||
*/
|
||||
bool
|
||||
is_byte_raw_mov(const fs_inst *inst)
|
||||
{
|
||||
return type_sz(inst->dst.type) == 1 &&
|
||||
inst->opcode == BRW_OPCODE_MOV &&
|
||||
inst->src[0].type == inst->dst.type &&
|
||||
!inst->saturate &&
|
||||
!inst->src[0].negate &&
|
||||
!inst->src[0].abs;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return an acceptable byte stride for the destination of an instruction
|
||||
* that requires it to have some particular alignment.
|
||||
*/
|
||||
unsigned
|
||||
required_dst_byte_stride(const fs_inst *inst)
|
||||
{
|
||||
if (inst->dst.is_accumulator()) {
|
||||
/* If the destination is an accumulator, insist that we leave the
|
||||
* stride alone. We cannot "fix" accumulator destinations by writing
|
||||
* to a temporary and emitting a MOV into the original destination.
|
||||
* For multiply instructions (our one use of the accumulator), the
|
||||
* MUL writes the full 66 bits of the accumulator whereas the MOV we
|
||||
* would emit only writes 33 bits and leaves the top 33 bits
|
||||
* undefined.
|
||||
*
|
||||
* It's safe to just require the original stride here because the
|
||||
* lowering pass will detect the mismatch in has_invalid_src_region
|
||||
* and fix the sources of the multiply instead of the destination.
|
||||
*/
|
||||
return inst->dst.stride * type_sz(inst->dst.type);
|
||||
} else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
|
||||
!is_byte_raw_mov(inst)) {
|
||||
return get_exec_type_size(inst);
|
||||
} else {
|
||||
/* Calculate the maximum byte stride and the minimum/maximum type
|
||||
* size across all source and destination operands we are required to
|
||||
* lower.
|
||||
*/
|
||||
unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
|
||||
unsigned min_size = type_sz(inst->dst.type);
|
||||
unsigned max_size = type_sz(inst->dst.type);
|
||||
|
||||
for (unsigned i = 0; i < inst->sources; i++) {
|
||||
if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
|
||||
const unsigned size = type_sz(inst->src[i].type);
|
||||
max_stride = MAX2(max_stride, inst->src[i].stride * size);
|
||||
min_size = MIN2(min_size, size);
|
||||
max_size = MAX2(max_size, size);
|
||||
}
|
||||
}
|
||||
|
||||
/* All operands involved in lowering need to fit in the calculated
|
||||
* stride.
|
||||
*/
|
||||
assert(max_size <= 4 * min_size);
|
||||
|
||||
/* Attempt to use the largest byte stride among all present operands,
|
||||
* but never exceed a stride of 4 since that would lead to illegal
|
||||
* destination regions during lowering.
|
||||
*/
|
||||
return MIN2(max_stride, 4 * min_size);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Return an acceptable byte sub-register offset for the destination of an
|
||||
* instruction that requires it to be aligned to the sub-register offset of
|
||||
* the sources.
|
||||
*/
|
||||
unsigned
|
||||
required_dst_byte_offset(const intel_device_info *devinfo, const fs_inst *inst)
|
||||
{
|
||||
for (unsigned i = 0; i < inst->sources; i++) {
|
||||
if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
|
||||
if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
|
||||
reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
|
||||
return 0;
|
||||
}
|
||||
|
||||
return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the closest legal execution type for an instruction on
|
||||
* the specified platform.
|
||||
*/
|
||||
brw_reg_type
|
||||
required_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
|
||||
{
|
||||
const brw_reg_type t = get_exec_type(inst);
|
||||
const bool has_64bit = brw_reg_type_is_floating_point(t) ?
|
||||
devinfo->has_64bit_float : devinfo->has_64bit_int;
|
||||
|
||||
switch (inst->opcode) {
|
||||
case SHADER_OPCODE_SHUFFLE:
|
||||
/* IVB has an issue (which we found empirically) where it reads
|
||||
* two address register components per channel for indirectly
|
||||
* addressed 64-bit sources.
|
||||
*
|
||||
* From the Cherryview PRM Vol 7. "Register Region Restrictions":
|
||||
*
|
||||
* "When source or destination datatype is 64b or operation is
|
||||
* integer DWord multiply, indirect addressing must not be
|
||||
* used."
|
||||
*
|
||||
* Work around both of the above and handle platforms that
|
||||
* don't support 64-bit types at all.
|
||||
*/
|
||||
if ((!devinfo->has_64bit_int ||
|
||||
devinfo->platform == INTEL_PLATFORM_CHV ||
|
||||
intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
|
||||
return BRW_REGISTER_TYPE_UD;
|
||||
else if (has_dst_aligned_region_restriction(devinfo, inst))
|
||||
return brw_int_type(type_sz(t), false);
|
||||
else
|
||||
return t;
|
||||
|
||||
case SHADER_OPCODE_SEL_EXEC:
|
||||
if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
|
||||
type_sz(t) > 4)
|
||||
return BRW_REGISTER_TYPE_UD;
|
||||
else
|
||||
return t;
|
||||
|
||||
case SHADER_OPCODE_QUAD_SWIZZLE:
|
||||
if (has_dst_aligned_region_restriction(devinfo, inst))
|
||||
return brw_int_type(type_sz(t), false);
|
||||
else
|
||||
return t;
|
||||
|
||||
case SHADER_OPCODE_CLUSTER_BROADCAST:
|
||||
/* From the Cherryview PRM Vol 7. "Register Region Restrictions":
|
||||
*
|
||||
* "When source or destination datatype is 64b or operation is
|
||||
* integer DWord multiply, indirect addressing must not be
|
||||
* used."
|
||||
*
|
||||
* For MTL (verx10 == 125), float64 is supported, but int64 is not.
|
||||
* Therefore we need to lower cluster broadcast using 32-bit int ops.
|
||||
*
|
||||
* For gfx12.5+ platforms that support int64, the register regions
|
||||
* used by cluster broadcast aren't supported by the 64-bit pipeline.
|
||||
*
|
||||
* Work around the above and handle platforms that don't
|
||||
* support 64-bit types at all.
|
||||
*/
|
||||
if ((!has_64bit || devinfo->verx10 >= 125 ||
|
||||
devinfo->platform == INTEL_PLATFORM_CHV ||
|
||||
intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
|
||||
return BRW_REGISTER_TYPE_UD;
|
||||
else
|
||||
return brw_int_type(type_sz(t), false);
|
||||
|
||||
case SHADER_OPCODE_BROADCAST:
|
||||
case SHADER_OPCODE_MOV_INDIRECT:
|
||||
if (((devinfo->verx10 == 70 ||
|
||||
devinfo->platform == INTEL_PLATFORM_CHV ||
|
||||
intel_device_info_is_9lp(devinfo) ||
|
||||
devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
|
||||
(devinfo->verx10 >= 125 &&
|
||||
brw_reg_type_is_floating_point(inst->src[0].type)))
|
||||
return brw_int_type(type_sz(t), false);
|
||||
else
|
||||
return t;
|
||||
|
||||
default:
|
||||
return t;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the stride between channels of the specified register in
|
||||
* byte units, or ~0u if the region cannot be represented with a
|
||||
* single one-dimensional stride.
|
||||
*/
|
||||
unsigned
|
||||
byte_stride(const fs_reg ®)
|
||||
{
|
||||
switch (reg.file) {
|
||||
case BAD_FILE:
|
||||
case UNIFORM:
|
||||
case IMM:
|
||||
case VGRF:
|
||||
case MRF:
|
||||
case ATTR:
|
||||
return reg.stride * type_sz(reg.type);
|
||||
case ARF:
|
||||
case FIXED_GRF:
|
||||
if (reg.is_null()) {
|
||||
return 0;
|
||||
} else {
|
||||
const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
|
||||
const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
|
||||
const unsigned width = 1 << reg.width;
|
||||
|
||||
if (width == 1) {
|
||||
return vstride * type_sz(reg.type);
|
||||
} else if (hstride * width == vstride) {
|
||||
return hstride * type_sz(reg.type);
|
||||
} else {
|
||||
return ~0u;
|
||||
}
|
||||
}
|
||||
default:
|
||||
unreachable("Invalid register file");
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Return whether the instruction has an unsupported channel bit layout
|
||||
* specified for the i-th source region.
|
||||
*/
|
||||
bool
|
||||
has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
|
||||
unsigned i)
|
||||
{
|
||||
if (is_send(inst) || inst->is_math() || inst->is_control_source(i) ||
|
||||
inst->opcode == BRW_OPCODE_DPAS) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Empirical testing shows that Broadwell has a bug affecting half-float
|
||||
* MAD instructions when any of its sources has a non-zero offset, such
|
||||
* as:
|
||||
*
|
||||
* mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
|
||||
*
|
||||
* We used to generate code like this for SIMD8 executions where we
|
||||
* used to pack components Y and W of a vector at offset 16B of a SIMD
|
||||
* register. The problem doesn't occur if the stride of the source is 0.
|
||||
*/
|
||||
if (devinfo->ver == 8 &&
|
||||
inst->opcode == BRW_OPCODE_MAD &&
|
||||
inst->src[i].type == BRW_REGISTER_TYPE_HF &&
|
||||
reg_offset(inst->src[i]) % REG_SIZE > 0 &&
|
||||
inst->src[i].stride != 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
|
||||
const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
|
||||
|
||||
return has_dst_aligned_region_restriction(devinfo, inst) &&
|
||||
!is_uniform(inst->src[i]) &&
|
||||
(byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
|
||||
src_byte_offset != dst_byte_offset);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return whether the instruction has an unsupported channel bit layout
|
||||
* specified for the destination region.
|
||||
*/
|
||||
bool
|
||||
has_invalid_dst_region(const intel_device_info *devinfo,
|
||||
const fs_inst *inst)
|
||||
{
|
||||
if (is_send(inst) || inst->is_math()) {
|
||||
return false;
|
||||
} else {
|
||||
const brw_reg_type exec_type = get_exec_type(inst);
|
||||
const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
|
||||
const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
|
||||
type_sz(inst->dst.type) < type_sz(exec_type);
|
||||
|
||||
return (has_dst_aligned_region_restriction(devinfo, inst) &&
|
||||
(required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
|
||||
required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
|
||||
(is_narrowing_conversion &&
|
||||
required_dst_byte_stride(inst) != byte_stride(inst->dst));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a non-zero value if the execution type of the instruction is
|
||||
* unsupported. The destination and sources matching the returned mask
|
||||
* will be bit-cast to an integer type of appropriate size, lowering any
|
||||
* source or destination modifiers into separate MOV instructions.
|
||||
*/
|
||||
unsigned
|
||||
has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
|
||||
{
|
||||
if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
|
||||
switch (inst->opcode) {
|
||||
case SHADER_OPCODE_SHUFFLE:
|
||||
case SHADER_OPCODE_QUAD_SWIZZLE:
|
||||
case SHADER_OPCODE_CLUSTER_BROADCAST:
|
||||
case SHADER_OPCODE_BROADCAST:
|
||||
case SHADER_OPCODE_MOV_INDIRECT:
|
||||
return 0x1;
|
||||
|
||||
case SHADER_OPCODE_SEL_EXEC:
|
||||
return 0x3;
|
||||
|
||||
default:
|
||||
unreachable("Unknown invalid execution type source mask.");
|
||||
}
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Return whether the instruction has unsupported source modifiers
|
||||
* specified for the i-th source region.
|
||||
*/
|
||||
bool
|
||||
has_invalid_src_modifiers(const intel_device_info *devinfo,
|
||||
const fs_inst *inst, unsigned i)
|
||||
{
|
||||
return (!inst->can_do_source_mods(devinfo) &&
|
||||
(inst->src[i].negate || inst->src[i].abs)) ||
|
||||
((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
|
||||
(inst->src[i].negate || inst->src[i].abs ||
|
||||
inst->src[i].type != get_exec_type(inst)));
|
||||
}
|
||||
|
||||
/*
|
||||
* Return whether the instruction has an unsupported type conversion
|
||||
* specified for the destination.
|
||||
*/
|
||||
bool
|
||||
has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
|
||||
{
|
||||
switch (inst->opcode) {
|
||||
case BRW_OPCODE_MOV:
|
||||
return false;
|
||||
case BRW_OPCODE_SEL:
|
||||
return inst->dst.type != get_exec_type(inst);
|
||||
default:
|
||||
/* FIXME: We assume the opcodes not explicitly mentioned before just
|
||||
* work fine with arbitrary conversions, unless they need to be
|
||||
* bit-cast.
|
||||
*/
|
||||
return has_invalid_exec_type(devinfo, inst) &&
|
||||
inst->dst.type != get_exec_type(inst);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether the instruction has unsupported destination modifiers.
|
||||
*/
|
||||
bool
|
||||
has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
|
||||
{
|
||||
return (has_invalid_exec_type(devinfo, inst) &&
|
||||
(inst->saturate || inst->conditional_mod)) ||
|
||||
has_invalid_conversion(devinfo, inst);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether the instruction has non-standard semantics for the
|
||||
* conditional mod which don't cause the flag register to be updated with
|
||||
* the comparison result.
|
||||
*/
|
||||
bool
|
||||
has_inconsistent_cmod(const fs_inst *inst)
|
||||
{
|
||||
return inst->opcode == BRW_OPCODE_SEL ||
|
||||
inst->opcode == BRW_OPCODE_CSEL ||
|
||||
inst->opcode == BRW_OPCODE_IF ||
|
||||
inst->opcode == BRW_OPCODE_WHILE;
|
||||
}
|
||||
|
||||
bool
|
||||
lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
|
||||
}
|
||||
|
||||
namespace brw {
|
||||
/**
|
||||
* Remove any modifiers from the \p i-th source region of the instruction,
|
||||
* including negate, abs and any implicit type conversion to the execution
|
||||
* type. Instead any source modifiers will be implemented as a separate
|
||||
* MOV instruction prior to the original instruction.
|
||||
*/
|
||||
bool
|
||||
lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
|
||||
{
|
||||
assert(inst->components_read(i) == 1);
|
||||
assert(v->devinfo->has_integer_dword_mul ||
|
||||
inst->opcode != BRW_OPCODE_MUL ||
|
||||
brw_reg_type_is_floating_point(get_exec_type(inst)) ||
|
||||
MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
|
||||
type_sz(inst->src[i].type) == get_exec_type_size(inst));
|
||||
|
||||
const fs_builder ibld(v, block, inst);
|
||||
const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
|
||||
|
||||
lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
|
||||
inst->src[i] = tmp;
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
/**
 * Remove any modifiers from the destination region of the instruction,
 * including saturate, conditional mod and any implicit type conversion
 * from the execution type.  Instead any destination modifiers will be
 * implemented as a separate MOV instruction after the original
 * instruction.
 */
bool
lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
{
   const fs_builder ibld(v, block, inst);
   const brw_reg_type type = get_exec_type(inst);
   /* Not strictly necessary, but if possible use a temporary with the same
    * channel alignment as the current destination in order to avoid
    * violating the restrictions enforced later on by lower_src_region()
    * and lower_dst_region(), which would introduce additional copy
    * instructions into the program unnecessarily.
    */
   const unsigned stride =
      type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
      type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
   fs_reg tmp = ibld.vgrf(type, stride);
   ibld.UNDEF(tmp);
   tmp = horiz_stride(tmp, stride);

   /* Emit a MOV taking care of all the destination modifiers. */
   fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
   mov->saturate = inst->saturate;
   /* Opcodes with non-standard cmod semantics (see has_inconsistent_cmod())
    * keep their cmod on the original instruction instead.
    */
   if (!has_inconsistent_cmod(inst))
      mov->conditional_mod = inst->conditional_mod;
   /* SEL consumes the predicate itself; for everything else the final copy
    * must execute under the same predication as the original instruction.
    */
   if (inst->opcode != BRW_OPCODE_SEL) {
      mov->predicate = inst->predicate;
      mov->predicate_inverse = inst->predicate_inverse;
   }
   mov->flag_subreg = inst->flag_subreg;
   /* The copy itself may violate regioning rules -- legalize it
    * recursively.
    */
   lower_instruction(v, block, mov);

   /* Point the original instruction at the temporary, and clean up any
    * destination modifiers.
    */
   assert(inst->size_written == inst->dst.component_size(inst->exec_size));
   inst->dst = tmp;
   inst->size_written = inst->dst.component_size(inst->exec_size);
   inst->saturate = false;
   if (!has_inconsistent_cmod(inst))
      inst->conditional_mod = BRW_CONDITIONAL_NONE;

   /* If the original wrote the flags, the trailing predicated MOV would
    * read a clobbered flag value -- assert that can't happen.
    */
   assert(!inst->flags_written(v->devinfo) || !mov->predicate);
   return true;
}
|
||||
|
||||
/**
|
||||
* Remove any non-trivial shuffling of data from the \p i-th source region
|
||||
* of the instruction. Instead implement the region as a series of integer
|
||||
* copies into a temporary with the same channel layout as the destination.
|
||||
*/
|
||||
bool
|
||||
lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
|
||||
{
|
||||
assert(inst->components_read(i) == 1);
|
||||
const fs_builder ibld(v, block, inst);
|
||||
const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
|
||||
type_sz(inst->src[i].type);
|
||||
assert(stride > 0);
|
||||
fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
|
||||
ibld.UNDEF(tmp);
|
||||
tmp = horiz_stride(tmp, stride);
|
||||
|
||||
/* Emit a series of 32-bit integer copies with any source modifiers
|
||||
* cleaned up (because their semantics are dependent on the type).
|
||||
*/
|
||||
const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
|
||||
false);
|
||||
const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
|
||||
fs_reg raw_src = inst->src[i];
|
||||
raw_src.negate = false;
|
||||
raw_src.abs = false;
|
||||
|
||||
for (unsigned j = 0; j < n; j++)
|
||||
ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));
|
||||
|
||||
/* Point the original instruction at the temporary, making sure to keep
|
||||
* any source modifiers in the instruction.
|
||||
*/
|
||||
fs_reg lower_src = tmp;
|
||||
lower_src.negate = inst->src[i].negate;
|
||||
lower_src.abs = inst->src[i].abs;
|
||||
inst->src[i] = lower_src;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Remove any non-trivial shuffling of data from the destination region of
 * the instruction.  Instead implement the region as a series of integer
 * copies from a temporary with a channel layout compatible with the
 * sources.
 */
bool
lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
{
   /* We cannot replace the result of an integer multiply which writes the
    * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
    * value whereas the MOV will act on only 32 or 33 bits of the
    * accumulator.
    */
   assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
          brw_reg_type_is_floating_point(inst->dst.type));

   const fs_builder ibld(v, block, inst);
   /* Allocate a temporary with the stride the hardware requires for this
    * instruction's destination.
    */
   const unsigned stride = required_dst_byte_stride(inst) /
                           type_sz(inst->dst.type);
   assert(stride > 0);
   fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
   ibld.UNDEF(tmp);
   tmp = horiz_stride(tmp, stride);

   /* Emit a series of 32-bit integer copies from the temporary into the
    * original destination.
    */
   const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                              false);
   const unsigned n = type_sz(tmp.type) / type_sz(raw_type);

   if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
      /* Note that in general we cannot simply predicate the copies on the
       * same flag register as the original instruction, since it may have
       * been overwritten by the instruction itself.  Instead initialize
       * the temporary with the previous contents of the destination
       * register.
       */
      for (unsigned j = 0; j < n; j++)
         ibld.MOV(subscript(tmp, raw_type, j),
                  subscript(inst->dst, raw_type, j));
   }

   /* The copy-back MOVs are inserted after the original instruction. */
   for (unsigned j = 0; j < n; j++)
      ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                     subscript(tmp, raw_type, j));

   /* Point the original instruction at the temporary, making sure to keep
    * any destination modifiers in the instruction.
    */
   assert(inst->size_written == inst->dst.component_size(inst->exec_size));
   inst->dst = tmp;
   inst->size_written = inst->dst.component_size(inst->exec_size);

   return true;
}
|
||||
|
||||
/**
 * Change sources and destination of the instruction to an
 * appropriate legal type, splitting the instruction into multiple
 * ones of smaller execution type if necessary, to be used in cases
 * where the execution type of an instruction is unsupported.
 */
bool
lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
{
   assert(inst->dst.type == get_exec_type(inst));
   /* Mask of sources that must be bit-cast along with the destination. */
   const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
   const brw_reg_type raw_type = required_exec_type(v->devinfo, inst);
   /* Number of sub-instructions needed to cover the full execution type
    * with the (smaller) required raw type.
    */
   const unsigned n = get_exec_type_size(inst) / type_sz(raw_type);
   const fs_builder ibld(v, block, inst);

   fs_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
   ibld.UNDEF(tmp);
   tmp = horiz_stride(tmp, inst->dst.stride);

   for (unsigned j = 0; j < n; j++) {
      /* Copy the original instruction and re-point its bit-cast operands
       * at the j-th sub-word of the raw integer type.
       */
      fs_inst sub_inst = *inst;

      for (unsigned i = 0; i < inst->sources; i++) {
         if (mask & (1u << i)) {
            assert(inst->src[i].type == inst->dst.type);
            sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
         }
      }

      sub_inst.dst = subscript(tmp, raw_type, j);

      assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
      /* Flags and saturate cannot be split across sub-instructions. */
      assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
      ibld.emit(sub_inst);

      /* Copy the sub-word result back into the original destination. */
      fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
                              subscript(tmp, raw_type, j));
      /* SEL consumes the predicate itself; everything else propagates it
       * to the final copy.
       */
      if (inst->opcode != BRW_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      /* The copy may itself violate regioning rules; legalize recursively. */
      lower_instruction(v, block, mov);
   }

   /* The original instruction has been fully replaced by the split ones. */
   inst->remove(block);

   return true;
}
|
||||
|
||||
/**
|
||||
* Legalize the source and destination regioning controls of the specified
|
||||
* instruction.
|
||||
*/
|
||||
bool
|
||||
lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
|
||||
{
|
||||
const intel_device_info *devinfo = v->devinfo;
|
||||
bool progress = false;
|
||||
|
||||
if (has_invalid_dst_modifiers(devinfo, inst))
|
||||
progress |= lower_dst_modifiers(v, block, inst);
|
||||
|
||||
if (has_invalid_dst_region(devinfo, inst))
|
||||
progress |= lower_dst_region(v, block, inst);
|
||||
|
||||
for (unsigned i = 0; i < inst->sources; i++) {
|
||||
if (has_invalid_src_modifiers(devinfo, inst, i))
|
||||
progress |= lower_src_modifiers(v, block, inst, i);
|
||||
|
||||
if (has_invalid_src_region(devinfo, inst, i))
|
||||
progress |= lower_src_region(v, block, inst, i);
|
||||
}
|
||||
|
||||
if (has_invalid_exec_type(devinfo, inst))
|
||||
progress |= lower_exec_type(v, block, inst);
|
||||
|
||||
return progress;
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
fs_visitor::lower_regioning()
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
|
||||
progress |= lower_instruction(this, block, inst);
|
||||
|
||||
if (progress)
|
||||
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
||||
|
||||
return progress;
|
||||
}
|
||||
8804
src/intel/compiler/elk/brw_fs_nir.cpp
Normal file
8804
src/intel/compiler/elk/brw_fs_nir.cpp
Normal file
File diff suppressed because it is too large
Load diff
1412
src/intel/compiler/elk/brw_fs_reg_allocate.cpp
Normal file
1412
src/intel/compiler/elk/brw_fs_reg_allocate.cpp
Normal file
File diff suppressed because it is too large
Load diff
349
src/intel/compiler/elk/brw_fs_register_coalesce.cpp
Normal file
349
src/intel/compiler/elk/brw_fs_register_coalesce.cpp
Normal file
|
|
@ -0,0 +1,349 @@
|
|||
/*
|
||||
* Copyright © 2012 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/** @file brw_fs_register_coalesce.cpp
|
||||
*
|
||||
* Implements register coalescing: Checks if the two registers involved in a
|
||||
* raw move don't interfere, in which case they can both be stored in the same
|
||||
* place and the MOV removed.
|
||||
*
|
||||
* To do this, all uses of the source of the MOV in the shader are replaced
|
||||
* with the destination of the MOV. For example:
|
||||
*
|
||||
* add vgrf3:F, vgrf1:F, vgrf2:F
|
||||
* mov vgrf4:F, vgrf3:F
|
||||
* mul vgrf5:F, vgrf5:F, vgrf4:F
|
||||
*
|
||||
* becomes
|
||||
*
|
||||
* add vgrf4:F, vgrf1:F, vgrf2:F
|
||||
* mul vgrf5:F, vgrf5:F, vgrf4:F
|
||||
*/
|
||||
|
||||
#include "brw_fs.h"
|
||||
#include "brw_cfg.h"
|
||||
#include "brw_fs_live_variables.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
static bool
|
||||
is_nop_mov(const fs_inst *inst)
|
||||
{
|
||||
if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
|
||||
fs_reg dst = inst->dst;
|
||||
for (int i = 0; i < inst->sources; i++) {
|
||||
if (!dst.equals(inst->src[i])) {
|
||||
return false;
|
||||
}
|
||||
dst.offset += (i < inst->header_size ? REG_SIZE :
|
||||
inst->exec_size * dst.stride *
|
||||
type_sz(inst->src[i].type));
|
||||
}
|
||||
return true;
|
||||
} else if (inst->opcode == BRW_OPCODE_MOV) {
|
||||
return inst->dst.equals(inst->src[0]);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool
|
||||
is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst)
|
||||
{
|
||||
if ((inst->opcode != BRW_OPCODE_MOV &&
|
||||
inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) ||
|
||||
inst->is_partial_write() ||
|
||||
inst->saturate ||
|
||||
inst->src[0].file != VGRF ||
|
||||
inst->src[0].negate ||
|
||||
inst->src[0].abs ||
|
||||
!inst->src[0].is_contiguous() ||
|
||||
inst->dst.file != VGRF ||
|
||||
inst->dst.type != inst->src[0].type) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (v->alloc.sizes[inst->src[0].nr] >
|
||||
v->alloc.sizes[inst->dst.nr])
|
||||
return false;
|
||||
|
||||
if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
|
||||
if (!is_coalescing_payload(v->alloc, inst)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Return whether the live variables \p dst_var and \p src_var (connected by
 * the copy \p inst in \p block) can safely share storage.  Trivially true
 * when the ranges don't interfere; otherwise one range must be contained in
 * the other and no write in the intersection may break the invariant that
 * both registers hold the same value.
 */
static bool
can_coalesce_vars(const fs_live_variables &live, const cfg_t *cfg,
                  const bblock_t *block, const fs_inst *inst,
                  int dst_var, int src_var)
{
   if (!live.vars_interfere(src_var, dst_var))
      return true;

   int dst_start = live.start[dst_var];
   int dst_end = live.end[dst_var];
   int src_start = live.start[src_var];
   int src_end = live.end[src_var];

   /* Variables interfere and one live range isn't a subset of the other. */
   if ((dst_end > src_end && src_start < dst_start) ||
       (src_end > dst_end && dst_start < src_start))
      return false;

   /* Check for a write to either register in the intersection of their live
    * ranges.
    */
   int start_ip = MAX2(dst_start, src_start);
   int end_ip = MIN2(dst_end, src_end);

   foreach_block(scan_block, cfg) {
      /* Skip blocks that end before the intersection begins. */
      if (scan_block->end_ip < start_ip)
         continue;

      int scan_ip = scan_block->start_ip - 1;

      bool seen_src_write = false;
      bool seen_copy = false;
      foreach_inst_in_block(fs_inst, scan_inst, scan_block) {
         scan_ip++;

         /* Ignore anything before the intersection of the live ranges */
         if (scan_ip < start_ip)
            continue;

         /* Ignore the copying instruction itself */
         if (scan_inst == inst) {
            seen_copy = true;
            continue;
         }

         if (scan_ip > end_ip)
            return true; /* registers do not interfere */

         if (seen_src_write && !seen_copy) {
            /* In order to satisfy the guarantee of register coalescing, we
             * must ensure that the two registers always have the same value
             * during the intersection of their live ranges.  One way to do
             * this is to simply ensure that neither is ever written apart
             * from the one copy which syncs up the two registers.  However,
             * this can be overly conservative and only works in the case
             * where the destination live range is entirely contained in the
             * source live range.
             *
             * To handle the other case where the source is contained in the
             * destination, we allow writes to the source register as long as
             * they happen before the copy, in the same block as the copy, and
             * the destination is never read between first such write and the
             * copy.  This effectively moves the write from the copy up.
             */
            for (int j = 0; j < scan_inst->sources; j++) {
               if (regions_overlap(scan_inst->src[j], scan_inst->size_read(j),
                                   inst->dst, inst->size_written))
                  return false; /* registers interfere */
            }
         }

         /* The MOV being coalesced had better be the only instruction which
          * writes to the coalesce destination in the intersection.
          */
         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->dst, inst->size_written))
            return false; /* registers interfere */

         /* See the big comment above */
         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->src[0], inst->size_read(0))) {
            if (seen_copy || scan_block != block ||
                (scan_inst->force_writemask_all && !inst->force_writemask_all))
               return false;
            seen_src_write = true;
         }
      }
   }

   return true;
}
|
||||
|
||||
/* Coalesce VGRF-to-VGRF copies: whenever every register of a source VGRF is
 * copied (in order) into the same destination VGRF and the two live ranges
 * are compatible, rewrite all uses of the source onto the destination and
 * delete the copies.  Returns whether any instruction was removed.
 */
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   fs_live_variables &live = live_analysis.require();
   /* State accumulated across consecutive copies out of the same source
    * VGRF: channels_remaining counts registers of src still waiting for a
    * copy, mov[] records the copy covering each register-sized chunk, and
    * dst_reg_offset[] records where each chunk lands in the destination.
    */
   int src_size = 0;
   int channels_remaining = 0;
   unsigned src_reg = ~0u, dst_reg = ~0u;
   int *dst_reg_offset = new int[MAX_VGRF_SIZE(devinfo)];
   fs_inst **mov = new fs_inst *[MAX_VGRF_SIZE(devinfo)];
   int *dst_var = new int[MAX_VGRF_SIZE(devinfo)];
   int *src_var = new int[MAX_VGRF_SIZE(devinfo)];

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (!is_coalesce_candidate(this, inst))
         continue;

      /* A self-copy needs no analysis -- just drop it. */
      if (is_nop_mov(inst)) {
         inst->opcode = BRW_OPCODE_NOP;
         progress = true;
         continue;
      }

      /* New source VGRF: reset the accumulated per-chunk state. */
      if (src_reg != inst->src[0].nr) {
         src_reg = inst->src[0].nr;

         src_size = alloc.sizes[inst->src[0].nr];
         assert(src_size <= MAX_VGRF_SIZE(devinfo));

         channels_remaining = src_size;
         memset(mov, 0, sizeof(*mov) * MAX_VGRF_SIZE(devinfo));

         dst_reg = inst->dst.nr;
      }

      /* All copies of this source must target the same destination VGRF. */
      if (dst_reg != inst->dst.nr)
         continue;

      if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
         /* A coalescing payload covers the whole source in one go. */
         for (int i = 0; i < src_size; i++) {
            dst_reg_offset[i] = i;
         }
         mov[0] = inst;
         channels_remaining -= regs_written(inst);
      } else {
         const int offset = inst->src[0].offset / REG_SIZE;
         if (mov[offset]) {
            /* This is the second time that this offset in the register has
             * been set.  This means, in particular, that inst->dst was
             * live before this instruction and that the live ranges of
             * inst->dst and inst->src[0] overlap and we can't coalesce the
             * two variables.  Let's ensure that doesn't happen.
             */
            channels_remaining = -1;
            continue;
         }
         for (unsigned i = 0; i < MAX2(inst->size_written / REG_SIZE, 1); i++)
            dst_reg_offset[offset + i] = inst->dst.offset / REG_SIZE + i;
         mov[offset] = inst;
         channels_remaining -= regs_written(inst);
      }

      /* Wait until every register of the source has a recorded copy. */
      if (channels_remaining)
         continue;

      bool can_coalesce = true;
      for (int i = 0; i < src_size; i++) {
         if (dst_reg_offset[i] != dst_reg_offset[0] + i) {
            /* Registers are out-of-order. */
            can_coalesce = false;
            src_reg = ~0u;
            break;
         }

         dst_var[i] = live.var_from_vgrf[dst_reg] + dst_reg_offset[i];
         src_var[i] = live.var_from_vgrf[src_reg] + i;

         if (!can_coalesce_vars(live, cfg, block, inst, dst_var[i], src_var[i])) {
            can_coalesce = false;
            src_reg = ~0u;
            break;
         }
      }

      if (!can_coalesce)
         continue;

      progress = true;

      /* Turn the recorded copies into NOPs (deleted later below). */
      for (int i = 0; i < src_size; i++) {
         if (!mov[i])
            continue;

         if (mov[i]->conditional_mod == BRW_CONDITIONAL_NONE) {
            mov[i]->opcode = BRW_OPCODE_NOP;
            mov[i]->dst = reg_undef;
            for (int j = 0; j < mov[i]->sources; j++) {
               mov[i]->src[j] = reg_undef;
            }
         } else {
            /* If we have a conditional modifier, rewrite the MOV to be a
             * MOV.cmod from the coalesced register.  Hopefully, cmod
             * propagation will clean this up and move it to the instruction
             * that writes the register.  If not, this keeps things correct
             * while still letting us coalesce.
             */
            assert(mov[i]->opcode == BRW_OPCODE_MOV);
            assert(mov[i]->sources == 1);
            mov[i]->src[0] = mov[i]->dst;
            mov[i]->dst = retype(brw_null_reg(), mov[i]->dst.type);
         }
      }

      /* Rewrite every remaining use of the source VGRF onto the
       * destination, translating per-register offsets through
       * dst_reg_offset[].
       */
      foreach_block_and_inst(block, fs_inst, scan_inst, cfg) {
         if (scan_inst->dst.file == VGRF &&
             scan_inst->dst.nr == src_reg) {
            scan_inst->dst.nr = dst_reg;
            scan_inst->dst.offset = scan_inst->dst.offset % REG_SIZE +
               dst_reg_offset[scan_inst->dst.offset / REG_SIZE] * REG_SIZE;
         }

         for (int j = 0; j < scan_inst->sources; j++) {
            if (scan_inst->src[j].file == VGRF &&
                scan_inst->src[j].nr == src_reg) {
               scan_inst->src[j].nr = dst_reg;
               scan_inst->src[j].offset = scan_inst->src[j].offset % REG_SIZE +
                  dst_reg_offset[scan_inst->src[j].offset / REG_SIZE] * REG_SIZE;
            }
         }
      }

      /* Extend the destination's live ranges to cover the source's, so
       * later coalescing decisions in this pass stay sound.
       */
      for (int i = 0; i < src_size; i++) {
         live.start[dst_var[i]] = MIN2(live.start[dst_var[i]],
                                       live.start[src_var[i]]);
         live.end[dst_var[i]] = MAX2(live.end[dst_var[i]],
                                     live.end[src_var[i]]);
      }
      src_reg = ~0u;
   }

   if (progress) {
      /* Physically remove the instructions turned into NOPs above. */
      foreach_block_and_inst_safe (block, backend_instruction, inst, cfg) {
         if (inst->opcode == BRW_OPCODE_NOP) {
            inst->remove(block, true);
         }
      }

      cfg->adjust_block_ips();

      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   delete[] src_var;
   delete[] dst_var;
   delete[] mov;
   delete[] dst_reg_offset;

   return progress;
}
|
||||
165
src/intel/compiler/elk/brw_fs_saturate_propagation.cpp
Normal file
165
src/intel/compiler/elk/brw_fs_saturate_propagation.cpp
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
/*
|
||||
* Copyright © 2013 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_fs.h"
|
||||
#include "brw_fs_live_variables.h"
|
||||
#include "brw_cfg.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
/** @file brw_fs_saturate_propagation.cpp
|
||||
*
|
||||
* Implements a pass that propagates the SAT modifier from a MOV.SAT into the
|
||||
* instruction that produced the source of the MOV.SAT, thereby allowing the
|
||||
* MOV's src and dst to be coalesced and the MOV removed.
|
||||
*
|
||||
* For instance,
|
||||
*
|
||||
* ADD tmp, src0, src1
|
||||
* MOV.SAT dst, tmp
|
||||
*
|
||||
* would be transformed into
|
||||
*
|
||||
* ADD.SAT tmp, src0, src1
|
||||
* MOV dst, tmp
|
||||
*/
|
||||
|
||||
/* Propagate SAT modifiers backwards within a single basic block: for each
 * MOV.SAT, find the instruction that produced its source and move the
 * saturate there when legal.  Returns whether anything changed.
 */
static bool
opt_saturate_propagation_local(const fs_live_variables &live, bblock_t *block)
{
   bool progress = false;
   int ip = block->end_ip + 1;

   foreach_inst_in_block_reverse(fs_inst, inst, block) {
      ip--;

      /* Only a MOV.SAT between VGRFs of the same type (abs would change
       * the value before saturation) is a candidate.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          !inst->saturate ||
          inst->dst.file != VGRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].file != VGRF ||
          inst->src[0].abs)
         continue;

      int src_var = live.var_from_reg(inst->src[0]);
      int src_end_ip = live.end[src_var];

      bool interfered = false;
      /* Scan backwards for the instruction that wrote inst->src[0]. */
      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
         if (scan_inst->exec_size == inst->exec_size &&
             regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->src[0], inst->size_read(0))) {
            /* Partial writes or un-retypable producers can't absorb the
             * saturate.
             */
            if (scan_inst->is_partial_write() ||
                (scan_inst->dst.type != inst->dst.type &&
                 !scan_inst->can_change_types()))
               break;

            if (scan_inst->saturate) {
               /* The producer already saturates, so the MOV.SAT is
                * redundant.
                */
               inst->saturate = false;
               progress = true;
            } else if (src_end_ip == ip || inst->dst.equals(inst->src[0])) {
               /* Only legal when the unsaturated value has no other user
                * past the MOV.
                */
               if (scan_inst->can_do_saturate()) {
                  if (scan_inst->dst.type != inst->dst.type) {
                     scan_inst->dst.type = inst->dst.type;
                     for (int i = 0; i < scan_inst->sources; i++) {
                        scan_inst->src[i].type = inst->dst.type;
                     }
                  }

                  if (inst->src[0].negate) {
                     /* Fold the MOV's negation into the producer where the
                      * algebra allows it; otherwise give up.
                      */
                     if (scan_inst->opcode == BRW_OPCODE_MUL) {
                        scan_inst->src[0].negate = !scan_inst->src[0].negate;
                        inst->src[0].negate = false;
                     } else if (scan_inst->opcode == BRW_OPCODE_MAD) {
                        for (int i = 0; i < 2; i++) {
                           if (scan_inst->src[i].file == IMM) {
                              brw_negate_immediate(scan_inst->src[i].type,
                                                   &scan_inst->src[i].as_brw_reg());
                           } else {
                              scan_inst->src[i].negate = !scan_inst->src[i].negate;
                           }
                        }
                        inst->src[0].negate = false;
                     } else if (scan_inst->opcode == BRW_OPCODE_ADD) {
                        if (scan_inst->src[1].file == IMM) {
                           if (!brw_negate_immediate(scan_inst->src[1].type,
                                                     &scan_inst->src[1].as_brw_reg())) {
                              break;
                           }
                        } else {
                           scan_inst->src[1].negate = !scan_inst->src[1].negate;
                        }
                        scan_inst->src[0].negate = !scan_inst->src[0].negate;
                        inst->src[0].negate = false;
                     } else {
                        break;
                     }
                  }

                  scan_inst->saturate = true;
                  inst->saturate = false;
                  progress = true;
               }
            }
            break;
         }
         /* Any intervening read of the source must be an identical MOV.SAT,
          * otherwise it would observe the unsaturated value and propagation
          * is unsafe.
          */
         for (int i = 0; i < scan_inst->sources; i++) {
            if (scan_inst->src[i].file == VGRF &&
                scan_inst->src[i].nr == inst->src[0].nr &&
                regions_overlap(
                   scan_inst->src[i], scan_inst->size_read(i),
                   inst->src[0], inst->size_read(0))) {
               if (scan_inst->opcode != BRW_OPCODE_MOV ||
                   !scan_inst->saturate ||
                   scan_inst->src[0].abs ||
                   scan_inst->src[0].negate ||
                   scan_inst->src[0].abs != inst->src[0].abs ||
                   scan_inst->src[0].negate != inst->src[0].negate) {
                  interfered = true;
                  break;
               }
            }
         }

         if (interfered)
            break;
      }
   }

   return progress;
}
|
||||
|
||||
bool
|
||||
fs_visitor::opt_saturate_propagation()
|
||||
{
|
||||
const fs_live_variables &live = live_analysis.require();
|
||||
bool progress = false;
|
||||
|
||||
foreach_block (block, cfg) {
|
||||
progress = opt_saturate_propagation_local(live, block) || progress;
|
||||
}
|
||||
|
||||
/* Live intervals are still valid. */
|
||||
|
||||
return progress;
|
||||
}
|
||||
1365
src/intel/compiler/elk/brw_fs_scoreboard.cpp
Normal file
1365
src/intel/compiler/elk/brw_fs_scoreboard.cpp
Normal file
File diff suppressed because it is too large
Load diff
229
src/intel/compiler/elk/brw_fs_sel_peephole.cpp
Normal file
229
src/intel/compiler/elk/brw_fs_sel_peephole.cpp
Normal file
|
|
@ -0,0 +1,229 @@
|
|||
/*
|
||||
* Copyright © 2013 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_fs.h"
|
||||
#include "brw_fs_builder.h"
|
||||
#include "brw_cfg.h"
|
||||
|
||||
/** @file brw_fs_sel_peephole.cpp
|
||||
*
|
||||
* This file contains the opt_peephole_sel() optimization pass that replaces
|
||||
* MOV instructions to the same destination in the "then" and "else" bodies of
|
||||
* an if statement with SEL instructions.
|
||||
*/
|
||||
|
||||
/* Four MOVs seems to be pretty typical, so I picked the next power of two in
|
||||
* the hopes that it would handle almost anything possible in a single
|
||||
* pass.
|
||||
*/
|
||||
#define MAX_MOVS 8 /**< The maximum number of MOVs to attempt to match. */
|
||||
|
||||
using namespace brw;
|
||||
|
||||
/**
|
||||
* Scans forwards from an IF counting consecutive MOV instructions in the
|
||||
* "then" and "else" blocks of the if statement.
|
||||
*
|
||||
* A pointer to the bblock_t following the IF is passed as the <then_block>
|
||||
* argument. The function stores pointers to the MOV instructions in the
|
||||
* <then_mov> and <else_mov> arrays.
|
||||
*
|
||||
* \return the minimum number of MOVs found in the two branches or zero if
|
||||
* an error occurred.
|
||||
*
|
||||
* E.g.:
|
||||
* IF ...
|
||||
* then_mov[0] = MOV g4, ...
|
||||
* then_mov[1] = MOV g5, ...
|
||||
* then_mov[2] = MOV g6, ...
|
||||
* ELSE ...
|
||||
* else_mov[0] = MOV g4, ...
|
||||
* else_mov[1] = MOV g5, ...
|
||||
* else_mov[2] = MOV g7, ...
|
||||
* ENDIF
|
||||
* returns 3.
|
||||
*/
|
||||
static int
|
||||
count_movs_from_if(const intel_device_info *devinfo,
|
||||
fs_inst *then_mov[MAX_MOVS], fs_inst *else_mov[MAX_MOVS],
|
||||
bblock_t *then_block, bblock_t *else_block)
|
||||
{
|
||||
int then_movs = 0;
|
||||
foreach_inst_in_block(fs_inst, inst, then_block) {
|
||||
if (then_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV ||
|
||||
inst->flags_written(devinfo))
|
||||
break;
|
||||
|
||||
then_mov[then_movs] = inst;
|
||||
then_movs++;
|
||||
}
|
||||
|
||||
int else_movs = 0;
|
||||
foreach_inst_in_block(fs_inst, inst, else_block) {
|
||||
if (else_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV ||
|
||||
inst->flags_written(devinfo))
|
||||
break;
|
||||
|
||||
else_mov[else_movs] = inst;
|
||||
else_movs++;
|
||||
}
|
||||
|
||||
return MIN2(then_movs, else_movs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Try to replace IF/MOV+/ELSE/MOV+/ENDIF with SEL.
|
||||
*
|
||||
* Many GLSL shaders contain the following pattern:
|
||||
*
|
||||
* x = condition ? foo : bar
|
||||
*
|
||||
* or
|
||||
*
|
||||
* if (...) a.xyzw = foo.xyzw;
|
||||
* else a.xyzw = bar.xyzw;
|
||||
*
|
||||
* The compiler emits an ir_if tree for this, since each subexpression might be
|
||||
* a complex tree that could have side-effects or short-circuit logic.
|
||||
*
|
||||
* However, the common case is to simply select one of two constants or
|
||||
* variable values---which is exactly what SEL is for. In this case, the
|
||||
* assembly looks like:
|
||||
*
|
||||
* (+f0) IF
|
||||
* MOV dst src0
|
||||
* ...
|
||||
* ELSE
|
||||
* MOV dst src1
|
||||
* ...
|
||||
* ENDIF
|
||||
*
|
||||
* where each pair of MOVs to a common destination and can be easily translated
|
||||
* into
|
||||
*
|
||||
* (+f0) SEL dst src0 src1
|
||||
*
|
||||
* If src0 is an immediate value, we promote it to a temporary GRF.
|
||||
*/
|
||||
bool
|
||||
fs_visitor::opt_peephole_sel()
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
foreach_block (block, cfg) {
|
||||
/* IF instructions, by definition, can only be found at the ends of
|
||||
* basic blocks.
|
||||
*/
|
||||
fs_inst *if_inst = (fs_inst *)block->end();
|
||||
if (if_inst->opcode != BRW_OPCODE_IF)
|
||||
continue;
|
||||
|
||||
fs_inst *else_mov[MAX_MOVS] = { NULL };
|
||||
fs_inst *then_mov[MAX_MOVS] = { NULL };
|
||||
|
||||
bblock_t *then_block = block->next();
|
||||
bblock_t *else_block = NULL;
|
||||
foreach_list_typed(bblock_link, child, link, &block->children) {
|
||||
if (child->block != then_block) {
|
||||
if (child->block->prev()->end()->opcode == BRW_OPCODE_ELSE) {
|
||||
else_block = child->block;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (else_block == NULL)
|
||||
continue;
|
||||
|
||||
int movs = count_movs_from_if(devinfo, then_mov, else_mov, then_block, else_block);
|
||||
|
||||
if (movs == 0)
|
||||
continue;
|
||||
|
||||
/* Generate SEL instructions for pairs of MOVs to a common destination. */
|
||||
for (int i = 0; i < movs; i++) {
|
||||
if (!then_mov[i] || !else_mov[i])
|
||||
break;
|
||||
|
||||
/* Check that the MOVs are the right form. */
|
||||
if (!then_mov[i]->dst.equals(else_mov[i]->dst) ||
|
||||
then_mov[i]->exec_size != else_mov[i]->exec_size ||
|
||||
then_mov[i]->group != else_mov[i]->group ||
|
||||
then_mov[i]->force_writemask_all != else_mov[i]->force_writemask_all ||
|
||||
then_mov[i]->is_partial_write() ||
|
||||
else_mov[i]->is_partial_write() ||
|
||||
then_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE ||
|
||||
else_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE) {
|
||||
movs = i;
|
||||
break;
|
||||
}
|
||||
|
||||
/* Check that source types for mov operations match. */
|
||||
if (then_mov[i]->src[0].type != else_mov[i]->src[0].type) {
|
||||
movs = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (movs == 0)
|
||||
continue;
|
||||
|
||||
for (int i = 0; i < movs; i++) {
|
||||
const fs_builder ibld = fs_builder(this, then_block, then_mov[i])
|
||||
.at(block, if_inst);
|
||||
|
||||
if (then_mov[i]->src[0].equals(else_mov[i]->src[0])) {
|
||||
ibld.MOV(then_mov[i]->dst, then_mov[i]->src[0]);
|
||||
} else {
|
||||
/* Only the last source register can be a constant, so if the MOV
|
||||
* in the "then" clause uses a constant, we need to put it in a
|
||||
* temporary.
|
||||
*/
|
||||
fs_reg src0(then_mov[i]->src[0]);
|
||||
if (src0.file == IMM) {
|
||||
src0 = ibld.vgrf(then_mov[i]->src[0].type);
|
||||
ibld.MOV(src0, then_mov[i]->src[0]);
|
||||
}
|
||||
|
||||
/* 64-bit immediates can't be placed in src1. */
|
||||
fs_reg src1(else_mov[i]->src[0]);
|
||||
if (src1.file == IMM && type_sz(src1.type) == 8) {
|
||||
src1 = ibld.vgrf(else_mov[i]->src[0].type);
|
||||
ibld.MOV(src1, else_mov[i]->src[0]);
|
||||
}
|
||||
|
||||
set_predicate_inv(if_inst->predicate, if_inst->predicate_inverse,
|
||||
ibld.SEL(then_mov[i]->dst, src0, src1));
|
||||
}
|
||||
|
||||
then_mov[i]->remove(then_block);
|
||||
else_mov[i]->remove(else_block);
|
||||
}
|
||||
|
||||
progress = true;
|
||||
}
|
||||
|
||||
if (progress)
|
||||
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
||||
|
||||
return progress;
|
||||
}
|
||||
605
src/intel/compiler/elk/brw_fs_thread_payload.cpp
Normal file
605
src/intel/compiler/elk/brw_fs_thread_payload.cpp
Normal file
|
|
@ -0,0 +1,605 @@
|
|||
/*
|
||||
* Copyright © 2006-2022 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_fs.h"
|
||||
#include "brw_fs_builder.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
vs_thread_payload::vs_thread_payload(const fs_visitor &v)
|
||||
{
|
||||
unsigned r = 0;
|
||||
|
||||
/* R0: Thread header. */
|
||||
r += reg_unit(v.devinfo);
|
||||
|
||||
/* R1: URB handles. */
|
||||
urb_handles = brw_ud8_grf(r, 0);
|
||||
r += reg_unit(v.devinfo);
|
||||
|
||||
num_regs = r;
|
||||
}
|
||||
|
||||
tcs_thread_payload::tcs_thread_payload(const fs_visitor &v)
|
||||
{
|
||||
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
|
||||
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(v.prog_data);
|
||||
struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) v.key;
|
||||
|
||||
if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH) {
|
||||
patch_urb_output = brw_ud1_grf(0, 0);
|
||||
primitive_id = brw_vec1_grf(0, 1);
|
||||
|
||||
/* r1-r4 contain the ICP handles. */
|
||||
icp_handle_start = brw_ud8_grf(1, 0);
|
||||
|
||||
num_regs = 5;
|
||||
} else {
|
||||
assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
|
||||
assert(tcs_key->input_vertices <= BRW_MAX_TCS_INPUT_VERTICES);
|
||||
|
||||
unsigned r = 0;
|
||||
|
||||
r += reg_unit(v.devinfo);
|
||||
|
||||
patch_urb_output = brw_ud8_grf(r, 0);
|
||||
r += reg_unit(v.devinfo);
|
||||
|
||||
if (tcs_prog_data->include_primitive_id) {
|
||||
primitive_id = brw_vec8_grf(r, 0);
|
||||
r += reg_unit(v.devinfo);
|
||||
}
|
||||
|
||||
/* ICP handles occupy the next 1-32 registers. */
|
||||
icp_handle_start = brw_ud8_grf(r, 0);
|
||||
r += brw_tcs_prog_key_input_vertices(tcs_key) * reg_unit(v.devinfo);
|
||||
|
||||
num_regs = r;
|
||||
}
|
||||
}
|
||||
|
||||
tes_thread_payload::tes_thread_payload(const fs_visitor &v)
|
||||
{
|
||||
unsigned r = 0;
|
||||
|
||||
/* R0: Thread Header. */
|
||||
patch_urb_input = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
|
||||
primitive_id = brw_vec1_grf(0, 1);
|
||||
r += reg_unit(v.devinfo);
|
||||
|
||||
/* R1-3: gl_TessCoord.xyz. */
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
coords[i] = brw_vec8_grf(r, 0);
|
||||
r += reg_unit(v.devinfo);
|
||||
}
|
||||
|
||||
/* R4: URB output handles. */
|
||||
urb_output = brw_ud8_grf(r, 0);
|
||||
r += reg_unit(v.devinfo);
|
||||
|
||||
num_regs = r;
|
||||
}
|
||||
|
||||
gs_thread_payload::gs_thread_payload(fs_visitor &v)
|
||||
{
|
||||
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
|
||||
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(v.prog_data);
|
||||
const fs_builder bld = fs_builder(&v).at_end();
|
||||
|
||||
/* R0: thread header. */
|
||||
unsigned r = reg_unit(v.devinfo);
|
||||
|
||||
/* R1: output URB handles. */
|
||||
urb_handles = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
bld.AND(urb_handles, brw_ud8_grf(r, 0),
|
||||
v.devinfo->ver >= 20 ? brw_imm_ud(0xFFFFFF) : brw_imm_ud(0xFFFF));
|
||||
|
||||
/* R1: Instance ID stored in bits 31:27 */
|
||||
instance_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
bld.SHR(instance_id, brw_ud8_grf(r, 0), brw_imm_ud(27u));
|
||||
|
||||
r += reg_unit(v.devinfo);
|
||||
|
||||
if (gs_prog_data->include_primitive_id) {
|
||||
primitive_id = brw_ud8_grf(r, 0);
|
||||
r += reg_unit(v.devinfo);
|
||||
}
|
||||
|
||||
/* Always enable VUE handles so we can safely use pull model if needed.
|
||||
*
|
||||
* The push model for a GS uses a ton of register space even for trivial
|
||||
* scenarios with just a few inputs, so just make things easier and a bit
|
||||
* safer by always having pull model available.
|
||||
*/
|
||||
gs_prog_data->base.include_vue_handles = true;
|
||||
|
||||
/* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
|
||||
icp_handle_start = brw_ud8_grf(r, 0);
|
||||
r += v.nir->info.gs.vertices_in * reg_unit(v.devinfo);
|
||||
|
||||
num_regs = r;
|
||||
|
||||
/* Use a maximum of 24 registers for push-model inputs. */
|
||||
const unsigned max_push_components = 24;
|
||||
|
||||
/* If pushing our inputs would take too many registers, reduce the URB read
|
||||
* length (which is in HWords, or 8 registers), and resort to pulling.
|
||||
*
|
||||
* Note that the GS reads <URB Read Length> HWords for every vertex - so we
|
||||
* have to multiply by VerticesIn to obtain the total storage requirement.
|
||||
*/
|
||||
if (8 * vue_prog_data->urb_read_length * v.nir->info.gs.vertices_in >
|
||||
max_push_components) {
|
||||
vue_prog_data->urb_read_length =
|
||||
ROUND_DOWN_TO(max_push_components / v.nir->info.gs.vertices_in, 8) / 8;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
setup_fs_payload_gfx20(fs_thread_payload &payload,
|
||||
const fs_visitor &v,
|
||||
bool &source_depth_to_render_target)
|
||||
{
|
||||
struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
|
||||
const unsigned payload_width = 16;
|
||||
assert(v.dispatch_width % payload_width == 0);
|
||||
assert(v.devinfo->ver >= 20);
|
||||
|
||||
for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
|
||||
/* R0-1: PS thread payload header, masks and pixel X/Y coordinates. */
|
||||
payload.num_regs++;
|
||||
payload.subspan_coord_reg[j] = payload.num_regs++;
|
||||
}
|
||||
|
||||
for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
|
||||
/* R2-13: Barycentric interpolation coordinates. These appear
|
||||
* in the same order that they appear in the brw_barycentric_mode
|
||||
* enum. Each set of coordinates occupies 2 64B registers per
|
||||
* SIMD16 half. Coordinates only appear if they were enabled
|
||||
* using the "Barycentric Interpolation Mode" bits in WM_STATE.
|
||||
*/
|
||||
for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
|
||||
if (prog_data->barycentric_interp_modes & (1 << i)) {
|
||||
payload.barycentric_coord_reg[i][j] = payload.num_regs;
|
||||
payload.num_regs += payload_width / 4;
|
||||
}
|
||||
}
|
||||
|
||||
/* R14: Interpolated depth if "Pixel Shader Uses Source Depth" is set. */
|
||||
if (prog_data->uses_src_depth) {
|
||||
payload.source_depth_reg[j] = payload.num_regs;
|
||||
payload.num_regs += payload_width / 8;
|
||||
}
|
||||
|
||||
/* R15: Interpolated W if "Pixel Shader Uses Source W" is set. */
|
||||
if (prog_data->uses_src_w) {
|
||||
payload.source_w_reg[j] = payload.num_regs;
|
||||
payload.num_regs += payload_width / 8;
|
||||
}
|
||||
|
||||
/* R16: MSAA input coverage mask if "Pixel Shader Uses Input
|
||||
* Coverage Mask" is set.
|
||||
*/
|
||||
if (prog_data->uses_sample_mask) {
|
||||
payload.sample_mask_in_reg[j] = payload.num_regs;
|
||||
payload.num_regs += payload_width / 8;
|
||||
}
|
||||
|
||||
/* R19: MSAA position XY offsets if "Position XY Offset Select"
|
||||
* is either POSOFFSET_CENTROID or POSOFFSET_SAMPLE. Note that
|
||||
* this is delivered as a single SIMD32 vector, inconsistently
|
||||
* with most other PS payload fields.
|
||||
*/
|
||||
if (prog_data->uses_pos_offset && j == 0) {
|
||||
for (unsigned k = 0; k < 2; k++) {
|
||||
payload.sample_pos_reg[k] = payload.num_regs;
|
||||
payload.num_regs++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (prog_data->uses_depth_w_coefficients) {
|
||||
assert(v.max_polygons == 1);
|
||||
payload.depth_w_coef_reg = payload.num_regs;
|
||||
payload.num_regs += 2;
|
||||
}
|
||||
|
||||
if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
|
||||
source_depth_to_render_target = true;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
setup_fs_payload_gfx6(fs_thread_payload &payload,
|
||||
const fs_visitor &v,
|
||||
bool &source_depth_to_render_target)
|
||||
{
|
||||
struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
|
||||
|
||||
const unsigned payload_width = MIN2(16, v.dispatch_width);
|
||||
assert(v.dispatch_width % payload_width == 0);
|
||||
assert(v.devinfo->ver >= 6 && v.devinfo->ver < 20);
|
||||
|
||||
payload.num_regs = 0;
|
||||
|
||||
/* R0: PS thread payload header. */
|
||||
payload.num_regs++;
|
||||
|
||||
for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
|
||||
/* R1: masks, pixel X/Y coordinates. */
|
||||
payload.subspan_coord_reg[j] = payload.num_regs++;
|
||||
}
|
||||
|
||||
for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
|
||||
/* R3-26: barycentric interpolation coordinates. These appear in the
|
||||
* same order that they appear in the brw_barycentric_mode enum. Each
|
||||
* set of coordinates occupies 2 registers if dispatch width == 8 and 4
|
||||
* registers if dispatch width == 16. Coordinates only appear if they
|
||||
* were enabled using the "Barycentric Interpolation Mode" bits in
|
||||
* WM_STATE.
|
||||
*/
|
||||
for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
|
||||
if (prog_data->barycentric_interp_modes & (1 << i)) {
|
||||
payload.barycentric_coord_reg[i][j] = payload.num_regs;
|
||||
payload.num_regs += payload_width / 4;
|
||||
}
|
||||
}
|
||||
|
||||
/* R27-28: interpolated depth if uses source depth */
|
||||
if (prog_data->uses_src_depth) {
|
||||
payload.source_depth_reg[j] = payload.num_regs;
|
||||
payload.num_regs += payload_width / 8;
|
||||
}
|
||||
|
||||
/* R29-30: interpolated W set if GFX6_WM_USES_SOURCE_W. */
|
||||
if (prog_data->uses_src_w) {
|
||||
payload.source_w_reg[j] = payload.num_regs;
|
||||
payload.num_regs += payload_width / 8;
|
||||
}
|
||||
|
||||
/* R31: MSAA position offsets. */
|
||||
if (prog_data->uses_pos_offset) {
|
||||
payload.sample_pos_reg[j] = payload.num_regs;
|
||||
payload.num_regs++;
|
||||
}
|
||||
|
||||
/* R32-33: MSAA input coverage mask */
|
||||
if (prog_data->uses_sample_mask) {
|
||||
assert(v.devinfo->ver >= 7);
|
||||
payload.sample_mask_in_reg[j] = payload.num_regs;
|
||||
payload.num_regs += payload_width / 8;
|
||||
}
|
||||
}
|
||||
|
||||
/* R66: Source Depth and/or W Attribute Vertex Deltas */
|
||||
if (prog_data->uses_depth_w_coefficients) {
|
||||
assert(v.max_polygons == 1);
|
||||
payload.depth_w_coef_reg = payload.num_regs;
|
||||
payload.num_regs++;
|
||||
}
|
||||
|
||||
if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
|
||||
source_depth_to_render_target = true;
|
||||
}
|
||||
}
|
||||
|
||||
#undef P /* prompted depth */
|
||||
#undef C /* computed */
|
||||
#undef N /* non-promoted? */
|
||||
|
||||
#define P 0
|
||||
#define C 1
|
||||
#define N 2
|
||||
|
||||
static const struct {
|
||||
GLuint mode:2;
|
||||
GLuint sd_present:1;
|
||||
GLuint sd_to_rt:1;
|
||||
GLuint dd_present:1;
|
||||
GLuint ds_present:1;
|
||||
} wm_iz_table[BRW_WM_IZ_BIT_MAX] =
|
||||
{
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ N, 1, 1, 0, 0 },
|
||||
{ N, 0, 1, 0, 0 },
|
||||
{ N, 0, 1, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ C, 0, 1, 1, 0 },
|
||||
{ C, 0, 1, 1, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ N, 1, 1, 0, 0 },
|
||||
{ C, 0, 1, 1, 0 },
|
||||
{ C, 0, 1, 1, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ N, 1, 1, 0, 0 },
|
||||
{ N, 0, 1, 0, 0 },
|
||||
{ N, 0, 1, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ C, 0, 1, 1, 0 },
|
||||
{ C, 0, 1, 1, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ N, 1, 1, 0, 0 },
|
||||
{ C, 0, 1, 1, 0 },
|
||||
{ C, 0, 1, 1, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ N, 1, 1, 0, 1 },
|
||||
{ N, 0, 1, 0, 1 },
|
||||
{ N, 0, 1, 0, 1 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ C, 0, 1, 1, 1 },
|
||||
{ C, 0, 1, 1, 1 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ N, 1, 1, 0, 1 },
|
||||
{ C, 0, 1, 1, 1 },
|
||||
{ C, 0, 1, 1, 1 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ C, 0, 0, 0, 1 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ C, 0, 1, 0, 1 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ C, 1, 1, 0, 1 },
|
||||
{ C, 0, 1, 0, 1 },
|
||||
{ C, 0, 1, 0, 1 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ C, 1, 1, 1, 1 },
|
||||
{ C, 0, 1, 1, 1 },
|
||||
{ C, 0, 1, 1, 1 },
|
||||
{ P, 0, 0, 0, 0 },
|
||||
{ C, 1, 1, 1, 1 },
|
||||
{ C, 0, 1, 1, 1 },
|
||||
{ C, 0, 1, 1, 1 }
|
||||
};
|
||||
|
||||
/**
|
||||
* \param line_aa BRW_NEVER, BRW_ALWAYS or BRW_SOMETIMES
|
||||
* \param lookup bitmask of BRW_WM_IZ_* flags
|
||||
*/
|
||||
static inline void
|
||||
setup_fs_payload_gfx4(fs_thread_payload &payload,
|
||||
const fs_visitor &v,
|
||||
bool &source_depth_to_render_target,
|
||||
bool &runtime_check_aads_emit)
|
||||
{
|
||||
assert(v.dispatch_width <= 16);
|
||||
|
||||
struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
|
||||
brw_wm_prog_key *key = (brw_wm_prog_key *) v.key;
|
||||
|
||||
GLuint reg = 1;
|
||||
bool kill_stats_promoted_workaround = false;
|
||||
int lookup = key->iz_lookup;
|
||||
|
||||
assert(lookup < BRW_WM_IZ_BIT_MAX);
|
||||
|
||||
/* Crazy workaround in the windowizer, which we need to track in
|
||||
* our register allocation and render target writes. See the "If
|
||||
* statistics are enabled..." paragraph of 11.5.3.2: Early Depth
|
||||
* Test Cases [Pre-DevGT] of the 3D Pipeline - Windower B-Spec.
|
||||
*/
|
||||
if (key->stats_wm &&
|
||||
(lookup & BRW_WM_IZ_PS_KILL_ALPHATEST_BIT) &&
|
||||
wm_iz_table[lookup].mode == P) {
|
||||
kill_stats_promoted_workaround = true;
|
||||
}
|
||||
|
||||
payload.subspan_coord_reg[0] = reg++;
|
||||
|
||||
if (wm_iz_table[lookup].sd_present || prog_data->uses_src_depth ||
|
||||
kill_stats_promoted_workaround) {
|
||||
payload.source_depth_reg[0] = reg;
|
||||
reg += 2;
|
||||
}
|
||||
|
||||
if (wm_iz_table[lookup].sd_to_rt || kill_stats_promoted_workaround)
|
||||
source_depth_to_render_target = true;
|
||||
|
||||
if (wm_iz_table[lookup].ds_present || key->line_aa != BRW_NEVER) {
|
||||
payload.aa_dest_stencil_reg[0] = reg;
|
||||
runtime_check_aads_emit =
|
||||
!wm_iz_table[lookup].ds_present && key->line_aa == BRW_SOMETIMES;
|
||||
reg++;
|
||||
}
|
||||
|
||||
if (wm_iz_table[lookup].dd_present) {
|
||||
payload.dest_depth_reg[0] = reg;
|
||||
reg+=2;
|
||||
}
|
||||
|
||||
payload.num_regs = reg;
|
||||
}
|
||||
|
||||
#undef P /* prompted depth */
|
||||
#undef C /* computed */
|
||||
#undef N /* non-promoted? */
|
||||
|
||||
fs_thread_payload::fs_thread_payload(const fs_visitor &v,
|
||||
bool &source_depth_to_render_target,
|
||||
bool &runtime_check_aads_emit)
|
||||
: subspan_coord_reg(),
|
||||
source_depth_reg(),
|
||||
source_w_reg(),
|
||||
aa_dest_stencil_reg(),
|
||||
dest_depth_reg(),
|
||||
sample_pos_reg(),
|
||||
sample_mask_in_reg(),
|
||||
depth_w_coef_reg(),
|
||||
barycentric_coord_reg()
|
||||
{
|
||||
if (v.devinfo->ver >= 20)
|
||||
setup_fs_payload_gfx20(*this, v, source_depth_to_render_target);
|
||||
else if (v.devinfo->ver >= 6)
|
||||
setup_fs_payload_gfx6(*this, v, source_depth_to_render_target);
|
||||
else
|
||||
setup_fs_payload_gfx4(*this, v, source_depth_to_render_target,
|
||||
runtime_check_aads_emit);
|
||||
}
|
||||
|
||||
cs_thread_payload::cs_thread_payload(const fs_visitor &v)
|
||||
{
|
||||
struct brw_cs_prog_data *prog_data = brw_cs_prog_data(v.prog_data);
|
||||
|
||||
unsigned r = reg_unit(v.devinfo);
|
||||
|
||||
/* See nir_setup_uniforms for subgroup_id in earlier versions. */
|
||||
if (v.devinfo->verx10 >= 125) {
|
||||
subgroup_id_ = brw_ud1_grf(0, 2);
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (prog_data->generate_local_id & (1 << i)) {
|
||||
local_invocation_id[i] = brw_uw8_grf(r, 0);
|
||||
r += reg_unit(v.devinfo);
|
||||
if (v.devinfo->ver < 20 && v.dispatch_width == 32)
|
||||
r += reg_unit(v.devinfo);
|
||||
} else {
|
||||
local_invocation_id[i] = brw_imm_uw(0);
|
||||
}
|
||||
}
|
||||
|
||||
/* TODO: Fill out uses_btd_stack_ids automatically */
|
||||
if (prog_data->uses_btd_stack_ids)
|
||||
r += reg_unit(v.devinfo);
|
||||
}
|
||||
|
||||
num_regs = r;
|
||||
}
|
||||
|
||||
void
|
||||
cs_thread_payload::load_subgroup_id(const fs_builder &bld,
|
||||
fs_reg &dest) const
|
||||
{
|
||||
auto devinfo = bld.shader->devinfo;
|
||||
dest = retype(dest, BRW_REGISTER_TYPE_UD);
|
||||
|
||||
if (subgroup_id_.file != BAD_FILE) {
|
||||
assert(devinfo->verx10 >= 125);
|
||||
bld.AND(dest, subgroup_id_, brw_imm_ud(INTEL_MASK(7, 0)));
|
||||
} else {
|
||||
assert(devinfo->verx10 < 125);
|
||||
assert(gl_shader_stage_is_compute(bld.shader->stage));
|
||||
int index = brw_get_subgroup_id_param_index(devinfo,
|
||||
bld.shader->stage_prog_data);
|
||||
bld.MOV(dest, fs_reg(UNIFORM, index, BRW_REGISTER_TYPE_UD));
|
||||
}
|
||||
}
|
||||
|
||||
task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
|
||||
: cs_thread_payload(v)
|
||||
{
|
||||
/* Task and Mesh Shader Payloads (SIMD8 and SIMD16)
|
||||
*
|
||||
* R0: Header
|
||||
* R1: Local_ID.X[0-7 or 0-15]
|
||||
* R2: Inline Parameter
|
||||
*
|
||||
* Task and Mesh Shader Payloads (SIMD32)
|
||||
*
|
||||
* R0: Header
|
||||
* R1: Local_ID.X[0-15]
|
||||
* R2: Local_ID.X[16-31]
|
||||
* R3: Inline Parameter
|
||||
*
|
||||
* Local_ID.X values are 16 bits.
|
||||
*
|
||||
* Inline parameter is optional but always present since we use it to pass
|
||||
* the address to descriptors.
|
||||
*/
|
||||
|
||||
const fs_builder bld = fs_builder(&v).at_end();
|
||||
|
||||
unsigned r = 0;
|
||||
assert(subgroup_id_.file != BAD_FILE);
|
||||
extended_parameter_0 = retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);
|
||||
|
||||
if (v.devinfo->ver >= 20) {
|
||||
urb_output = brw_ud1_grf(1, 0);
|
||||
} else {
|
||||
urb_output = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
/* In both mesh and task shader payload, lower 16 bits of g0.6 is
|
||||
* an offset within Slice's Local URB, which says where shader is
|
||||
* supposed to output its data.
|
||||
*/
|
||||
bld.AND(urb_output, brw_ud1_grf(0, 6), brw_imm_ud(0xFFFF));
|
||||
}
|
||||
|
||||
if (v.stage == MESA_SHADER_MESH) {
|
||||
/* g0.7 is Task Shader URB Entry Offset, which contains both an offset
|
||||
* within Slice's Local USB (bits 0:15) and a slice selector
|
||||
* (bits 16:24). Slice selector can be non zero when mesh shader
|
||||
* is spawned on slice other than the one where task shader was run.
|
||||
* Bit 24 says that Slice ID is present and bits 16:23 is the Slice ID.
|
||||
*/
|
||||
task_urb_input = brw_ud1_grf(0, 7);
|
||||
}
|
||||
r += reg_unit(v.devinfo);
|
||||
|
||||
local_index = brw_uw8_grf(r, 0);
|
||||
r += reg_unit(v.devinfo);
|
||||
if (v.devinfo->ver < 20 && v.dispatch_width == 32)
|
||||
r += reg_unit(v.devinfo);
|
||||
|
||||
inline_parameter = brw_ud1_grf(r, 0);
|
||||
r += reg_unit(v.devinfo);
|
||||
|
||||
num_regs = r;
|
||||
}
|
||||
|
||||
bs_thread_payload::bs_thread_payload(const fs_visitor &v)
|
||||
{
|
||||
unsigned r = 0;
|
||||
|
||||
/* R0: Thread header. */
|
||||
r += reg_unit(v.devinfo);
|
||||
|
||||
/* R1: Stack IDs. */
|
||||
r += reg_unit(v.devinfo);
|
||||
|
||||
/* R2: Inline Parameter. Used for argument addresses. */
|
||||
global_arg_ptr = brw_ud1_grf(r, 0);
|
||||
local_arg_ptr = brw_ud1_grf(r, 2);
|
||||
r += reg_unit(v.devinfo);
|
||||
|
||||
num_regs = r;
|
||||
}
|
||||
|
||||
void
|
||||
bs_thread_payload::load_shader_type(const fs_builder &bld, fs_reg &dest) const
|
||||
{
|
||||
fs_reg ud_dest = retype(dest, BRW_REGISTER_TYPE_UD);
|
||||
bld.MOV(ud_dest, retype(brw_vec1_grf(0, 3), ud_dest.type));
|
||||
bld.AND(ud_dest, ud_dest, brw_imm_ud(0xf));
|
||||
}
|
||||
199
src/intel/compiler/elk/brw_fs_validate.cpp
Normal file
199
src/intel/compiler/elk/brw_fs_validate.cpp
Normal file
|
|
@ -0,0 +1,199 @@
|
|||
/*
|
||||
* Copyright © 2015 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/** @file brw_fs_validate.cpp
|
||||
*
|
||||
* Implements a pass that validates various invariants of the IR. The current
|
||||
* pass only validates that GRF's uses are sane. More can be added later.
|
||||
*/
|
||||
|
||||
#include "brw_fs.h"
|
||||
#include "brw_cfg.h"
|
||||
|
||||
#define fsv_assert(assertion) \
|
||||
{ \
|
||||
if (!(assertion)) { \
|
||||
fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", \
|
||||
_mesa_shader_stage_to_abbrev(stage)); \
|
||||
dump_instruction(inst, stderr); \
|
||||
fprintf(stderr, "%s:%d: '%s' failed\n", __FILE__, __LINE__, #assertion); \
|
||||
abort(); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define fsv_assert_eq(first, second) \
|
||||
{ \
|
||||
unsigned f = (first); \
|
||||
unsigned s = (second); \
|
||||
if (f != s) { \
|
||||
fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", \
|
||||
_mesa_shader_stage_to_abbrev(stage)); \
|
||||
dump_instruction(inst, stderr); \
|
||||
fprintf(stderr, "%s:%d: A == B failed\n", __FILE__, __LINE__); \
|
||||
fprintf(stderr, " A = %s = %u\n", #first, f); \
|
||||
fprintf(stderr, " B = %s = %u\n", #second, s); \
|
||||
abort(); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define fsv_assert_ne(first, second) \
|
||||
{ \
|
||||
unsigned f = (first); \
|
||||
unsigned s = (second); \
|
||||
if (f == s) { \
|
||||
fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", \
|
||||
_mesa_shader_stage_to_abbrev(stage)); \
|
||||
dump_instruction(inst, stderr); \
|
||||
fprintf(stderr, "%s:%d: A != B failed\n", __FILE__, __LINE__); \
|
||||
fprintf(stderr, " A = %s = %u\n", #first, f); \
|
||||
fprintf(stderr, " B = %s = %u\n", #second, s); \
|
||||
abort(); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define fsv_assert_lte(first, second) \
|
||||
{ \
|
||||
unsigned f = (first); \
|
||||
unsigned s = (second); \
|
||||
if (f > s) { \
|
||||
fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", \
|
||||
_mesa_shader_stage_to_abbrev(stage)); \
|
||||
dump_instruction(inst, stderr); \
|
||||
fprintf(stderr, "%s:%d: A <= B failed\n", __FILE__, __LINE__); \
|
||||
fprintf(stderr, " A = %s = %u\n", #first, f); \
|
||||
fprintf(stderr, " B = %s = %u\n", #second, s); \
|
||||
abort(); \
|
||||
} \
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
void
|
||||
fs_visitor::validate()
|
||||
{
|
||||
cfg->validate(_mesa_shader_stage_to_abbrev(stage));
|
||||
|
||||
foreach_block_and_inst (block, fs_inst, inst, cfg) {
|
||||
switch (inst->opcode) {
|
||||
case SHADER_OPCODE_SEND:
|
||||
fsv_assert(is_uniform(inst->src[0]) && is_uniform(inst->src[1]));
|
||||
break;
|
||||
|
||||
case BRW_OPCODE_MOV:
|
||||
fsv_assert(inst->sources == 1);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
if (inst->is_3src(compiler)) {
|
||||
const unsigned integer_sources =
|
||||
brw_reg_type_is_integer(inst->src[0].type) +
|
||||
brw_reg_type_is_integer(inst->src[1].type) +
|
||||
brw_reg_type_is_integer(inst->src[2].type);
|
||||
const unsigned float_sources =
|
||||
brw_reg_type_is_floating_point(inst->src[0].type) +
|
||||
brw_reg_type_is_floating_point(inst->src[1].type) +
|
||||
brw_reg_type_is_floating_point(inst->src[2].type);
|
||||
|
||||
fsv_assert((integer_sources == 3 && float_sources == 0) ||
|
||||
(integer_sources == 0 && float_sources == 3));
|
||||
|
||||
if (devinfo->ver >= 10) {
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file == BRW_IMMEDIATE_VALUE)
|
||||
continue;
|
||||
|
||||
switch (inst->src[i].vstride) {
|
||||
case BRW_VERTICAL_STRIDE_0:
|
||||
case BRW_VERTICAL_STRIDE_4:
|
||||
case BRW_VERTICAL_STRIDE_8:
|
||||
case BRW_VERTICAL_STRIDE_16:
|
||||
break;
|
||||
|
||||
case BRW_VERTICAL_STRIDE_1:
|
||||
fsv_assert_lte(12, devinfo->ver);
|
||||
break;
|
||||
|
||||
case BRW_VERTICAL_STRIDE_2:
|
||||
fsv_assert_lte(devinfo->ver, 11);
|
||||
break;
|
||||
|
||||
default:
|
||||
fsv_assert(!"invalid vstride");
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if (grf_used != 0) {
|
||||
/* Only perform the pre-Gfx10 checks after register allocation has
|
||||
* occured.
|
||||
*
|
||||
* Many passes (e.g., constant copy propagation) will genenerate
|
||||
* invalid 3-source instructions with the expectation that later
|
||||
* passes (e.g., combine constants) will fix them.
|
||||
*/
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
fsv_assert_ne(inst->src[i].file, BRW_IMMEDIATE_VALUE);
|
||||
|
||||
/* A stride of 1 (the usual case) or 0, with a special
|
||||
* "repctrl" bit, is allowed. The repctrl bit doesn't work for
|
||||
* 64-bit datatypes, so if the source type is 64-bit then only
|
||||
* a stride of 1 is allowed. From the Broadwell PRM, Volume 7
|
||||
* "3D Media GPGPU", page 944:
|
||||
*
|
||||
* This is applicable to 32b datatypes and 16b datatype. 64b
|
||||
* datatypes cannot use the replicate control.
|
||||
*/
|
||||
fsv_assert_lte(inst->src[i].vstride, 1);
|
||||
|
||||
if (type_sz(inst->src[i].type) > 4)
|
||||
fsv_assert_eq(inst->src[i].vstride, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->dst.file == VGRF) {
|
||||
fsv_assert_lte(inst->dst.offset / REG_SIZE + regs_written(inst),
|
||||
alloc.sizes[inst->dst.nr]);
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < inst->sources; i++) {
|
||||
if (inst->src[i].file == VGRF) {
|
||||
fsv_assert_lte(inst->src[i].offset / REG_SIZE + regs_read(inst, i),
|
||||
alloc.sizes[inst->src[i].nr]);
|
||||
}
|
||||
}
|
||||
|
||||
/* Accumulator Registers, bspec 47251:
|
||||
*
|
||||
* "When destination is accumulator with offset 0, destination
|
||||
* horizontal stride must be 1."
|
||||
*/
|
||||
if (intel_needs_workaround(devinfo, 14014617373) &&
|
||||
inst->dst.is_accumulator() &&
|
||||
inst->dst.offset == 0) {
|
||||
fsv_assert_eq(inst->dst.stride, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
1266
src/intel/compiler/elk/brw_fs_visitor.cpp
Normal file
1266
src/intel/compiler/elk/brw_fs_visitor.cpp
Normal file
File diff suppressed because it is too large
Load diff
2566
src/intel/compiler/elk/brw_gram.y
Normal file
2566
src/intel/compiler/elk/brw_gram.y
Normal file
File diff suppressed because it is too large
Load diff
1732
src/intel/compiler/elk/brw_inst.h
Normal file
1732
src/intel/compiler/elk/brw_inst.h
Normal file
File diff suppressed because it is too large
Load diff
108
src/intel/compiler/elk/brw_interpolation_map.c
Normal file
108
src/intel/compiler/elk/brw_interpolation_map.c
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
/*
|
||||
* Copyright © 2013 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_compiler.h"
|
||||
#include "compiler/nir/nir.h"
|
||||
|
||||
static char const *get_qual_name(int mode)
|
||||
{
|
||||
switch (mode) {
|
||||
case INTERP_MODE_NONE: return "none";
|
||||
case INTERP_MODE_FLAT: return "flat";
|
||||
case INTERP_MODE_SMOOTH: return "smooth";
|
||||
case INTERP_MODE_NOPERSPECTIVE: return "nopersp";
|
||||
default: return "???";
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
gfx4_frag_prog_set_interp_modes(struct brw_wm_prog_data *prog_data,
|
||||
const struct intel_vue_map *vue_map,
|
||||
unsigned location, unsigned slot_count,
|
||||
enum glsl_interp_mode interp)
|
||||
{
|
||||
for (unsigned k = 0; k < slot_count; k++) {
|
||||
unsigned slot = vue_map->varying_to_slot[location + k];
|
||||
if (slot != -1 && prog_data->interp_mode[slot] == INTERP_MODE_NONE) {
|
||||
prog_data->interp_mode[slot] = interp;
|
||||
|
||||
if (prog_data->interp_mode[slot] == INTERP_MODE_FLAT) {
|
||||
prog_data->contains_flat_varying = true;
|
||||
} else if (prog_data->interp_mode[slot] == INTERP_MODE_NOPERSPECTIVE) {
|
||||
prog_data->contains_noperspective_varying = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Set up interpolation modes for every element in the VUE */
|
||||
void
|
||||
brw_setup_vue_interpolation(const struct intel_vue_map *vue_map, nir_shader *nir,
|
||||
struct brw_wm_prog_data *prog_data)
|
||||
{
|
||||
/* Initialise interp_mode. INTERP_MODE_NONE == 0 */
|
||||
memset(prog_data->interp_mode, 0, sizeof(prog_data->interp_mode));
|
||||
|
||||
if (!vue_map)
|
||||
return;
|
||||
|
||||
/* HPOS always wants noperspective. setting it up here allows
|
||||
* us to not need special handling in the SF program.
|
||||
*/
|
||||
unsigned pos_slot = vue_map->varying_to_slot[VARYING_SLOT_POS];
|
||||
if (pos_slot != -1) {;
|
||||
prog_data->interp_mode[pos_slot] = INTERP_MODE_NOPERSPECTIVE;
|
||||
prog_data->contains_noperspective_varying = true;
|
||||
}
|
||||
|
||||
nir_foreach_shader_in_variable(var, nir) {
|
||||
unsigned location = var->data.location;
|
||||
unsigned slot_count = glsl_count_attribute_slots(var->type, false);
|
||||
|
||||
gfx4_frag_prog_set_interp_modes(prog_data, vue_map, location, slot_count,
|
||||
var->data.interpolation);
|
||||
|
||||
if (location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1) {
|
||||
location = location + VARYING_SLOT_BFC0 - VARYING_SLOT_COL0;
|
||||
gfx4_frag_prog_set_interp_modes(prog_data, vue_map, location,
|
||||
slot_count, var->data.interpolation);
|
||||
}
|
||||
}
|
||||
|
||||
const bool debug = false;
|
||||
if (debug) {
|
||||
fprintf(stderr, "VUE map:\n");
|
||||
for (int i = 0; i < vue_map->num_slots; i++) {
|
||||
int varying = vue_map->slot_to_varying[i];
|
||||
if (varying == -1) {
|
||||
fprintf(stderr, "%d: --\n", i);
|
||||
continue;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%d: %d %s ofs %d\n",
|
||||
i, varying,
|
||||
get_qual_name(prog_data->interp_mode[i]),
|
||||
brw_vue_slot_to_offset(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
216
src/intel/compiler/elk/brw_ir.h
Normal file
216
src/intel/compiler/elk/brw_ir.h
Normal file
|
|
@ -0,0 +1,216 @@
|
|||
/* -*- c++ -*- */
|
||||
/*
|
||||
* Copyright © 2010-2016 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_IR_H
|
||||
#define BRW_IR_H
|
||||
|
||||
#include <assert.h>
|
||||
#include "brw_reg.h"
|
||||
#include "compiler/glsl/list.h"
|
||||
|
||||
#define MAX_SAMPLER_MESSAGE_SIZE 11
|
||||
|
||||
/* The sampler can return a vec5 when sampling with sparse residency. In
|
||||
* SIMD32, each component takes up 4 GRFs, so we need to allow up to size-20
|
||||
* VGRFs to hold the result.
|
||||
*/
|
||||
#define MAX_VGRF_SIZE(devinfo) ((devinfo)->ver >= 20 ? 40 : 20)
|
||||
|
||||
#ifdef __cplusplus
|
||||
struct backend_reg : private brw_reg
|
||||
{
|
||||
backend_reg() {}
|
||||
backend_reg(const struct brw_reg ®) : brw_reg(reg), offset(0) {}
|
||||
|
||||
const brw_reg &as_brw_reg() const
|
||||
{
|
||||
assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM);
|
||||
assert(offset == 0);
|
||||
return static_cast<const brw_reg &>(*this);
|
||||
}
|
||||
|
||||
brw_reg &as_brw_reg()
|
||||
{
|
||||
assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM);
|
||||
assert(offset == 0);
|
||||
return static_cast<brw_reg &>(*this);
|
||||
}
|
||||
|
||||
bool equals(const backend_reg &r) const;
|
||||
bool negative_equals(const backend_reg &r) const;
|
||||
|
||||
bool is_zero() const;
|
||||
bool is_one() const;
|
||||
bool is_negative_one() const;
|
||||
bool is_null() const;
|
||||
bool is_accumulator() const;
|
||||
|
||||
/** Offset from the start of the (virtual) register in bytes. */
|
||||
uint16_t offset;
|
||||
|
||||
using brw_reg::type;
|
||||
using brw_reg::file;
|
||||
using brw_reg::negate;
|
||||
using brw_reg::abs;
|
||||
using brw_reg::address_mode;
|
||||
using brw_reg::subnr;
|
||||
using brw_reg::nr;
|
||||
|
||||
using brw_reg::swizzle;
|
||||
using brw_reg::writemask;
|
||||
using brw_reg::indirect_offset;
|
||||
using brw_reg::vstride;
|
||||
using brw_reg::width;
|
||||
using brw_reg::hstride;
|
||||
|
||||
using brw_reg::df;
|
||||
using brw_reg::f;
|
||||
using brw_reg::d;
|
||||
using brw_reg::ud;
|
||||
using brw_reg::d64;
|
||||
using brw_reg::u64;
|
||||
};
|
||||
|
||||
struct bblock_t;
|
||||
|
||||
struct backend_instruction : public exec_node {
|
||||
bool is_3src(const struct brw_compiler *compiler) const;
|
||||
bool is_math() const;
|
||||
bool is_control_flow_begin() const;
|
||||
bool is_control_flow_end() const;
|
||||
bool is_control_flow() const;
|
||||
bool is_commutative() const;
|
||||
bool can_do_source_mods() const;
|
||||
bool can_do_saturate() const;
|
||||
bool can_do_cmod() const;
|
||||
bool reads_accumulator_implicitly() const;
|
||||
bool writes_accumulator_implicitly(const struct intel_device_info *devinfo) const;
|
||||
|
||||
/**
|
||||
* Instructions that use indirect addressing have additional register
|
||||
* regioning restrictions.
|
||||
*/
|
||||
bool uses_indirect_addressing() const;
|
||||
|
||||
void remove(bblock_t *block, bool defer_later_block_ip_updates = false);
|
||||
void insert_after(bblock_t *block, backend_instruction *inst);
|
||||
void insert_before(bblock_t *block, backend_instruction *inst);
|
||||
|
||||
/**
|
||||
* True if the instruction has side effects other than writing to
|
||||
* its destination registers. You are expected not to reorder or
|
||||
* optimize these out unless you know what you are doing.
|
||||
*/
|
||||
bool has_side_effects() const;
|
||||
|
||||
/**
|
||||
* True if the instruction might be affected by side effects of other
|
||||
* instructions.
|
||||
*/
|
||||
bool is_volatile() const;
|
||||
#else
|
||||
struct backend_instruction {
|
||||
struct exec_node link;
|
||||
#endif
|
||||
/** @{
|
||||
* Annotation for the generated IR. One of the two can be set.
|
||||
*/
|
||||
const void *ir;
|
||||
const char *annotation;
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
* Execution size of the instruction. This is used by the generator to
|
||||
* generate the correct binary for the given instruction. Current valid
|
||||
* values are 1, 4, 8, 16, 32.
|
||||
*/
|
||||
uint8_t exec_size;
|
||||
|
||||
/**
|
||||
* Channel group from the hardware execution and predication mask that
|
||||
* should be applied to the instruction. The subset of channel enable
|
||||
* signals (calculated from the EU control flow and predication state)
|
||||
* given by [group, group + exec_size) will be used to mask GRF writes and
|
||||
* any other side effects of the instruction.
|
||||
*/
|
||||
uint8_t group;
|
||||
|
||||
uint32_t offset; /**< spill/unspill offset or texture offset bitfield */
|
||||
uint8_t mlen; /**< SEND message length */
|
||||
uint8_t ex_mlen; /**< SENDS extended message length */
|
||||
int8_t base_mrf; /**< First MRF in the SEND message, if mlen is nonzero. */
|
||||
uint8_t target; /**< MRT target. */
|
||||
uint8_t sfid; /**< SFID for SEND instructions */
|
||||
uint32_t desc; /**< SEND[S] message descriptor immediate */
|
||||
uint32_t ex_desc; /**< SEND[S] extended message descriptor immediate */
|
||||
unsigned size_written; /**< Data written to the destination register in bytes. */
|
||||
|
||||
enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */
|
||||
enum brw_conditional_mod conditional_mod; /**< BRW_CONDITIONAL_* */
|
||||
enum brw_predicate predicate;
|
||||
bool predicate_inverse:1;
|
||||
bool writes_accumulator:1; /**< instruction implicitly writes accumulator */
|
||||
bool force_writemask_all:1;
|
||||
bool no_dd_clear:1;
|
||||
bool no_dd_check:1;
|
||||
bool saturate:1;
|
||||
bool shadow_compare:1;
|
||||
bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */
|
||||
bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */
|
||||
bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */
|
||||
bool send_ex_desc_scratch:1; /**< Only valid for SHADER_OPCODE_SEND, use
|
||||
* the scratch surface offset to build
|
||||
* extended descriptor
|
||||
*/
|
||||
bool send_ex_bso:1; /**< Only for SHADER_OPCODE_SEND, use extended bindless
|
||||
* surface offset (26bits instead of 20bits)
|
||||
*/
|
||||
bool predicate_trivial:1; /**< The predication mask applied to this
|
||||
* instruction is guaranteed to be uniform and
|
||||
* a superset of the execution mask of the
|
||||
* present block, no currently enabled channels
|
||||
* will be disabled by the predicate.
|
||||
*/
|
||||
bool eot:1;
|
||||
|
||||
/* Chooses which flag subregister (f0.0 to f3.1) is used for conditional
|
||||
* mod and predication.
|
||||
*/
|
||||
unsigned flag_subreg:3;
|
||||
|
||||
/**
|
||||
* Systolic depth used by DPAS instruction.
|
||||
*/
|
||||
unsigned sdepth:4;
|
||||
|
||||
/**
|
||||
* Repeat count used by DPAS instruction.
|
||||
*/
|
||||
unsigned rcount:4;
|
||||
|
||||
/** The number of hardware registers used for a message header. */
|
||||
uint8_t header_size;
|
||||
};
|
||||
|
||||
#endif
|
||||
92
src/intel/compiler/elk/brw_ir_allocator.h
Normal file
92
src/intel/compiler/elk/brw_ir_allocator.h
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
/* -*- c++ -*- */
|
||||
/*
|
||||
* Copyright © 2010-2014 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_IR_ALLOCATOR_H
|
||||
#define BRW_IR_ALLOCATOR_H
|
||||
|
||||
#include "util/compiler.h"
|
||||
#include "util/glheader.h"
|
||||
#include "util/macros.h"
|
||||
#include "util/rounding.h"
|
||||
#include "util/u_math.h"
|
||||
|
||||
namespace brw {
|
||||
/**
|
||||
* Simple allocator used to keep track of virtual GRFs.
|
||||
*/
|
||||
class simple_allocator {
|
||||
public:
|
||||
simple_allocator() :
|
||||
sizes(NULL), offsets(NULL), count(0), total_size(0), capacity(0)
|
||||
{
|
||||
}
|
||||
|
||||
~simple_allocator()
|
||||
{
|
||||
free(offsets);
|
||||
free(sizes);
|
||||
}
|
||||
|
||||
unsigned
|
||||
allocate(unsigned size)
|
||||
{
|
||||
assert(size > 0);
|
||||
if (capacity <= count) {
|
||||
capacity = MAX2(16, capacity * 2);
|
||||
sizes = (unsigned *)realloc(sizes, capacity * sizeof(unsigned));
|
||||
offsets = (unsigned *)realloc(offsets, capacity * sizeof(unsigned));
|
||||
}
|
||||
|
||||
sizes[count] = size;
|
||||
offsets[count] = total_size;
|
||||
total_size += size;
|
||||
|
||||
return count++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Array of sizes for each allocation. The allocation unit is up to the
|
||||
* back-end, but it's expected to be one scalar value in the FS back-end
|
||||
* and one vec4 in the VEC4 back-end.
|
||||
*/
|
||||
unsigned *sizes;
|
||||
|
||||
/**
|
||||
* Array of offsets from the start of the VGRF space in allocation
|
||||
* units.
|
||||
*/
|
||||
unsigned *offsets;
|
||||
|
||||
/** Total number of VGRFs allocated. */
|
||||
unsigned count;
|
||||
|
||||
/** Cumulative size in allocation units. */
|
||||
unsigned total_size;
|
||||
|
||||
private:
|
||||
unsigned capacity;
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
192
src/intel/compiler/elk/brw_ir_analysis.h
Normal file
192
src/intel/compiler/elk/brw_ir_analysis.h
Normal file
|
|
@ -0,0 +1,192 @@
|
|||
/* -*- c++ -*- */
|
||||
/*
|
||||
* Copyright © 2016 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_IR_ANALYSIS_H
|
||||
#define BRW_IR_ANALYSIS_H
|
||||
|
||||
namespace brw {
|
||||
/**
|
||||
* Bitset of state categories that can influence the result of IR analysis
|
||||
* passes.
|
||||
*/
|
||||
enum analysis_dependency_class {
|
||||
/**
|
||||
* The analysis doesn't depend on the IR, its result is effectively a
|
||||
* constant during the compilation.
|
||||
*/
|
||||
DEPENDENCY_NOTHING = 0,
|
||||
/**
|
||||
* The analysis depends on the set of instructions in the program and
|
||||
* their naming. Note that because instructions are named sequentially
|
||||
* by IP this implies a dependency on the control flow edges between
|
||||
* instructions. This will be signaled whenever instructions are
|
||||
* inserted, removed or reordered in the program.
|
||||
*/
|
||||
DEPENDENCY_INSTRUCTION_IDENTITY = 0x1,
|
||||
/**
|
||||
* The analysis is sensitive to the detailed semantics of instructions
|
||||
* in the program, where "detailed" means any change in the instruction
|
||||
* data structures other than the linked-list pointers (which are
|
||||
* already covered by DEPENDENCY_INSTRUCTION_IDENTITY). E.g. changing
|
||||
* the negate or abs flags of an instruction source would signal this
|
||||
* flag alone because it would preserve all other instruction dependency
|
||||
* classes.
|
||||
*/
|
||||
DEPENDENCY_INSTRUCTION_DETAIL = 0x2,
|
||||
/**
|
||||
* The analysis depends on the set of data flow edges between
|
||||
* instructions. This will be signaled whenever the dataflow relation
|
||||
* between instructions has potentially changed, e.g. when the VGRF
|
||||
* index of an instruction source or destination changes (in which case
|
||||
* it will appear in combination with DEPENDENCY_INSTRUCTION_DETAIL), or
|
||||
* when data-dependent instructions are reordered (in which case it will
|
||||
* appear in combination with DEPENDENCY_INSTRUCTION_IDENTITY).
|
||||
*/
|
||||
DEPENDENCY_INSTRUCTION_DATA_FLOW = 0x4,
|
||||
/**
|
||||
* The analysis depends on all instruction dependency classes. These
|
||||
* will typically be signaled simultaneously when inserting or removing
|
||||
* instructions in the program (or if you're feeling too lazy to read
|
||||
* through your optimization pass to figure out which of the instruction
|
||||
* dependency classes above it invalidates).
|
||||
*/
|
||||
DEPENDENCY_INSTRUCTIONS = 0x7,
|
||||
/**
|
||||
* The analysis depends on the set of VGRFs in the program and their
|
||||
* naming. This will be signaled when VGRFs are allocated or released.
|
||||
*/
|
||||
DEPENDENCY_VARIABLES = 0x8,
|
||||
/**
|
||||
* The analysis depends on the set of basic blocks in the program, their
|
||||
* control flow edges and naming.
|
||||
*/
|
||||
DEPENDENCY_BLOCKS = 0x10,
|
||||
/**
|
||||
* The analysis depends on the program being literally the same (good
|
||||
* luck...), any change in the input invalidates previous analysis
|
||||
* computations.
|
||||
*/
|
||||
DEPENDENCY_EVERYTHING = ~0
|
||||
};
|
||||
|
||||
inline analysis_dependency_class
|
||||
operator|(analysis_dependency_class x, analysis_dependency_class y)
|
||||
{
|
||||
return static_cast<analysis_dependency_class>(
|
||||
static_cast<unsigned>(x) | static_cast<unsigned>(y));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiate a program analysis class \p L which can calculate an object of
|
||||
* type \p T as result. \p C is a closure that encapsulates whatever
|
||||
* information is required as argument to run the analysis pass. The purpose
|
||||
* of this class is to make sure that:
|
||||
*
|
||||
* - The analysis pass is executed lazily whenever it's needed and multiple
|
||||
* executions are optimized out as long as the cached result remains marked
|
||||
* up-to-date.
|
||||
*
|
||||
* - There is no way to access the cached analysis result without first
|
||||
* calling L::require(), which makes sure that the analysis pass is rerun
|
||||
* if necessary.
|
||||
*
|
||||
* - The cached result doesn't become inconsistent with the program for as
|
||||
* long as it remains marked up-to-date. (This is only enforced in debug
|
||||
* builds for performance reasons)
|
||||
*
|
||||
* The requirements on \p T are the following:
|
||||
*
|
||||
* - Constructible with a single argument, as in 'x = T(c)' for \p c of type
|
||||
* \p C.
|
||||
*
|
||||
* - 'x.dependency_class()' on const \p x returns a bitset of
|
||||
* brw::analysis_dependency_class specifying the set of IR objects that are
|
||||
* required to remain invariant for the cached analysis result to be
|
||||
* considered valid.
|
||||
*
|
||||
* - 'x.validate(c)' on const \p x returns a boolean result specifying
|
||||
* whether the analysis result \p x is consistent with the input IR. This
|
||||
* is currently only used for validation in debug builds.
|
||||
*/
|
||||
template<class T, class C>
|
||||
class brw_analysis {
|
||||
public:
|
||||
/**
|
||||
* Construct a program analysis. \p c is an arbitrary object
|
||||
* passed as argument to the constructor of the analysis result
|
||||
* object of type \p T.
|
||||
*/
|
||||
brw_analysis(const C *c) : c(c), p(NULL) {}
|
||||
|
||||
/**
|
||||
* Destroy a program analysis.
|
||||
*/
|
||||
~brw_analysis()
|
||||
{
|
||||
delete p;
|
||||
}
|
||||
|
||||
/**
|
||||
* Obtain the result of a program analysis. This gives a
|
||||
* guaranteed up-to-date result, the analysis pass will be
|
||||
* rerun implicitly if it has become stale.
|
||||
*/
|
||||
T &
|
||||
require()
|
||||
{
|
||||
if (p)
|
||||
assert(p->validate(c));
|
||||
else
|
||||
p = new T(c);
|
||||
|
||||
return *p;
|
||||
}
|
||||
|
||||
const T &
|
||||
require() const
|
||||
{
|
||||
return const_cast<brw_analysis<T, C> *>(this)->require();
|
||||
}
|
||||
|
||||
/**
|
||||
* Report that dependencies of the analysis pass may have changed
|
||||
* since the last calculation and the cached analysis result may
|
||||
* have to be discarded.
|
||||
*/
|
||||
void
|
||||
invalidate(brw::analysis_dependency_class c)
|
||||
{
|
||||
if (p && (c & p->dependency_class())) {
|
||||
delete p;
|
||||
p = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const C *c;
|
||||
T *p;
|
||||
};
|
||||
|
||||
#endif
|
||||
737
src/intel/compiler/elk/brw_ir_fs.h
Normal file
737
src/intel/compiler/elk/brw_ir_fs.h
Normal file
|
|
@ -0,0 +1,737 @@
|
|||
/* -*- c++ -*- */
|
||||
/*
|
||||
* Copyright © 2010-2015 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_IR_FS_H
|
||||
#define BRW_IR_FS_H
|
||||
|
||||
#include "brw_shader.h"
|
||||
|
||||
class fs_inst;
|
||||
|
||||
class fs_reg : public backend_reg {
|
||||
public:
|
||||
DECLARE_RALLOC_CXX_OPERATORS(fs_reg)
|
||||
|
||||
void init();
|
||||
|
||||
fs_reg();
|
||||
fs_reg(struct ::brw_reg reg);
|
||||
fs_reg(enum brw_reg_file file, unsigned nr);
|
||||
fs_reg(enum brw_reg_file file, unsigned nr, enum brw_reg_type type);
|
||||
|
||||
bool equals(const fs_reg &r) const;
|
||||
bool negative_equals(const fs_reg &r) const;
|
||||
bool is_contiguous() const;
|
||||
|
||||
/**
|
||||
* Return the size in bytes of a single logical component of the
|
||||
* register assuming the given execution width.
|
||||
*/
|
||||
unsigned component_size(unsigned width) const;
|
||||
|
||||
/** Register region horizontal stride */
|
||||
uint8_t stride;
|
||||
};
|
||||
|
||||
static inline fs_reg
|
||||
negate(fs_reg reg)
|
||||
{
|
||||
assert(reg.file != IMM);
|
||||
reg.negate = !reg.negate;
|
||||
return reg;
|
||||
}
|
||||
|
||||
static inline fs_reg
|
||||
retype(fs_reg reg, enum brw_reg_type type)
|
||||
{
|
||||
reg.type = type;
|
||||
return reg;
|
||||
}
|
||||
|
||||
static inline fs_reg
|
||||
byte_offset(fs_reg reg, unsigned delta)
|
||||
{
|
||||
switch (reg.file) {
|
||||
case BAD_FILE:
|
||||
break;
|
||||
case VGRF:
|
||||
case ATTR:
|
||||
case UNIFORM:
|
||||
reg.offset += delta;
|
||||
break;
|
||||
case MRF: {
|
||||
const unsigned suboffset = reg.offset + delta;
|
||||
reg.nr += suboffset / REG_SIZE;
|
||||
reg.offset = suboffset % REG_SIZE;
|
||||
break;
|
||||
}
|
||||
case ARF:
|
||||
case FIXED_GRF: {
|
||||
const unsigned suboffset = reg.subnr + delta;
|
||||
reg.nr += suboffset / REG_SIZE;
|
||||
reg.subnr = suboffset % REG_SIZE;
|
||||
break;
|
||||
}
|
||||
case IMM:
|
||||
default:
|
||||
assert(delta == 0);
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
/**
 * Offset \p reg by \p delta scalar channels, taking the region stride into
 * account for each register file.
 */
static inline fs_reg
horiz_offset(const fs_reg &reg, unsigned delta)
{
   switch (reg.file) {
   case BAD_FILE:
   case UNIFORM:
   case IMM:
      /* These only have a single component that is implicitly splatted.  A
       * horizontal offset should be a harmless no-op.
       * XXX - Handle vector immediates correctly.
       */
      return reg;
   case VGRF:
   case MRF:
   case ATTR:
      return byte_offset(reg, delta * reg.stride * type_sz(reg.type));
   case ARF:
   case FIXED_GRF:
      if (reg.is_null()) {
         return reg;
      } else {
         /* Decode the log2-encoded hardware strides into element units. */
         const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
         const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
         const unsigned width = 1 << reg.width;

         if (delta % width == 0) {
            /* Whole rows: step by the vertical stride. */
            return byte_offset(reg, delta / width * vstride * type_sz(reg.type));
         } else {
            /* Partial rows only work when the region is fully packed. */
            assert(vstride == hstride * width);
            return byte_offset(reg, delta * hstride * type_sz(reg.type));
         }
      }
   }
   unreachable("Invalid register file");
}
|
||||
|
||||
/**
 * Offset \p reg by \p delta logical components, where each component spans
 * reg.component_size(\p width) bytes.
 */
static inline fs_reg
offset(fs_reg reg, unsigned width, unsigned delta)
{
   switch (reg.file) {
   case BAD_FILE:
      break;
   case ARF:
   case FIXED_GRF:
   case MRF:
   case VGRF:
   case ATTR:
   case UNIFORM:
      return byte_offset(reg, delta * reg.component_size(width));
   case IMM:
      assert(delta == 0);
   }
   return reg;
}
|
||||
|
||||
/**
|
||||
* Get the scalar channel of \p reg given by \p idx and replicate it to all
|
||||
* channels of the result.
|
||||
*/
|
||||
static inline fs_reg
|
||||
component(fs_reg reg, unsigned idx)
|
||||
{
|
||||
reg = horiz_offset(reg, idx);
|
||||
reg.stride = 0;
|
||||
if (reg.file == ARF || reg.file == FIXED_GRF) {
|
||||
reg.vstride = BRW_VERTICAL_STRIDE_0;
|
||||
reg.width = BRW_WIDTH_1;
|
||||
reg.hstride = BRW_HORIZONTAL_STRIDE_0;
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return an integer identifying the discrete address space a register is
|
||||
* contained in. A register is by definition fully contained in the single
|
||||
* reg_space it belongs to, so two registers with different reg_space ids are
|
||||
* guaranteed not to overlap. Most register files are a single reg_space of
|
||||
* its own, only the VGRF and ATTR files are composed of multiple discrete
|
||||
* address spaces, one for each allocation and input attribute respectively.
|
||||
*/
|
||||
static inline uint32_t
|
||||
reg_space(const fs_reg &r)
|
||||
{
|
||||
return r.file << 16 | (r.file == VGRF || r.file == ATTR ? r.nr : 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the base offset in bytes of a register relative to the start of its
|
||||
* reg_space().
|
||||
*/
|
||||
static inline unsigned
|
||||
reg_offset(const fs_reg &r)
|
||||
{
|
||||
return (r.file == VGRF || r.file == IMM || r.file == ATTR ? 0 : r.nr) *
|
||||
(r.file == UNIFORM ? 4 : REG_SIZE) + r.offset +
|
||||
(r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the amount of padding in bytes left unused between individual
|
||||
* components of register \p r due to a (horizontal) stride value greater than
|
||||
* one, or zero if components are tightly packed in the register file.
|
||||
*/
|
||||
static inline unsigned
|
||||
reg_padding(const fs_reg &r)
|
||||
{
|
||||
const unsigned stride = ((r.file != ARF && r.file != FIXED_GRF) ? r.stride :
|
||||
r.hstride == 0 ? 0 :
|
||||
1 << (r.hstride - 1));
|
||||
return (MAX2(1, stride) - 1) * type_sz(r.type);
|
||||
}
|
||||
|
||||
/* Do not call this directly. Call regions_overlap() instead. */
|
||||
static inline bool
|
||||
regions_overlap_MRF(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
|
||||
{
|
||||
if (r.nr & BRW_MRF_COMPR4) {
|
||||
fs_reg t = r;
|
||||
t.nr &= ~BRW_MRF_COMPR4;
|
||||
/* COMPR4 regions are translated by the hardware during decompression
|
||||
* into two separate half-regions 4 MRFs apart from each other.
|
||||
*
|
||||
* Note: swapping s and t in this parameter list eliminates one possible
|
||||
* level of recursion (since the s in the called versions of
|
||||
* regions_overlap_MRF can't be COMPR4), and that makes the compiled
|
||||
* code a lot smaller.
|
||||
*/
|
||||
return regions_overlap_MRF(s, ds, t, dr / 2) ||
|
||||
regions_overlap_MRF(s, ds, byte_offset(t, 4 * REG_SIZE), dr / 2);
|
||||
} else if (s.nr & BRW_MRF_COMPR4) {
|
||||
return regions_overlap_MRF(s, ds, r, dr);
|
||||
}
|
||||
|
||||
return !((r.nr * REG_SIZE + r.offset + dr) <= (s.nr * REG_SIZE + s.offset) ||
|
||||
(s.nr * REG_SIZE + s.offset + ds) <= (r.nr * REG_SIZE + r.offset));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether the register region starting at \p r and spanning \p dr
|
||||
* bytes could potentially overlap the register region starting at \p s and
|
||||
* spanning \p ds bytes.
|
||||
*/
|
||||
static inline bool
|
||||
regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
|
||||
{
|
||||
if (r.file != s.file)
|
||||
return false;
|
||||
|
||||
if (r.file == VGRF) {
|
||||
return r.nr == s.nr &&
|
||||
!(r.offset + dr <= s.offset || s.offset + ds <= r.offset);
|
||||
} else if (r.file != MRF) {
|
||||
return !(reg_offset(r) + dr <= reg_offset(s) ||
|
||||
reg_offset(s) + ds <= reg_offset(r));
|
||||
} else {
|
||||
return regions_overlap_MRF(r, dr, s, ds);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that the register region given by r [r.offset, r.offset + dr[
|
||||
* is fully contained inside the register region given by s
|
||||
* [s.offset, s.offset + ds[.
|
||||
*/
|
||||
static inline bool
|
||||
region_contained_in(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
|
||||
{
|
||||
return reg_space(r) == reg_space(s) &&
|
||||
reg_offset(r) >= reg_offset(s) &&
|
||||
reg_offset(r) + dr <= reg_offset(s) + ds;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether the given register region is n-periodic, i.e. whether the
|
||||
* original region remains invariant after shifting it by \p n scalar
|
||||
* channels.
|
||||
*/
|
||||
static inline bool
|
||||
is_periodic(const fs_reg ®, unsigned n)
|
||||
{
|
||||
if (reg.file == BAD_FILE || reg.is_null()) {
|
||||
return true;
|
||||
|
||||
} else if (reg.file == IMM) {
|
||||
const unsigned period = (reg.type == BRW_REGISTER_TYPE_UV ||
|
||||
reg.type == BRW_REGISTER_TYPE_V ? 8 :
|
||||
reg.type == BRW_REGISTER_TYPE_VF ? 4 :
|
||||
1);
|
||||
return n % period == 0;
|
||||
|
||||
} else if (reg.file == ARF || reg.file == FIXED_GRF) {
|
||||
const unsigned period = (reg.hstride == 0 && reg.vstride == 0 ? 1 :
|
||||
reg.vstride == 0 ? 1 << reg.width :
|
||||
~0);
|
||||
return n % period == 0;
|
||||
|
||||
} else {
|
||||
return reg.stride == 0;
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool
|
||||
is_uniform(const fs_reg ®)
|
||||
{
|
||||
return is_periodic(reg, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the specified 8-component quarter of a register.
|
||||
*/
|
||||
static inline fs_reg
|
||||
quarter(const fs_reg ®, unsigned idx)
|
||||
{
|
||||
assert(idx < 4);
|
||||
return horiz_offset(reg, 8 * idx);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reinterpret each channel of register \p reg as a vector of values of the
|
||||
* given smaller type and take the i-th subcomponent from each.
|
||||
*/
|
||||
static inline fs_reg
|
||||
subscript(fs_reg reg, brw_reg_type type, unsigned i)
|
||||
{
|
||||
assert((i + 1) * type_sz(type) <= type_sz(reg.type));
|
||||
|
||||
if (reg.file == ARF || reg.file == FIXED_GRF) {
|
||||
/* The stride is encoded inconsistently for fixed GRF and ARF registers
|
||||
* as the log2 of the actual vertical and horizontal strides.
|
||||
*/
|
||||
const int delta = util_logbase2(type_sz(reg.type)) -
|
||||
util_logbase2(type_sz(type));
|
||||
reg.hstride += (reg.hstride ? delta : 0);
|
||||
reg.vstride += (reg.vstride ? delta : 0);
|
||||
|
||||
} else if (reg.file == IMM) {
|
||||
unsigned bit_size = type_sz(type) * 8;
|
||||
reg.u64 >>= i * bit_size;
|
||||
reg.u64 &= BITFIELD64_MASK(bit_size);
|
||||
if (bit_size <= 16)
|
||||
reg.u64 |= reg.u64 << 16;
|
||||
return retype(reg, type);
|
||||
} else {
|
||||
reg.stride *= type_sz(reg.type) / type_sz(type);
|
||||
}
|
||||
|
||||
return byte_offset(retype(reg, type), i * type_sz(type));
|
||||
}
|
||||
|
||||
/** Multiply the horizontal stride of \p reg by \p s. */
static inline fs_reg
horiz_stride(fs_reg reg, unsigned s)
{
   reg.stride *= s;
   return reg;
}

/** Convenience BAD_FILE register for unused sources. */
static const fs_reg reg_undef;
|
||||
|
||||
class fs_inst : public backend_instruction {
|
||||
fs_inst &operator=(const fs_inst &);
|
||||
|
||||
void init(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
|
||||
const fs_reg *src, unsigned sources);
|
||||
|
||||
public:
|
||||
DECLARE_RALLOC_CXX_OPERATORS(fs_inst)
|
||||
|
||||
fs_inst();
|
||||
fs_inst(enum opcode opcode, uint8_t exec_size);
|
||||
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst);
|
||||
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
|
||||
const fs_reg &src0);
|
||||
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
|
||||
const fs_reg &src0, const fs_reg &src1);
|
||||
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
|
||||
const fs_reg &src0, const fs_reg &src1, const fs_reg &src2);
|
||||
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
|
||||
const fs_reg src[], unsigned sources);
|
||||
fs_inst(const fs_inst &that);
|
||||
~fs_inst();
|
||||
|
||||
void resize_sources(uint8_t num_sources);
|
||||
|
||||
bool is_send_from_grf() const;
|
||||
bool is_payload(unsigned arg) const;
|
||||
bool is_partial_write() const;
|
||||
unsigned components_read(unsigned i) const;
|
||||
unsigned size_read(int arg) const;
|
||||
bool can_do_source_mods(const struct intel_device_info *devinfo) const;
|
||||
bool can_do_cmod();
|
||||
bool can_change_types() const;
|
||||
bool has_source_and_destination_hazard() const;
|
||||
unsigned implied_mrf_writes() const;
|
||||
|
||||
/**
|
||||
* Return whether \p arg is a control source of a virtual instruction which
|
||||
* shouldn't contribute to the execution type and usual regioning
|
||||
* restriction calculations of arithmetic instructions.
|
||||
*/
|
||||
bool is_control_source(unsigned arg) const;
|
||||
|
||||
/**
|
||||
* Return the subset of flag registers read by the instruction as a bitset
|
||||
* with byte granularity.
|
||||
*/
|
||||
unsigned flags_read(const intel_device_info *devinfo) const;
|
||||
|
||||
/**
|
||||
* Return the subset of flag registers updated by the instruction (either
|
||||
* partially or fully) as a bitset with byte granularity.
|
||||
*/
|
||||
unsigned flags_written(const intel_device_info *devinfo) const;
|
||||
|
||||
/**
|
||||
* Return true if this instruction is a sampler message gathering residency
|
||||
* data.
|
||||
*/
|
||||
bool has_sampler_residency() const;
|
||||
|
||||
fs_reg dst;
|
||||
fs_reg *src;
|
||||
|
||||
uint8_t sources; /**< Number of fs_reg sources. */
|
||||
|
||||
bool last_rt:1;
|
||||
bool pi_noperspective:1; /**< Pixel interpolator noperspective flag */
|
||||
bool keep_payload_trailing_zeros;
|
||||
|
||||
tgl_swsb sched; /**< Scheduling info. */
|
||||
};
|
||||
|
||||
/**
|
||||
* Make the execution of \p inst dependent on the evaluation of a possibly
|
||||
* inverted predicate.
|
||||
*/
|
||||
static inline fs_inst *
|
||||
set_predicate_inv(enum brw_predicate pred, bool inverse,
|
||||
fs_inst *inst)
|
||||
{
|
||||
inst->predicate = pred;
|
||||
inst->predicate_inverse = inverse;
|
||||
return inst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Make the execution of \p inst dependent on the evaluation of a predicate.
|
||||
*/
|
||||
static inline fs_inst *
|
||||
set_predicate(enum brw_predicate pred, fs_inst *inst)
|
||||
{
|
||||
return set_predicate_inv(pred, false, inst);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the result of evaluating the condition given by \p mod to a flag
|
||||
* register.
|
||||
*/
|
||||
static inline fs_inst *
|
||||
set_condmod(enum brw_conditional_mod mod, fs_inst *inst)
|
||||
{
|
||||
inst->conditional_mod = mod;
|
||||
return inst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clamp the result of \p inst to the saturation range of its destination
|
||||
* datatype.
|
||||
*/
|
||||
static inline fs_inst *
|
||||
set_saturate(bool saturate, fs_inst *inst)
|
||||
{
|
||||
inst->saturate = saturate;
|
||||
return inst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of dataflow registers written by the instruction (either
|
||||
* fully or partially) counted from 'floor(reg_offset(inst->dst) /
|
||||
* register_size)'. The somewhat arbitrary register size unit is 4B for the
|
||||
* UNIFORM and IMM files and 32B for all other files.
|
||||
*/
|
||||
inline unsigned
|
||||
regs_written(const fs_inst *inst)
|
||||
{
|
||||
assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
|
||||
return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE +
|
||||
inst->size_written -
|
||||
MIN2(inst->size_written, reg_padding(inst->dst)),
|
||||
REG_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of dataflow registers read by the instruction (either
|
||||
* fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
|
||||
* register_size)'. The somewhat arbitrary register size unit is 4B for the
|
||||
* UNIFORM files and 32B for all other files.
|
||||
*/
|
||||
inline unsigned
|
||||
regs_read(const fs_inst *inst, unsigned i)
|
||||
{
|
||||
if (inst->src[i].file == IMM)
|
||||
return 1;
|
||||
|
||||
const unsigned reg_size = inst->src[i].file == UNIFORM ? 4 : REG_SIZE;
|
||||
return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size +
|
||||
inst->size_read(i) -
|
||||
MIN2(inst->size_read(i), reg_padding(inst->src[i])),
|
||||
reg_size);
|
||||
}
|
||||
|
||||
static inline enum brw_reg_type
|
||||
get_exec_type(const fs_inst *inst)
|
||||
{
|
||||
brw_reg_type exec_type = BRW_REGISTER_TYPE_B;
|
||||
|
||||
for (int i = 0; i < inst->sources; i++) {
|
||||
if (inst->src[i].file != BAD_FILE &&
|
||||
!inst->is_control_source(i)) {
|
||||
const brw_reg_type t = get_exec_type(inst->src[i].type);
|
||||
if (type_sz(t) > type_sz(exec_type))
|
||||
exec_type = t;
|
||||
else if (type_sz(t) == type_sz(exec_type) &&
|
||||
brw_reg_type_is_floating_point(t))
|
||||
exec_type = t;
|
||||
}
|
||||
}
|
||||
|
||||
if (exec_type == BRW_REGISTER_TYPE_B)
|
||||
exec_type = inst->dst.type;
|
||||
|
||||
assert(exec_type != BRW_REGISTER_TYPE_B);
|
||||
|
||||
/* Promotion of the execution type to 32-bit for conversions from or to
|
||||
* half-float seems to be consistent with the following text from the
|
||||
* Cherryview PRM Vol. 7, "Execution Data Type":
|
||||
*
|
||||
* "When single precision and half precision floats are mixed between
|
||||
* source operands or between source and destination operand [..] single
|
||||
* precision float is the execution datatype."
|
||||
*
|
||||
* and from "Register Region Restrictions":
|
||||
*
|
||||
* "Conversion between Integer and HF (Half Float) must be DWord aligned
|
||||
* and strided by a DWord on the destination."
|
||||
*/
|
||||
if (type_sz(exec_type) == 2 &&
|
||||
inst->dst.type != exec_type) {
|
||||
if (exec_type == BRW_REGISTER_TYPE_HF)
|
||||
exec_type = BRW_REGISTER_TYPE_F;
|
||||
else if (inst->dst.type == BRW_REGISTER_TYPE_HF)
|
||||
exec_type = BRW_REGISTER_TYPE_D;
|
||||
}
|
||||
|
||||
return exec_type;
|
||||
}
|
||||
|
||||
static inline unsigned
|
||||
get_exec_type_size(const fs_inst *inst)
|
||||
{
|
||||
return type_sz(get_exec_type(inst));
|
||||
}
|
||||
|
||||
static inline bool
|
||||
is_send(const fs_inst *inst)
|
||||
{
|
||||
return inst->mlen || inst->is_send_from_grf();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether the instruction isn't an ALU instruction and cannot be
|
||||
* assumed to complete in-order.
|
||||
*/
|
||||
static inline bool
|
||||
is_unordered(const intel_device_info *devinfo, const fs_inst *inst)
|
||||
{
|
||||
return is_send(inst) || (devinfo->ver < 20 && inst->is_math()) ||
|
||||
inst->opcode == BRW_OPCODE_DPAS ||
|
||||
(devinfo->has_64bit_float_via_math_pipe &&
|
||||
(get_exec_type(inst) == BRW_REGISTER_TYPE_DF ||
|
||||
inst->dst.type == BRW_REGISTER_TYPE_DF));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether the following regioning restriction applies to the specified
|
||||
* instruction. From the Cherryview PRM Vol 7. "Register Region
|
||||
* Restrictions":
|
||||
*
|
||||
* "When source or destination datatype is 64b or operation is integer DWord
|
||||
* multiply, regioning in Align1 must follow these rules:
|
||||
*
|
||||
* 1. Source and Destination horizontal stride must be aligned to the same qword.
|
||||
* 2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
|
||||
* 3. Source and Destination offset must be the same, except the case of
|
||||
* scalar source."
|
||||
*/
|
||||
static inline bool
|
||||
has_dst_aligned_region_restriction(const intel_device_info *devinfo,
|
||||
const fs_inst *inst,
|
||||
brw_reg_type dst_type)
|
||||
{
|
||||
const brw_reg_type exec_type = get_exec_type(inst);
|
||||
/* Even though the hardware spec claims that "integer DWord multiply"
|
||||
* operations are restricted, empirical evidence and the behavior of the
|
||||
* simulator suggest that only 32x32-bit integer multiplication is
|
||||
* restricted.
|
||||
*/
|
||||
const bool is_dword_multiply = !brw_reg_type_is_floating_point(exec_type) &&
|
||||
((inst->opcode == BRW_OPCODE_MUL &&
|
||||
MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||
|
||||
(inst->opcode == BRW_OPCODE_MAD &&
|
||||
MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));
|
||||
|
||||
if (type_sz(dst_type) > 4 || type_sz(exec_type) > 4 ||
|
||||
(type_sz(exec_type) == 4 && is_dword_multiply))
|
||||
return devinfo->platform == INTEL_PLATFORM_CHV ||
|
||||
intel_device_info_is_9lp(devinfo) ||
|
||||
devinfo->verx10 >= 125;
|
||||
|
||||
else if (brw_reg_type_is_floating_point(dst_type))
|
||||
return devinfo->verx10 >= 125;
|
||||
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
has_dst_aligned_region_restriction(const intel_device_info *devinfo,
|
||||
const fs_inst *inst)
|
||||
{
|
||||
return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from
|
||||
* the specified register file into a VGRF.
|
||||
*
|
||||
* This implies identity register regions without any source-destination
|
||||
* overlap, but otherwise has no implications on the location of sources and
|
||||
* destination in the register file: Gathering any number of portions from
|
||||
* multiple virtual registers in any order is allowed.
|
||||
*/
|
||||
inline bool
|
||||
is_copy_payload(brw_reg_file file, const fs_inst *inst)
|
||||
{
|
||||
if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD ||
|
||||
inst->is_partial_write() || inst->saturate ||
|
||||
inst->dst.file != VGRF)
|
||||
return false;
|
||||
|
||||
for (unsigned i = 0; i < inst->sources; i++) {
|
||||
if (inst->src[i].file != file ||
|
||||
inst->src[i].abs || inst->src[i].negate)
|
||||
return false;
|
||||
|
||||
if (!inst->src[i].is_contiguous())
|
||||
return false;
|
||||
|
||||
if (regions_overlap(inst->dst, inst->size_written,
|
||||
inst->src[i], inst->size_read(i)))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Like is_copy_payload(), but the instruction is required to copy a single
|
||||
* contiguous block of registers from the given register file into the
|
||||
* destination without any reordering.
|
||||
*/
|
||||
inline bool
|
||||
is_identity_payload(brw_reg_file file, const fs_inst *inst) {
|
||||
if (is_copy_payload(file, inst)) {
|
||||
fs_reg reg = inst->src[0];
|
||||
|
||||
for (unsigned i = 0; i < inst->sources; i++) {
|
||||
reg.type = inst->src[i].type;
|
||||
if (!inst->src[i].equals(reg))
|
||||
return false;
|
||||
|
||||
reg = byte_offset(reg, inst->size_read(i));
|
||||
}
|
||||
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Like is_copy_payload(), but the instruction is required to source data from
|
||||
* at least two disjoint VGRFs.
|
||||
*
|
||||
* This doesn't necessarily rule out the elimination of this instruction
|
||||
* through register coalescing, but due to limitations of the register
|
||||
* coalesce pass it might be impossible to do so directly until a later stage,
|
||||
* when the LOAD_PAYLOAD instruction is unrolled into a sequence of MOV
|
||||
* instructions.
|
||||
*/
|
||||
inline bool
|
||||
is_multi_copy_payload(const fs_inst *inst) {
|
||||
if (is_copy_payload(VGRF, inst)) {
|
||||
for (unsigned i = 0; i < inst->sources; i++) {
|
||||
if (inst->src[i].nr != inst->src[0].nr)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Like is_identity_payload(), but the instruction is required to copy the
|
||||
* whole contents of a single VGRF into the destination.
|
||||
*
|
||||
* This means that there is a good chance that the instruction will be
|
||||
* eliminated through register coalescing, but it's neither a necessary nor a
|
||||
* sufficient condition for that to happen -- E.g. consider the case where
|
||||
* source and destination registers diverge due to other instructions in the
|
||||
* program overwriting part of their contents, which isn't something we can
|
||||
* predict up front based on a cheap strictly local test of the copy
|
||||
* instruction.
|
||||
*/
|
||||
inline bool
|
||||
is_coalescing_payload(const brw::simple_allocator &alloc, const fs_inst *inst)
|
||||
{
|
||||
return is_identity_payload(VGRF, inst) &&
|
||||
inst->src[0].offset == 0 &&
|
||||
alloc.sizes[inst->src[0].nr] * REG_SIZE == inst->size_written;
|
||||
}
|
||||
|
||||
bool
|
||||
has_bank_conflict(const struct brw_isa_info *isa, const fs_inst *inst);
|
||||
|
||||
#endif
|
||||
1698
src/intel/compiler/elk/brw_ir_performance.cpp
Normal file
1698
src/intel/compiler/elk/brw_ir_performance.cpp
Normal file
File diff suppressed because it is too large
Load diff
86
src/intel/compiler/elk/brw_ir_performance.h
Normal file
86
src/intel/compiler/elk/brw_ir_performance.h
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
/* -*- c++ -*- */
|
||||
/*
|
||||
* Copyright © 2020 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_IR_PERFORMANCE_H
|
||||
#define BRW_IR_PERFORMANCE_H
|
||||
|
||||
class fs_visitor;
|
||||
|
||||
namespace brw {
|
||||
class vec4_visitor;
|
||||
|
||||
/**
|
||||
* Various estimates of the performance of a shader based on static
|
||||
* analysis.
|
||||
*/
|
||||
struct performance {
|
||||
performance(const fs_visitor *v);
|
||||
performance(const vec4_visitor *v);
|
||||
~performance();
|
||||
|
||||
analysis_dependency_class
|
||||
dependency_class() const
|
||||
{
|
||||
return (DEPENDENCY_INSTRUCTIONS |
|
||||
DEPENDENCY_BLOCKS);
|
||||
}
|
||||
|
||||
bool
|
||||
validate(const backend_shader *) const
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Array containing estimates of the runtime of each basic block of the
|
||||
* program in cycle units.
|
||||
*/
|
||||
unsigned *block_latency;
|
||||
|
||||
/**
|
||||
* Estimate of the runtime of the whole program in cycle units assuming
|
||||
* uncontended execution.
|
||||
*/
|
||||
unsigned latency;
|
||||
|
||||
/**
|
||||
* Estimate of the throughput of the whole program in
|
||||
* invocations-per-cycle units.
|
||||
*
|
||||
* Note that this might be lower than the ratio between the dispatch
|
||||
* width of the program and its latency estimate in cases where
|
||||
* performance doesn't scale without limits as a function of its thread
|
||||
* parallelism, e.g. due to the existence of a bottleneck in a shared
|
||||
* function.
|
||||
*/
|
||||
float throughput;
|
||||
|
||||
private:
|
||||
performance(const performance &perf);
|
||||
performance &
|
||||
operator=(performance u);
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
475
src/intel/compiler/elk/brw_ir_vec4.h
Normal file
475
src/intel/compiler/elk/brw_ir_vec4.h
Normal file
|
|
@ -0,0 +1,475 @@
|
|||
/* -*- c++ -*- */
|
||||
/*
|
||||
* Copyright © 2011-2015 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_IR_VEC4_H
|
||||
#define BRW_IR_VEC4_H
|
||||
|
||||
#include "brw_shader.h"
|
||||
|
||||
namespace brw {
|
||||
|
||||
class dst_reg;
|
||||
|
||||
class src_reg : public backend_reg
|
||||
{
|
||||
public:
|
||||
DECLARE_RALLOC_CXX_OPERATORS(src_reg)
|
||||
|
||||
void init();
|
||||
|
||||
src_reg(enum brw_reg_file file, int nr, const glsl_type *type);
|
||||
src_reg();
|
||||
src_reg(struct ::brw_reg reg);
|
||||
|
||||
bool equals(const src_reg &r) const;
|
||||
bool negative_equals(const src_reg &r) const;
|
||||
|
||||
src_reg(class vec4_visitor *v, const struct glsl_type *type);
|
||||
src_reg(class vec4_visitor *v, const struct glsl_type *type, int size);
|
||||
|
||||
explicit src_reg(const dst_reg ®);
|
||||
|
||||
src_reg *reladdr;
|
||||
};
|
||||
|
||||
/** Return a copy of \p reg reinterpreted with register type \p type. */
static inline src_reg
retype(src_reg reg, enum brw_reg_type type)
{
   reg.type = type;
   return reg;
}
|
||||
|
||||
namespace detail {
|
||||
|
||||
static inline void
|
||||
add_byte_offset(backend_reg *reg, unsigned bytes)
|
||||
{
|
||||
switch (reg->file) {
|
||||
case BAD_FILE:
|
||||
break;
|
||||
case VGRF:
|
||||
case ATTR:
|
||||
case UNIFORM:
|
||||
reg->offset += bytes;
|
||||
assert(reg->offset % 16 == 0);
|
||||
break;
|
||||
case MRF: {
|
||||
const unsigned suboffset = reg->offset + bytes;
|
||||
reg->nr += suboffset / REG_SIZE;
|
||||
reg->offset = suboffset % REG_SIZE;
|
||||
assert(reg->offset % 16 == 0);
|
||||
break;
|
||||
}
|
||||
case ARF:
|
||||
case FIXED_GRF: {
|
||||
const unsigned suboffset = reg->subnr + bytes;
|
||||
reg->nr += suboffset / REG_SIZE;
|
||||
reg->subnr = suboffset % REG_SIZE;
|
||||
assert(reg->subnr % 16 == 0);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
assert(bytes == 0);
|
||||
}
|
||||
}
|
||||
|
||||
} /* namespace detail */
|
||||
|
||||
/** Return \p reg offset by \p bytes. */
static inline src_reg
byte_offset(src_reg reg, unsigned bytes)
{
   detail::add_byte_offset(&reg, bytes);
   return reg;
}

/** Return \p reg offset by \p delta logical components at the given width. */
static inline src_reg
offset(src_reg reg, unsigned width, unsigned delta)
{
   /* Uniforms are implicitly splatted, so they have zero stride. */
   const unsigned stride = (reg.file == UNIFORM ? 0 : 4);
   const unsigned num_components = MAX2(width / 4 * stride, 4);
   return byte_offset(reg, num_components * type_sz(reg.type) * delta);
}

/** Return \p reg offset by \p delta scalar channels. */
static inline src_reg
horiz_offset(src_reg reg, unsigned delta)
{
   return byte_offset(reg, delta * type_sz(reg.type));
}
|
||||
|
||||
/**
|
||||
* Reswizzle a given source register.
|
||||
* \sa brw_swizzle().
|
||||
*/
|
||||
static inline src_reg
|
||||
swizzle(src_reg reg, unsigned swizzle)
|
||||
{
|
||||
if (reg.file == IMM)
|
||||
reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swizzle);
|
||||
else
|
||||
reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle);
|
||||
|
||||
return reg;
|
||||
}
|
||||
|
||||
static inline src_reg
|
||||
negate(src_reg reg)
|
||||
{
|
||||
assert(reg.file != IMM);
|
||||
reg.negate = !reg.negate;
|
||||
return reg;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
is_uniform(const src_reg ®)
|
||||
{
|
||||
return (reg.file == IMM || reg.file == UNIFORM || reg.is_null()) &&
|
||||
(!reg.reladdr || is_uniform(*reg.reladdr));
|
||||
}
|
||||
|
||||
class dst_reg : public backend_reg
|
||||
{
|
||||
public:
|
||||
DECLARE_RALLOC_CXX_OPERATORS(dst_reg)
|
||||
|
||||
void init();
|
||||
|
||||
dst_reg();
|
||||
dst_reg(enum brw_reg_file file, int nr);
|
||||
dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
|
||||
unsigned writemask);
|
||||
dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
|
||||
unsigned writemask);
|
||||
dst_reg(struct ::brw_reg reg);
|
||||
dst_reg(class vec4_visitor *v, const struct glsl_type *type);
|
||||
|
||||
explicit dst_reg(const src_reg ®);
|
||||
|
||||
bool equals(const dst_reg &r) const;
|
||||
|
||||
src_reg *reladdr;
|
||||
};
|
||||
|
||||
/** Return a copy of \p reg reinterpreted with register type \p type. */
static inline dst_reg
retype(dst_reg reg, enum brw_reg_type type)
{
   reg.type = type;
   return reg;
}

/** Return \p reg offset by \p bytes. */
static inline dst_reg
byte_offset(dst_reg reg, unsigned bytes)
{
   detail::add_byte_offset(&reg, bytes);
   return reg;
}

/** Return \p reg offset by \p delta logical components at the given width. */
static inline dst_reg
offset(dst_reg reg, unsigned width, unsigned delta)
{
   /* Uniforms are implicitly splatted, so they have zero stride. */
   const unsigned stride = (reg.file == UNIFORM ? 0 : 4);
   const unsigned num_components = MAX2(width / 4 * stride, 4);
   return byte_offset(reg, num_components * type_sz(reg.type) * delta);
}
|
||||
|
||||
/** Return \p reg offset by \p delta scalar channels; no-op for uniforms. */
static inline dst_reg
horiz_offset(const dst_reg &reg, unsigned delta)
{
   if (is_uniform(src_reg(reg)))
      return reg;
   else
      return byte_offset(reg, delta * type_sz(reg.type));
}

/** Restrict the writemask of \p reg to \p mask (which must intersect it). */
static inline dst_reg
writemask(dst_reg reg, unsigned mask)
{
   assert(reg.file != IMM);
   assert((reg.writemask & mask) != 0);
   reg.writemask &= mask;
   return reg;
}
|
||||
|
||||
/**
|
||||
* Return an integer identifying the discrete address space a register is
|
||||
* contained in. A register is by definition fully contained in the single
|
||||
* reg_space it belongs to, so two registers with different reg_space ids are
|
||||
* guaranteed not to overlap. Most register files are a single reg_space of
|
||||
* its own, only the VGRF file is composed of multiple discrete address
|
||||
* spaces, one for each VGRF allocation.
|
||||
*/
|
||||
static inline uint32_t
|
||||
reg_space(const backend_reg &r)
|
||||
{
|
||||
return r.file << 16 | (r.file == VGRF ? r.nr : 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the base offset in bytes of a register relative to the start of its
|
||||
* reg_space().
|
||||
*/
|
||||
static inline unsigned
|
||||
reg_offset(const backend_reg &r)
|
||||
{
|
||||
return (r.file == VGRF || r.file == IMM ? 0 : r.nr) *
|
||||
(r.file == UNIFORM ? 16 : REG_SIZE) + r.offset +
|
||||
(r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether the register region starting at \p r and spanning \p dr
|
||||
* bytes could potentially overlap the register region starting at \p s and
|
||||
* spanning \p ds bytes.
|
||||
*/
|
||||
static inline bool
|
||||
regions_overlap(const backend_reg &r, unsigned dr,
|
||||
const backend_reg &s, unsigned ds)
|
||||
{
|
||||
if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) {
|
||||
/* COMPR4 regions are translated by the hardware during decompression
|
||||
* into two separate half-regions 4 MRFs apart from each other.
|
||||
*/
|
||||
backend_reg t0 = r;
|
||||
t0.nr &= ~BRW_MRF_COMPR4;
|
||||
backend_reg t1 = t0;
|
||||
t1.offset += 4 * REG_SIZE;
|
||||
return regions_overlap(t0, dr / 2, s, ds) ||
|
||||
regions_overlap(t1, dr / 2, s, ds);
|
||||
|
||||
} else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) {
|
||||
return regions_overlap(s, ds, r, dr);
|
||||
|
||||
} else {
|
||||
return reg_space(r) == reg_space(s) &&
|
||||
!(reg_offset(r) + dr <= reg_offset(s) ||
|
||||
reg_offset(s) + ds <= reg_offset(r));
|
||||
}
|
||||
}
|
||||
|
||||
class vec4_instruction : public backend_instruction {
|
||||
public:
|
||||
DECLARE_RALLOC_CXX_OPERATORS(vec4_instruction)
|
||||
|
||||
vec4_instruction(enum opcode opcode,
|
||||
const dst_reg &dst = dst_reg(),
|
||||
const src_reg &src0 = src_reg(),
|
||||
const src_reg &src1 = src_reg(),
|
||||
const src_reg &src2 = src_reg());
|
||||
|
||||
dst_reg dst;
|
||||
src_reg src[3];
|
||||
|
||||
enum brw_urb_write_flags urb_write_flags;
|
||||
|
||||
unsigned sol_binding; /**< gfx6: SOL binding table index */
|
||||
bool sol_final_write; /**< gfx6: send commit message */
|
||||
unsigned sol_vertex; /**< gfx6: used for setting dst index in SVB header */
|
||||
|
||||
bool is_send_from_grf() const;
|
||||
unsigned size_read(unsigned arg) const;
|
||||
bool can_reswizzle(const struct intel_device_info *devinfo,
|
||||
int dst_writemask,
|
||||
int swizzle, int swizzle_mask);
|
||||
void reswizzle(int dst_writemask, int swizzle);
|
||||
bool can_do_source_mods(const struct intel_device_info *devinfo);
|
||||
bool can_do_cmod();
|
||||
bool can_do_writemask(const struct intel_device_info *devinfo);
|
||||
bool can_change_types() const;
|
||||
bool has_source_and_destination_hazard() const;
|
||||
unsigned implied_mrf_writes() const;
|
||||
|
||||
bool is_align1_partial_write()
|
||||
{
|
||||
return opcode == VEC4_OPCODE_SET_LOW_32BIT ||
|
||||
opcode == VEC4_OPCODE_SET_HIGH_32BIT;
|
||||
}
|
||||
|
||||
bool reads_flag() const
|
||||
{
|
||||
return predicate || opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2;
|
||||
}
|
||||
|
||||
bool reads_flag(unsigned c)
|
||||
{
|
||||
if (opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2)
|
||||
return true;
|
||||
|
||||
switch (predicate) {
|
||||
case BRW_PREDICATE_NONE:
|
||||
return false;
|
||||
case BRW_PREDICATE_ALIGN16_REPLICATE_X:
|
||||
return c == 0;
|
||||
case BRW_PREDICATE_ALIGN16_REPLICATE_Y:
|
||||
return c == 1;
|
||||
case BRW_PREDICATE_ALIGN16_REPLICATE_Z:
|
||||
return c == 2;
|
||||
case BRW_PREDICATE_ALIGN16_REPLICATE_W:
|
||||
return c == 3;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
bool writes_flag(const intel_device_info *devinfo) const
|
||||
{
|
||||
return (conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) &&
|
||||
opcode != BRW_OPCODE_CSEL &&
|
||||
opcode != BRW_OPCODE_IF &&
|
||||
opcode != BRW_OPCODE_WHILE));
|
||||
}
|
||||
|
||||
bool reads_g0_implicitly() const
|
||||
{
|
||||
switch (opcode) {
|
||||
case SHADER_OPCODE_TEX:
|
||||
case SHADER_OPCODE_TXL:
|
||||
case SHADER_OPCODE_TXD:
|
||||
case SHADER_OPCODE_TXF:
|
||||
case SHADER_OPCODE_TXF_CMS_W:
|
||||
case SHADER_OPCODE_TXF_CMS:
|
||||
case SHADER_OPCODE_TXF_MCS:
|
||||
case SHADER_OPCODE_TXS:
|
||||
case SHADER_OPCODE_TG4:
|
||||
case SHADER_OPCODE_TG4_OFFSET:
|
||||
case SHADER_OPCODE_SAMPLEINFO:
|
||||
case VS_OPCODE_PULL_CONSTANT_LOAD:
|
||||
case GS_OPCODE_SET_PRIMITIVE_ID:
|
||||
case GS_OPCODE_GET_INSTANCE_ID:
|
||||
case SHADER_OPCODE_GFX4_SCRATCH_READ:
|
||||
case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Make the execution of \p inst dependent on the evaluation of a possibly
|
||||
* inverted predicate.
|
||||
*/
|
||||
inline vec4_instruction *
|
||||
set_predicate_inv(enum brw_predicate pred, bool inverse,
|
||||
vec4_instruction *inst)
|
||||
{
|
||||
inst->predicate = pred;
|
||||
inst->predicate_inverse = inverse;
|
||||
return inst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Make the execution of \p inst dependent on the evaluation of a predicate.
|
||||
*/
|
||||
inline vec4_instruction *
|
||||
set_predicate(enum brw_predicate pred, vec4_instruction *inst)
|
||||
{
|
||||
return set_predicate_inv(pred, false, inst);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the result of evaluating the condition given by \p mod to a flag
|
||||
* register.
|
||||
*/
|
||||
inline vec4_instruction *
|
||||
set_condmod(enum brw_conditional_mod mod, vec4_instruction *inst)
|
||||
{
|
||||
inst->conditional_mod = mod;
|
||||
return inst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clamp the result of \p inst to the saturation range of its destination
|
||||
* datatype.
|
||||
*/
|
||||
inline vec4_instruction *
|
||||
set_saturate(bool saturate, vec4_instruction *inst)
|
||||
{
|
||||
inst->saturate = saturate;
|
||||
return inst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of dataflow registers written by the instruction (either
|
||||
* fully or partially) counted from 'floor(reg_offset(inst->dst) /
|
||||
* register_size)'. The somewhat arbitrary register size unit is 16B for the
|
||||
* UNIFORM and IMM files and 32B for all other files.
|
||||
*/
|
||||
inline unsigned
|
||||
regs_written(const vec4_instruction *inst)
|
||||
{
|
||||
assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
|
||||
return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE + inst->size_written,
|
||||
REG_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of dataflow registers read by the instruction (either
|
||||
* fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
|
||||
* register_size)'. The somewhat arbitrary register size unit is 16B for the
|
||||
* UNIFORM and IMM files and 32B for all other files.
|
||||
*/
|
||||
inline unsigned
|
||||
regs_read(const vec4_instruction *inst, unsigned i)
|
||||
{
|
||||
const unsigned reg_size =
|
||||
inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 16 : REG_SIZE;
|
||||
return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + inst->size_read(i),
|
||||
reg_size);
|
||||
}
|
||||
|
||||
static inline enum brw_reg_type
|
||||
get_exec_type(const vec4_instruction *inst)
|
||||
{
|
||||
enum brw_reg_type exec_type = BRW_REGISTER_TYPE_B;
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file != BAD_FILE) {
|
||||
const brw_reg_type t = get_exec_type(brw_reg_type(inst->src[i].type));
|
||||
if (type_sz(t) > type_sz(exec_type))
|
||||
exec_type = t;
|
||||
else if (type_sz(t) == type_sz(exec_type) &&
|
||||
brw_reg_type_is_floating_point(t))
|
||||
exec_type = t;
|
||||
}
|
||||
}
|
||||
|
||||
if (exec_type == BRW_REGISTER_TYPE_B)
|
||||
exec_type = inst->dst.type;
|
||||
|
||||
/* TODO: We need to handle half-float conversions. */
|
||||
assert(exec_type != BRW_REGISTER_TYPE_HF ||
|
||||
inst->dst.type == BRW_REGISTER_TYPE_HF);
|
||||
assert(exec_type != BRW_REGISTER_TYPE_B);
|
||||
|
||||
return exec_type;
|
||||
}
|
||||
|
||||
static inline unsigned
|
||||
get_exec_type_size(const vec4_instruction *inst)
|
||||
{
|
||||
return type_sz(get_exec_type(inst));
|
||||
}
|
||||
|
||||
} /* namespace brw */
|
||||
|
||||
#endif
|
||||
86
src/intel/compiler/elk/brw_isa_info.h
Normal file
86
src/intel/compiler/elk/brw_isa_info.h
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
/*
|
||||
* Copyright © 2022 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* on the rights to use, copy, modify, merge, publish, distribute, sub
|
||||
* license, and/or sell copies of the Software, and to permit persons to whom
|
||||
* the Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
|
||||
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
||||
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
|
||||
* USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
#ifndef BRW_ISA_ENCODING_H
|
||||
#define BRW_ISA_ENCODING_H
|
||||
|
||||
#include "dev/intel_device_info.h"
|
||||
#include "brw_eu_defines.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct opcode_desc;
|
||||
|
||||
struct brw_isa_info {
|
||||
const struct intel_device_info *devinfo;
|
||||
|
||||
/* A mapping from enum opcode to the corresponding opcode_desc */
|
||||
const struct opcode_desc *ir_to_descs[NUM_BRW_OPCODES];
|
||||
|
||||
/** A mapping from a HW opcode encoding to the corresponding opcode_desc */
|
||||
const struct opcode_desc *hw_to_descs[128];
|
||||
};
|
||||
|
||||
void brw_init_isa_info(struct brw_isa_info *isa,
|
||||
const struct intel_device_info *devinfo);
|
||||
|
||||
struct opcode_desc {
|
||||
unsigned ir;
|
||||
unsigned hw;
|
||||
const char *name;
|
||||
int nsrc;
|
||||
int ndst;
|
||||
int gfx_vers;
|
||||
};
|
||||
|
||||
const struct opcode_desc *
|
||||
brw_opcode_desc(const struct brw_isa_info *isa, enum opcode opcode);
|
||||
|
||||
const struct opcode_desc *
|
||||
brw_opcode_desc_from_hw(const struct brw_isa_info *isa, unsigned hw);
|
||||
|
||||
static inline unsigned
|
||||
brw_opcode_encode(const struct brw_isa_info *isa, enum opcode opcode)
|
||||
{
|
||||
return brw_opcode_desc(isa, opcode)->hw;
|
||||
}
|
||||
|
||||
static inline enum opcode
|
||||
brw_opcode_decode(const struct brw_isa_info *isa, unsigned hw)
|
||||
{
|
||||
const struct opcode_desc *desc = brw_opcode_desc_from_hw(isa, hw);
|
||||
return desc ? (enum opcode)desc->ir : BRW_OPCODE_ILLEGAL;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
is_3src(const struct brw_isa_info *isa, enum opcode opcode)
|
||||
{
|
||||
const struct opcode_desc *desc = brw_opcode_desc(isa, opcode);
|
||||
return desc && desc->nsrc == 3;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
790
src/intel/compiler/elk/brw_kernel.c
Normal file
790
src/intel/compiler/elk/brw_kernel.c
Normal file
|
|
@ -0,0 +1,790 @@
|
|||
/*
|
||||
* Copyright © 2020 Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_kernel.h"
|
||||
#include "brw_nir.h"
|
||||
#include "intel_nir.h"
|
||||
|
||||
#include "intel_nir.h"
|
||||
#include "nir_clc_helpers.h"
|
||||
#include "compiler/nir/nir_builder.h"
|
||||
#include "compiler/spirv/nir_spirv.h"
|
||||
#include "dev/intel_debug.h"
|
||||
#include "util/u_atomic.h"
|
||||
#include "util/u_dynarray.h"
|
||||
|
||||
static const nir_shader *
|
||||
load_clc_shader(struct brw_compiler *compiler, struct disk_cache *disk_cache,
|
||||
const nir_shader_compiler_options *nir_options,
|
||||
const struct spirv_to_nir_options *spirv_options)
|
||||
{
|
||||
if (compiler->clc_shader)
|
||||
return compiler->clc_shader;
|
||||
|
||||
nir_shader *nir = nir_load_libclc_shader(64, disk_cache,
|
||||
spirv_options, nir_options,
|
||||
disk_cache != NULL);
|
||||
if (nir == NULL)
|
||||
return NULL;
|
||||
|
||||
const nir_shader *old_nir =
|
||||
p_atomic_cmpxchg(&compiler->clc_shader, NULL, nir);
|
||||
if (old_nir == NULL) {
|
||||
/* We won the race */
|
||||
ralloc_steal(compiler, nir);
|
||||
return nir;
|
||||
} else {
|
||||
/* Someone else built the shader first */
|
||||
ralloc_free(nir);
|
||||
return old_nir;
|
||||
}
|
||||
}
|
||||
|
||||
static nir_builder
|
||||
builder_init_new_impl(nir_function *func)
|
||||
{
|
||||
nir_function_impl *impl = nir_function_impl_create(func);
|
||||
return nir_builder_at(nir_before_impl(impl));
|
||||
}
|
||||
|
||||
static void
|
||||
implement_atomic_builtin(nir_function *func, nir_atomic_op atomic_op,
|
||||
enum glsl_base_type data_base_type,
|
||||
nir_variable_mode mode)
|
||||
{
|
||||
nir_builder b = builder_init_new_impl(func);
|
||||
const struct glsl_type *data_type = glsl_scalar_type(data_base_type);
|
||||
|
||||
unsigned p = 0;
|
||||
|
||||
nir_deref_instr *ret = NULL;
|
||||
ret = nir_build_deref_cast(&b, nir_load_param(&b, p++),
|
||||
nir_var_function_temp, data_type, 0);
|
||||
|
||||
nir_intrinsic_op op = nir_intrinsic_deref_atomic;
|
||||
nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b.shader, op);
|
||||
nir_intrinsic_set_atomic_op(atomic, atomic_op);
|
||||
|
||||
for (unsigned i = 0; i < nir_intrinsic_infos[op].num_srcs; i++) {
|
||||
nir_def *src = nir_load_param(&b, p++);
|
||||
if (i == 0) {
|
||||
/* The first source is our deref */
|
||||
assert(nir_intrinsic_infos[op].src_components[i] == -1);
|
||||
src = &nir_build_deref_cast(&b, src, mode, data_type, 0)->def;
|
||||
}
|
||||
atomic->src[i] = nir_src_for_ssa(src);
|
||||
}
|
||||
|
||||
nir_def_init_for_type(&atomic->instr, &atomic->def, data_type);
|
||||
|
||||
nir_builder_instr_insert(&b, &atomic->instr);
|
||||
nir_store_deref(&b, ret, &atomic->def, ~0);
|
||||
}
|
||||
|
||||
static void
|
||||
implement_sub_group_ballot_builtin(nir_function *func)
|
||||
{
|
||||
nir_builder b = builder_init_new_impl(func);
|
||||
nir_deref_instr *ret =
|
||||
nir_build_deref_cast(&b, nir_load_param(&b, 0),
|
||||
nir_var_function_temp, glsl_uint_type(), 0);
|
||||
nir_def *cond = nir_load_param(&b, 1);
|
||||
|
||||
nir_intrinsic_instr *ballot =
|
||||
nir_intrinsic_instr_create(b.shader, nir_intrinsic_ballot);
|
||||
ballot->src[0] = nir_src_for_ssa(cond);
|
||||
ballot->num_components = 1;
|
||||
nir_def_init(&ballot->instr, &ballot->def, 1, 32);
|
||||
nir_builder_instr_insert(&b, &ballot->instr);
|
||||
|
||||
nir_store_deref(&b, ret, &ballot->def, ~0);
|
||||
}
|
||||
|
||||
static bool
|
||||
implement_intel_builtins(nir_shader *nir)
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
nir_foreach_function(func, nir) {
|
||||
if (strcmp(func->name, "_Z10atomic_minPU3AS1Vff") == 0) {
|
||||
/* float atom_min(__global float volatile *p, float val) */
|
||||
implement_atomic_builtin(func, nir_atomic_op_fmin,
|
||||
GLSL_TYPE_FLOAT, nir_var_mem_global);
|
||||
progress = true;
|
||||
} else if (strcmp(func->name, "_Z10atomic_maxPU3AS1Vff") == 0) {
|
||||
/* float atom_max(__global float volatile *p, float val) */
|
||||
implement_atomic_builtin(func, nir_atomic_op_fmax,
|
||||
GLSL_TYPE_FLOAT, nir_var_mem_global);
|
||||
progress = true;
|
||||
} else if (strcmp(func->name, "_Z10atomic_minPU3AS3Vff") == 0) {
|
||||
/* float atomic_min(__shared float volatile *, float) */
|
||||
implement_atomic_builtin(func, nir_atomic_op_fmin,
|
||||
GLSL_TYPE_FLOAT, nir_var_mem_shared);
|
||||
progress = true;
|
||||
} else if (strcmp(func->name, "_Z10atomic_maxPU3AS3Vff") == 0) {
|
||||
/* float atomic_max(__shared float volatile *, float) */
|
||||
implement_atomic_builtin(func, nir_atomic_op_fmax,
|
||||
GLSL_TYPE_FLOAT, nir_var_mem_shared);
|
||||
progress = true;
|
||||
} else if (strcmp(func->name, "intel_sub_group_ballot") == 0) {
|
||||
implement_sub_group_ballot_builtin(func);
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
|
||||
nir_shader_preserve_all_metadata(nir);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_kernel_intrinsics(nir_shader *nir)
|
||||
{
|
||||
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
|
||||
|
||||
bool progress = false;
|
||||
|
||||
unsigned kernel_sysvals_start = 0;
|
||||
unsigned kernel_arg_start = sizeof(struct brw_kernel_sysvals);
|
||||
nir->num_uniforms += kernel_arg_start;
|
||||
|
||||
nir_builder b = nir_builder_create(impl);
|
||||
|
||||
nir_foreach_block(block, impl) {
|
||||
nir_foreach_instr_safe(instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_load_kernel_input: {
|
||||
b.cursor = nir_instr_remove(&intrin->instr);
|
||||
|
||||
nir_intrinsic_instr *load =
|
||||
nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform);
|
||||
load->num_components = intrin->num_components;
|
||||
load->src[0] = nir_src_for_ssa(nir_u2u32(&b, intrin->src[0].ssa));
|
||||
nir_intrinsic_set_base(load, kernel_arg_start);
|
||||
nir_intrinsic_set_range(load, nir->num_uniforms);
|
||||
nir_def_init(&load->instr, &load->def,
|
||||
intrin->def.num_components,
|
||||
intrin->def.bit_size);
|
||||
nir_builder_instr_insert(&b, &load->instr);
|
||||
|
||||
nir_def_rewrite_uses(&intrin->def, &load->def);
|
||||
progress = true;
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_constant_base_ptr: {
|
||||
b.cursor = nir_instr_remove(&intrin->instr);
|
||||
nir_def *const_data_base_addr = nir_pack_64_2x32_split(&b,
|
||||
nir_load_reloc_const_intel(&b, BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW),
|
||||
nir_load_reloc_const_intel(&b, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH));
|
||||
nir_def_rewrite_uses(&intrin->def, const_data_base_addr);
|
||||
progress = true;
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_num_workgroups: {
|
||||
b.cursor = nir_instr_remove(&intrin->instr);
|
||||
|
||||
nir_intrinsic_instr *load =
|
||||
nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform);
|
||||
load->num_components = 3;
|
||||
load->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
|
||||
nir_intrinsic_set_base(load, kernel_sysvals_start +
|
||||
offsetof(struct brw_kernel_sysvals, num_work_groups));
|
||||
nir_intrinsic_set_range(load, 3 * 4);
|
||||
nir_def_init(&load->instr, &load->def, 3, 32);
|
||||
nir_builder_instr_insert(&b, &load->instr);
|
||||
nir_def_rewrite_uses(&intrin->def, &load->def);
|
||||
progress = true;
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (progress) {
|
||||
nir_metadata_preserve(impl, nir_metadata_block_index |
|
||||
nir_metadata_dominance);
|
||||
} else {
|
||||
nir_metadata_preserve(impl, nir_metadata_all);
|
||||
}
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool
|
||||
brw_kernel_from_spirv(struct brw_compiler *compiler,
|
||||
struct disk_cache *disk_cache,
|
||||
struct brw_kernel *kernel,
|
||||
void *log_data, void *mem_ctx,
|
||||
const uint32_t *spirv, size_t spirv_size,
|
||||
const char *entrypoint_name,
|
||||
char **error_str)
|
||||
{
|
||||
const struct intel_device_info *devinfo = compiler->devinfo;
|
||||
const nir_shader_compiler_options *nir_options =
|
||||
compiler->nir_options[MESA_SHADER_KERNEL];
|
||||
|
||||
struct spirv_to_nir_options spirv_options = {
|
||||
.environment = NIR_SPIRV_OPENCL,
|
||||
.caps = {
|
||||
.address = true,
|
||||
.float16 = devinfo->ver >= 8,
|
||||
.float64 = devinfo->ver >= 8,
|
||||
.groups = true,
|
||||
.image_write_without_format = true,
|
||||
.int8 = devinfo->ver >= 8,
|
||||
.int16 = devinfo->ver >= 8,
|
||||
.int64 = devinfo->ver >= 8,
|
||||
.int64_atomics = devinfo->ver >= 9,
|
||||
.kernel = true,
|
||||
.linkage = true, /* We receive linked kernel from clc */
|
||||
.float_controls = devinfo->ver >= 8,
|
||||
.generic_pointers = true,
|
||||
.storage_8bit = devinfo->ver >= 8,
|
||||
.storage_16bit = devinfo->ver >= 8,
|
||||
.subgroup_arithmetic = true,
|
||||
.subgroup_basic = true,
|
||||
.subgroup_ballot = true,
|
||||
.subgroup_dispatch = true,
|
||||
.subgroup_quad = true,
|
||||
.subgroup_shuffle = true,
|
||||
.subgroup_vote = true,
|
||||
|
||||
.intel_subgroup_shuffle = true,
|
||||
.intel_subgroup_buffer_block_io = true,
|
||||
},
|
||||
.shared_addr_format = nir_address_format_62bit_generic,
|
||||
.global_addr_format = nir_address_format_62bit_generic,
|
||||
.temp_addr_format = nir_address_format_62bit_generic,
|
||||
.constant_addr_format = nir_address_format_64bit_global,
|
||||
};
|
||||
|
||||
spirv_options.clc_shader = load_clc_shader(compiler, disk_cache,
|
||||
nir_options, &spirv_options);
|
||||
if (spirv_options.clc_shader == NULL) {
|
||||
fprintf(stderr, "ERROR: libclc shader missing."
|
||||
" Consider installing the libclc package\n");
|
||||
abort();
|
||||
}
|
||||
|
||||
assert(spirv_size % 4 == 0);
|
||||
nir_shader *nir =
|
||||
spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
|
||||
entrypoint_name, &spirv_options, nir_options);
|
||||
nir_validate_shader(nir, "after spirv_to_nir");
|
||||
nir_validate_ssa_dominance(nir, "after spirv_to_nir");
|
||||
ralloc_steal(mem_ctx, nir);
|
||||
nir->info.name = ralloc_strdup(nir, entrypoint_name);
|
||||
|
||||
if (INTEL_DEBUG(DEBUG_CS)) {
|
||||
/* Re-index SSA defs so we print more sensible numbers. */
|
||||
nir_foreach_function_impl(impl, nir) {
|
||||
nir_index_ssa_defs(impl);
|
||||
}
|
||||
|
||||
fprintf(stderr, "NIR (from SPIR-V) for kernel\n");
|
||||
nir_print_shader(nir, stderr);
|
||||
}
|
||||
|
||||
NIR_PASS_V(nir, implement_intel_builtins);
|
||||
NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader);
|
||||
|
||||
/* We have to lower away local constant initializers right before we
|
||||
* inline functions. That way they get properly initialized at the top
|
||||
* of the function and not at the top of its caller.
|
||||
*/
|
||||
NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
|
||||
NIR_PASS_V(nir, nir_lower_returns);
|
||||
NIR_PASS_V(nir, nir_inline_functions);
|
||||
NIR_PASS_V(nir, nir_copy_prop);
|
||||
NIR_PASS_V(nir, nir_opt_deref);
|
||||
|
||||
/* Pick off the single entrypoint that we want */
|
||||
nir_remove_non_entrypoints(nir);
|
||||
|
||||
/* Now that we've deleted all but the main function, we can go ahead and
|
||||
* lower the rest of the constant initializers. We do this here so that
|
||||
* nir_remove_dead_variables and split_per_member_structs below see the
|
||||
* corresponding stores.
|
||||
*/
|
||||
NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);
|
||||
|
||||
/* LLVM loves take advantage of the fact that vec3s in OpenCL are 16B
|
||||
* aligned and so it can just read/write them as vec4s. This results in a
|
||||
* LOT of vec4->vec3 casts on loads and stores. One solution to this
|
||||
* problem is to get rid of all vec3 variables.
|
||||
*/
|
||||
NIR_PASS_V(nir, nir_lower_vec3_to_vec4,
|
||||
nir_var_shader_temp | nir_var_function_temp |
|
||||
nir_var_mem_shared | nir_var_mem_global|
|
||||
nir_var_mem_constant);
|
||||
|
||||
/* We assign explicit types early so that the optimizer can take advantage
|
||||
* of that information and hopefully get rid of some of our memcpys.
|
||||
*/
|
||||
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
|
||||
nir_var_uniform |
|
||||
nir_var_shader_temp | nir_var_function_temp |
|
||||
nir_var_mem_shared | nir_var_mem_global,
|
||||
glsl_get_cl_type_size_align);
|
||||
|
||||
struct brw_nir_compiler_opts opts = {};
|
||||
brw_preprocess_nir(compiler, nir, &opts);
|
||||
|
||||
int max_arg_idx = -1;
|
||||
nir_foreach_uniform_variable(var, nir) {
|
||||
assert(var->data.location < 256);
|
||||
max_arg_idx = MAX2(max_arg_idx, var->data.location);
|
||||
}
|
||||
|
||||
kernel->args_size = nir->num_uniforms;
|
||||
kernel->arg_count = max_arg_idx + 1;
|
||||
|
||||
/* No bindings */
|
||||
struct brw_kernel_arg_desc *args =
|
||||
rzalloc_array(mem_ctx, struct brw_kernel_arg_desc, kernel->arg_count);
|
||||
kernel->args = args;
|
||||
|
||||
nir_foreach_uniform_variable(var, nir) {
|
||||
struct brw_kernel_arg_desc arg_desc = {
|
||||
.offset = var->data.driver_location,
|
||||
.size = glsl_get_explicit_size(var->type, false),
|
||||
};
|
||||
assert(arg_desc.offset + arg_desc.size <= nir->num_uniforms);
|
||||
|
||||
assert(var->data.location >= 0);
|
||||
args[var->data.location] = arg_desc;
|
||||
}
|
||||
|
||||
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_all, NULL);
|
||||
|
||||
/* Lower again, this time after dead-variables to get more compact variable
|
||||
* layouts.
|
||||
*/
|
||||
nir->global_mem_size = 0;
|
||||
nir->scratch_size = 0;
|
||||
nir->info.shared_size = 0;
|
||||
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
|
||||
nir_var_shader_temp | nir_var_function_temp |
|
||||
nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant,
|
||||
glsl_get_cl_type_size_align);
|
||||
if (nir->constant_data_size > 0) {
|
||||
assert(nir->constant_data == NULL);
|
||||
nir->constant_data = rzalloc_size(nir, nir->constant_data_size);
|
||||
nir_gather_explicit_io_initializers(nir, nir->constant_data,
|
||||
nir->constant_data_size,
|
||||
nir_var_mem_constant);
|
||||
}
|
||||
|
||||
if (INTEL_DEBUG(DEBUG_CS)) {
|
||||
/* Re-index SSA defs so we print more sensible numbers. */
|
||||
nir_foreach_function_impl(impl, nir) {
|
||||
nir_index_ssa_defs(impl);
|
||||
}
|
||||
|
||||
fprintf(stderr, "NIR (before I/O lowering) for kernel\n");
|
||||
nir_print_shader(nir, stderr);
|
||||
}
|
||||
|
||||
NIR_PASS_V(nir, nir_lower_memcpy);
|
||||
|
||||
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant,
|
||||
nir_address_format_64bit_global);
|
||||
|
||||
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform,
|
||||
nir_address_format_32bit_offset_as_64bit);
|
||||
|
||||
NIR_PASS_V(nir, nir_lower_explicit_io,
|
||||
nir_var_shader_temp | nir_var_function_temp |
|
||||
nir_var_mem_shared | nir_var_mem_global,
|
||||
nir_address_format_62bit_generic);
|
||||
|
||||
NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL);
|
||||
|
||||
NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, NULL);
|
||||
NIR_PASS_V(nir, lower_kernel_intrinsics);
|
||||
|
||||
struct brw_cs_prog_key key = { };
|
||||
|
||||
memset(&kernel->prog_data, 0, sizeof(kernel->prog_data));
|
||||
kernel->prog_data.base.nr_params = DIV_ROUND_UP(nir->num_uniforms, 4);
|
||||
|
||||
struct brw_compile_cs_params params = {
|
||||
.base = {
|
||||
.nir = nir,
|
||||
.stats = kernel->stats,
|
||||
.log_data = log_data,
|
||||
.mem_ctx = mem_ctx,
|
||||
},
|
||||
.key = &key,
|
||||
.prog_data = &kernel->prog_data,
|
||||
};
|
||||
|
||||
kernel->code = brw_compile_cs(compiler, ¶ms);
|
||||
|
||||
if (error_str)
|
||||
*error_str = params.base.error_str;
|
||||
|
||||
return kernel->code != NULL;
|
||||
}
|
||||
|
||||
static nir_def *
|
||||
rebuild_value_from_store(struct util_dynarray *stores,
|
||||
nir_def *value, unsigned read_offset)
|
||||
{
|
||||
unsigned read_size = value->num_components * value->bit_size / 8;
|
||||
|
||||
util_dynarray_foreach(stores, nir_intrinsic_instr *, _store) {
|
||||
nir_intrinsic_instr *store = *_store;
|
||||
|
||||
unsigned write_offset = nir_src_as_uint(store->src[1]);
|
||||
unsigned write_size = nir_src_num_components(store->src[0]) *
|
||||
nir_src_bit_size(store->src[0]) / 8;
|
||||
if (write_offset <= read_offset &&
|
||||
(write_offset + write_size) >= (read_offset + read_size)) {
|
||||
assert(nir_block_dominates(store->instr.block, value->parent_instr->block));
|
||||
assert(write_size == read_size);
|
||||
return store->src[0].ssa;
|
||||
}
|
||||
}
|
||||
unreachable("Matching scratch store not found");
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove temporary variables stored to scratch to be then reloaded
|
||||
* immediately. Remap the load to the store SSA value.
|
||||
*
|
||||
* This workaround is only meant to be applied to shaders in src/intel/shaders
|
||||
* were we know there should be no issue. More complex cases might not work
|
||||
* with this approach.
|
||||
*/
|
||||
static bool
|
||||
nir_remove_llvm17_scratch(nir_shader *nir)
|
||||
{
|
||||
struct util_dynarray scratch_stores;
|
||||
void *mem_ctx = ralloc_context(NULL);
|
||||
|
||||
util_dynarray_init(&scratch_stores, mem_ctx);
|
||||
|
||||
nir_foreach_function_impl(func, nir) {
|
||||
nir_foreach_block(block, func) {
|
||||
nir_foreach_instr(instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
|
||||
if (intrin->intrinsic != nir_intrinsic_store_scratch)
|
||||
continue;
|
||||
|
||||
nir_const_value *offset = nir_src_as_const_value(intrin->src[1]);
|
||||
if (offset != NULL) {
|
||||
util_dynarray_append(&scratch_stores, nir_intrinsic_instr *, intrin);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool progress = false;
|
||||
if (util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) > 0) {
|
||||
nir_foreach_function_impl(func, nir) {
|
||||
nir_foreach_block(block, func) {
|
||||
nir_foreach_instr_safe(instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
|
||||
if (intrin->intrinsic != nir_intrinsic_load_scratch)
|
||||
continue;
|
||||
|
||||
nir_const_value *offset = nir_src_as_const_value(intrin->src[0]);
|
||||
if (offset == NULL)
|
||||
continue;
|
||||
|
||||
nir_def_rewrite_uses(&intrin->def,
|
||||
rebuild_value_from_store(
|
||||
&scratch_stores, &intrin->def,
|
||||
nir_src_as_uint(intrin->src[0])));
|
||||
nir_instr_remove(instr);
|
||||
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
util_dynarray_foreach(&scratch_stores, nir_intrinsic_instr *, _store) {
|
||||
nir_intrinsic_instr *store = *_store;
|
||||
nir_instr_remove(&store->instr);
|
||||
}
|
||||
|
||||
/* Quick sanity check */
|
||||
assert(util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) == 0 ||
|
||||
progress);
|
||||
|
||||
ralloc_free(mem_ctx);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
static void
|
||||
cleanup_llvm17_scratch(nir_shader *nir)
|
||||
{
|
||||
{
|
||||
bool progress;
|
||||
do {
|
||||
progress = false;
|
||||
NIR_PASS(progress, nir, nir_copy_prop);
|
||||
NIR_PASS(progress, nir, nir_opt_dce);
|
||||
NIR_PASS(progress, nir, nir_opt_constant_folding);
|
||||
NIR_PASS(progress, nir, nir_opt_cse);
|
||||
NIR_PASS(progress, nir, nir_opt_algebraic);
|
||||
} while (progress);
|
||||
}
|
||||
|
||||
nir_remove_llvm17_scratch(nir);
|
||||
|
||||
{
|
||||
bool progress;
|
||||
do {
|
||||
progress = false;
|
||||
NIR_PASS(progress, nir, nir_copy_prop);
|
||||
NIR_PASS(progress, nir, nir_opt_dce);
|
||||
NIR_PASS(progress, nir, nir_opt_constant_folding);
|
||||
NIR_PASS(progress, nir, nir_opt_cse);
|
||||
NIR_PASS(progress, nir, nir_opt_algebraic);
|
||||
} while (progress);
|
||||
}
|
||||
}
|
||||
|
||||
nir_shader *
|
||||
brw_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size,
|
||||
bool llvm17_wa)
|
||||
{
|
||||
struct spirv_to_nir_options spirv_options = {
|
||||
.environment = NIR_SPIRV_OPENCL,
|
||||
.caps = {
|
||||
.address = true,
|
||||
.groups = true,
|
||||
.image_write_without_format = true,
|
||||
.int8 = true,
|
||||
.int16 = true,
|
||||
.int64 = true,
|
||||
.int64_atomics = true,
|
||||
.kernel = true,
|
||||
.linkage = true, /* We receive linked kernel from clc */
|
||||
.float_controls = true,
|
||||
.generic_pointers = true,
|
||||
.storage_8bit = true,
|
||||
.storage_16bit = true,
|
||||
.subgroup_arithmetic = true,
|
||||
.subgroup_basic = true,
|
||||
.subgroup_ballot = true,
|
||||
.subgroup_dispatch = true,
|
||||
.subgroup_quad = true,
|
||||
.subgroup_shuffle = true,
|
||||
.subgroup_vote = true,
|
||||
|
||||
.intel_subgroup_shuffle = true,
|
||||
.intel_subgroup_buffer_block_io = true,
|
||||
},
|
||||
.shared_addr_format = nir_address_format_62bit_generic,
|
||||
.global_addr_format = nir_address_format_62bit_generic,
|
||||
.temp_addr_format = nir_address_format_62bit_generic,
|
||||
.constant_addr_format = nir_address_format_64bit_global,
|
||||
.create_library = true,
|
||||
};
|
||||
|
||||
assert(spirv_size % 4 == 0);
|
||||
nir_shader *nir =
|
||||
spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
|
||||
"library", &spirv_options, &brw_scalar_nir_options);
|
||||
nir_validate_shader(nir, "after spirv_to_nir");
|
||||
nir_validate_ssa_dominance(nir, "after spirv_to_nir");
|
||||
ralloc_steal(mem_ctx, nir);
|
||||
nir->info.name = ralloc_strdup(nir, "library");
|
||||
|
||||
if (INTEL_DEBUG(DEBUG_CS)) {
|
||||
/* Re-index SSA defs so we print more sensible numbers. */
|
||||
nir_foreach_function_impl(impl, nir) {
|
||||
nir_index_ssa_defs(impl);
|
||||
}
|
||||
|
||||
fprintf(stderr, "NIR (from SPIR-V) for kernel\n");
|
||||
nir_print_shader(nir, stderr);
|
||||
}
|
||||
|
||||
NIR_PASS_V(nir, implement_intel_builtins);
|
||||
NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader);
|
||||
|
||||
/* We have to lower away local constant initializers right before we
|
||||
* inline functions. That way they get properly initialized at the top
|
||||
* of the function and not at the top of its caller.
|
||||
*/
|
||||
NIR_PASS_V(nir, nir_lower_variable_initializers, ~(nir_var_shader_temp |
|
||||
nir_var_function_temp));
|
||||
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo |
|
||||
nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL);
|
||||
{
|
||||
bool progress;
|
||||
do
|
||||
{
|
||||
progress = false;
|
||||
NIR_PASS(progress, nir, nir_copy_prop);
|
||||
NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
|
||||
NIR_PASS(progress, nir, nir_opt_deref);
|
||||
NIR_PASS(progress, nir, nir_opt_dce);
|
||||
NIR_PASS(progress, nir, nir_opt_undef);
|
||||
NIR_PASS(progress, nir, nir_opt_constant_folding);
|
||||
NIR_PASS(progress, nir, nir_opt_cse);
|
||||
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
|
||||
NIR_PASS(progress, nir, nir_opt_algebraic);
|
||||
} while (progress);
|
||||
}
|
||||
|
||||
NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
|
||||
NIR_PASS_V(nir, nir_lower_returns);
|
||||
NIR_PASS_V(nir, nir_inline_functions);
|
||||
|
||||
assert(nir->scratch_size == 0);
|
||||
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_cl_type_size_align);
|
||||
|
||||
{
|
||||
bool progress;
|
||||
do
|
||||
{
|
||||
progress = false;
|
||||
NIR_PASS(progress, nir, nir_copy_prop);
|
||||
NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
|
||||
NIR_PASS(progress, nir, nir_opt_deref);
|
||||
NIR_PASS(progress, nir, nir_opt_dce);
|
||||
NIR_PASS(progress, nir, nir_opt_undef);
|
||||
NIR_PASS(progress, nir, nir_opt_constant_folding);
|
||||
NIR_PASS(progress, nir, nir_opt_cse);
|
||||
NIR_PASS(progress, nir, nir_split_var_copies);
|
||||
NIR_PASS(progress, nir, nir_lower_var_copies);
|
||||
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
|
||||
NIR_PASS(progress, nir, nir_opt_algebraic);
|
||||
NIR_PASS(progress, nir, nir_opt_if, nir_opt_if_optimize_phi_true_false);
|
||||
NIR_PASS(progress, nir, nir_opt_dead_cf);
|
||||
NIR_PASS(progress, nir, nir_opt_remove_phis);
|
||||
NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
|
||||
NIR_PASS(progress, nir, nir_lower_vec3_to_vec4, nir_var_mem_generic | nir_var_uniform);
|
||||
NIR_PASS(progress, nir, nir_opt_memcpy);
|
||||
} while (progress);
|
||||
}
|
||||
|
||||
NIR_PASS_V(nir, nir_scale_fdiv);
|
||||
|
||||
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo |
|
||||
nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL);
|
||||
|
||||
|
||||
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_mem_shared | nir_var_function_temp, NULL);
|
||||
|
||||
nir->scratch_size = 0;
|
||||
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
|
||||
nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp |
|
||||
nir_var_mem_global | nir_var_mem_constant,
|
||||
glsl_get_cl_type_size_align);
|
||||
|
||||
// Lower memcpy - needs to wait until types are sized
|
||||
{
|
||||
bool progress;
|
||||
do {
|
||||
progress = false;
|
||||
NIR_PASS(progress, nir, nir_opt_memcpy);
|
||||
NIR_PASS(progress, nir, nir_copy_prop);
|
||||
NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
|
||||
NIR_PASS(progress, nir, nir_opt_deref);
|
||||
NIR_PASS(progress, nir, nir_opt_dce);
|
||||
NIR_PASS(progress, nir, nir_split_var_copies);
|
||||
NIR_PASS(progress, nir, nir_lower_var_copies);
|
||||
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
|
||||
NIR_PASS(progress, nir, nir_opt_constant_folding);
|
||||
NIR_PASS(progress, nir, nir_opt_cse);
|
||||
} while (progress);
|
||||
}
|
||||
NIR_PASS_V(nir, nir_lower_memcpy);
|
||||
|
||||
NIR_PASS_V(nir, nir_lower_explicit_io,
|
||||
nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp | nir_var_uniform,
|
||||
nir_address_format_32bit_offset_as_64bit);
|
||||
|
||||
NIR_PASS_V(nir, nir_lower_system_values);
|
||||
|
||||
/* Hopefully we can drop this once lower_vars_to_ssa has improved to not
|
||||
* lower everything to scratch.
|
||||
*/
|
||||
if (llvm17_wa)
|
||||
cleanup_llvm17_scratch(nir);
|
||||
|
||||
/* Lower again, this time after dead-variables to get more compact variable
|
||||
* layouts.
|
||||
*/
|
||||
nir->global_mem_size = 0;
|
||||
nir->scratch_size = 0;
|
||||
nir->info.shared_size = 0;
|
||||
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
|
||||
nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant,
|
||||
glsl_get_cl_type_size_align);
|
||||
if (nir->constant_data_size > 0) {
|
||||
assert(nir->constant_data == NULL);
|
||||
nir->constant_data = rzalloc_size(nir, nir->constant_data_size);
|
||||
nir_gather_explicit_io_initializers(nir, nir->constant_data,
|
||||
nir->constant_data_size,
|
||||
nir_var_mem_constant);
|
||||
}
|
||||
|
||||
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant,
|
||||
nir_address_format_64bit_global);
|
||||
|
||||
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform,
|
||||
nir_address_format_32bit_offset_as_64bit);
|
||||
|
||||
NIR_PASS_V(nir, nir_lower_explicit_io,
|
||||
nir_var_shader_temp | nir_var_function_temp |
|
||||
nir_var_mem_shared | nir_var_mem_global,
|
||||
nir_address_format_62bit_generic);
|
||||
|
||||
if (INTEL_DEBUG(DEBUG_CS)) {
|
||||
/* Re-index SSA defs so we print more sensible numbers. */
|
||||
nir_foreach_function_impl(impl, nir) {
|
||||
nir_index_ssa_defs(impl);
|
||||
}
|
||||
|
||||
fprintf(stderr, "NIR (before I/O lowering) for kernel\n");
|
||||
nir_print_shader(nir, stderr);
|
||||
}
|
||||
|
||||
return nir;
|
||||
}
|
||||
78
src/intel/compiler/elk/brw_kernel.h
Normal file
78
src/intel/compiler/elk/brw_kernel.h
Normal file
|
|
@ -0,0 +1,78 @@
|
|||
/*
|
||||
* Copyright © 2020 Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_KERNEL_H
|
||||
#define BRW_KERNEL_H
|
||||
|
||||
#include "brw_compiler.h"
|
||||
|
||||
struct disk_cache;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/** Software interface for system values in kernels
|
||||
*
|
||||
* These are intended to go at the start of the kernel argument buffer.
|
||||
*/
|
||||
struct brw_kernel_sysvals {
|
||||
uint32_t num_work_groups[3];
|
||||
uint32_t pad[5];
|
||||
};
|
||||
|
||||
struct brw_kernel_arg_desc {
|
||||
uint16_t offset;
|
||||
uint16_t size;
|
||||
};
|
||||
|
||||
struct brw_kernel {
|
||||
struct brw_cs_prog_data prog_data;
|
||||
|
||||
struct brw_compile_stats stats[3];
|
||||
|
||||
uint16_t args_size;
|
||||
uint16_t arg_count;
|
||||
const struct brw_kernel_arg_desc *args;
|
||||
|
||||
const void *code;
|
||||
};
|
||||
|
||||
bool
|
||||
brw_kernel_from_spirv(struct brw_compiler *compiler,
|
||||
struct disk_cache *disk_cache,
|
||||
struct brw_kernel *kernel,
|
||||
void *log_data, void *mem_ctx,
|
||||
const uint32_t *spirv, size_t spirv_size,
|
||||
const char *entrypoint_name,
|
||||
char **error_str);
|
||||
|
||||
nir_shader *
|
||||
brw_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size,
|
||||
bool llvm17_wa);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* BRW_KERNEL_H */
|
||||
465
src/intel/compiler/elk/brw_lex.l
Normal file
465
src/intel/compiler/elk/brw_lex.l
Normal file
|
|
@ -0,0 +1,465 @@
|
|||
%option yylineno
|
||||
%option nounput
|
||||
%{
|
||||
#include <string.h>
|
||||
#include "brw_asm.h"
|
||||
#undef ALIGN16
|
||||
#include "brw_gram.tab.h"
|
||||
|
||||
/* Locations */
|
||||
int yycolumn = 1;
|
||||
|
||||
int saved_state = 0;
|
||||
extern char *input_filename;
|
||||
|
||||
#define YY_NO_INPUT
|
||||
#define YY_USER_ACTION \
|
||||
yylloc.first_line = yylloc.last_line = yylineno; \
|
||||
yylloc.first_column = yycolumn; \
|
||||
yylloc.last_column = yycolumn + yyleng - 1; \
|
||||
yycolumn += yyleng;
|
||||
%}
|
||||
|
||||
%x BLOCK_COMMENT
|
||||
%x FILENAME
|
||||
%x CHANNEL
|
||||
%x REG
|
||||
%x DOTSEL
|
||||
%x LABEL
|
||||
%x MSGDESC
|
||||
%%
|
||||
|
||||
/* eat up single line comment */
|
||||
\/\/.*[\r\n] { yycolumn = 1; }
|
||||
|
||||
/* eat up multiline comment */
|
||||
\/\* { saved_state = YYSTATE; BEGIN(BLOCK_COMMENT); }
|
||||
|
||||
<BLOCK_COMMENT>\*\/ { BEGIN(saved_state); }
|
||||
|
||||
<BLOCK_COMMENT>. { }
|
||||
<BLOCK_COMMENT>[\r\n] { }
|
||||
|
||||
<FILENAME>\"[^\"]+\" {
|
||||
char *name = malloc(yyleng - 1);
|
||||
memmove(name, yytext + 1, yyleng - 2);
|
||||
name[yyleng-1] = '\0';
|
||||
input_filename = name;
|
||||
}
|
||||
|
||||
/* null register */
|
||||
null { BEGIN(REG); return NULL_TOKEN; }
|
||||
|
||||
/* Opcodes */
|
||||
add { yylval.integer = BRW_OPCODE_ADD; return ADD; }
|
||||
add3 { yylval.integer = BRW_OPCODE_ADD3; return ADD3; }
|
||||
addc { yylval.integer = BRW_OPCODE_ADDC; return ADDC; }
|
||||
and { yylval.integer = BRW_OPCODE_AND; return AND; }
|
||||
asr { yylval.integer = BRW_OPCODE_ASR; return ASR; }
|
||||
avg { yylval.integer = BRW_OPCODE_AVG; return AVG; }
|
||||
bfe { yylval.integer = BRW_OPCODE_BFE; return BFE; }
|
||||
bfi1 { yylval.integer = BRW_OPCODE_BFI1; return BFI1; }
|
||||
bfi2 { yylval.integer = BRW_OPCODE_BFI2; return BFI2; }
|
||||
bfrev { yylval.integer = BRW_OPCODE_BFREV; return BFREV; }
|
||||
brc { yylval.integer = BRW_OPCODE_BRC; return BRC; }
|
||||
brd { yylval.integer = BRW_OPCODE_BRD; return BRD; }
|
||||
break { yylval.integer = BRW_OPCODE_BREAK; return BREAK; }
|
||||
call { yylval.integer = BRW_OPCODE_CALL; return CALL; }
|
||||
calla { yylval.integer = BRW_OPCODE_CALLA; return CALLA; }
|
||||
case { yylval.integer = BRW_OPCODE_CASE; return CASE; }
|
||||
cbit { yylval.integer = BRW_OPCODE_CBIT; return CBIT; }
|
||||
cmp { yylval.integer = BRW_OPCODE_CMP; return CMP; }
|
||||
cmpn { yylval.integer = BRW_OPCODE_CMPN; return CMPN; }
|
||||
cont { yylval.integer = BRW_OPCODE_CONTINUE; return CONT; }
|
||||
csel { yylval.integer = BRW_OPCODE_CSEL; return CSEL; }
|
||||
dim { yylval.integer = BRW_OPCODE_DIM; return DIM; }
|
||||
do { yylval.integer = BRW_OPCODE_DO; return DO; }
|
||||
dp2 { yylval.integer = BRW_OPCODE_DP2; return DP2; }
|
||||
dp3 { yylval.integer = BRW_OPCODE_DP3; return DP3; }
|
||||
dp4 { yylval.integer = BRW_OPCODE_DP4; return DP4; }
|
||||
dp4a { yylval.integer = BRW_OPCODE_DP4A; return DP4A; }
|
||||
dph { yylval.integer = BRW_OPCODE_DPH; return DPH; }
|
||||
else { yylval.integer = BRW_OPCODE_ELSE; return ELSE; }
|
||||
endif { yylval.integer = BRW_OPCODE_ENDIF; return ENDIF; }
|
||||
f16to32 { yylval.integer = BRW_OPCODE_F16TO32; return F16TO32; }
|
||||
f32to16 { yylval.integer = BRW_OPCODE_F32TO16; return F32TO16; }
|
||||
fbh { yylval.integer = BRW_OPCODE_FBH; return FBH; }
|
||||
fbl { yylval.integer = BRW_OPCODE_FBL; return FBL; }
|
||||
fork { yylval.integer = BRW_OPCODE_FORK; return FORK; }
|
||||
frc { yylval.integer = BRW_OPCODE_FRC; return FRC; }
|
||||
goto { yylval.integer = BRW_OPCODE_GOTO; return GOTO; }
|
||||
halt { yylval.integer = BRW_OPCODE_HALT; return HALT; }
|
||||
if { yylval.integer = BRW_OPCODE_IF; return IF; }
|
||||
iff { yylval.integer = BRW_OPCODE_IFF; return IFF; }
|
||||
illegal { yylval.integer = BRW_OPCODE_ILLEGAL; return ILLEGAL; }
|
||||
jmpi { yylval.integer = BRW_OPCODE_JMPI; return JMPI; }
|
||||
line { yylval.integer = BRW_OPCODE_LINE; return LINE; }
|
||||
lrp { yylval.integer = BRW_OPCODE_LRP; return LRP; }
|
||||
lzd { yylval.integer = BRW_OPCODE_LZD; return LZD; }
|
||||
mac { yylval.integer = BRW_OPCODE_MAC; return MAC; }
|
||||
mach { yylval.integer = BRW_OPCODE_MACH; return MACH; }
|
||||
mad { yylval.integer = BRW_OPCODE_MAD; return MAD; }
|
||||
madm { yylval.integer = BRW_OPCODE_MADM; return MADM; }
|
||||
mov { yylval.integer = BRW_OPCODE_MOV; return MOV; }
|
||||
movi { yylval.integer = BRW_OPCODE_MOVI; return MOVI; }
|
||||
mul { yylval.integer = BRW_OPCODE_MUL; return MUL; }
|
||||
mrest { yylval.integer = BRW_OPCODE_MREST; return MREST; }
|
||||
msave { yylval.integer = BRW_OPCODE_MSAVE; return MSAVE; }
|
||||
nenop { yylval.integer = BRW_OPCODE_NENOP; return NENOP; }
|
||||
nop { yylval.integer = BRW_OPCODE_NOP; return NOP; }
|
||||
not { yylval.integer = BRW_OPCODE_NOT; return NOT; }
|
||||
or { yylval.integer = BRW_OPCODE_OR; return OR; }
|
||||
pln { yylval.integer = BRW_OPCODE_PLN; return PLN; }
|
||||
pop { yylval.integer = BRW_OPCODE_POP; return POP; }
|
||||
push { yylval.integer = BRW_OPCODE_PUSH; return PUSH; }
|
||||
ret { yylval.integer = BRW_OPCODE_RET; return RET; }
|
||||
rndd { yylval.integer = BRW_OPCODE_RNDD; return RNDD; }
|
||||
rnde { yylval.integer = BRW_OPCODE_RNDE; return RNDE; }
|
||||
rndu { yylval.integer = BRW_OPCODE_RNDU; return RNDU; }
|
||||
rndz { yylval.integer = BRW_OPCODE_RNDZ; return RNDZ; }
|
||||
rol { yylval.integer = BRW_OPCODE_ROL; return ROL; }
|
||||
ror { yylval.integer = BRW_OPCODE_ROR; return ROR; }
|
||||
sad2 { yylval.integer = BRW_OPCODE_SAD2; return SAD2; }
|
||||
sada2 { yylval.integer = BRW_OPCODE_SADA2; return SADA2; }
|
||||
sel { yylval.integer = BRW_OPCODE_SEL; return SEL; }
|
||||
send {
|
||||
yylval.integer = BRW_OPCODE_SEND;
|
||||
return p->devinfo->ver < 12 ? SEND_GFX4 : SEND_GFX12;
|
||||
}
|
||||
sendc {
|
||||
yylval.integer = BRW_OPCODE_SENDC;
|
||||
return p->devinfo->ver < 12 ? SENDC_GFX4 : SENDC_GFX12;
|
||||
}
|
||||
sends { yylval.integer = BRW_OPCODE_SENDS; return SENDS; }
|
||||
sendsc { yylval.integer = BRW_OPCODE_SENDSC; return SENDSC; }
|
||||
shl { yylval.integer = BRW_OPCODE_SHL; return SHL; }
|
||||
shr { yylval.integer = BRW_OPCODE_SHR; return SHR; }
|
||||
smov { yylval.integer = BRW_OPCODE_SMOV; return SMOV; }
|
||||
subb { yylval.integer = BRW_OPCODE_SUBB; return SUBB; }
|
||||
wait { yylval.integer = BRW_OPCODE_WAIT; return WAIT; }
|
||||
while { yylval.integer = BRW_OPCODE_WHILE; return WHILE; }
|
||||
xor { yylval.integer = BRW_OPCODE_XOR; return XOR; }
|
||||
sync { yylval.integer = BRW_OPCODE_SYNC; return SYNC; }
|
||||
|
||||
/* extended math functions */
|
||||
cos { yylval.integer = BRW_MATH_FUNCTION_COS; return COS; }
|
||||
exp { yylval.integer = BRW_MATH_FUNCTION_EXP; return EXP; }
|
||||
fdiv { yylval.integer = BRW_MATH_FUNCTION_FDIV; return FDIV; }
|
||||
inv { yylval.integer = BRW_MATH_FUNCTION_INV; return INV; }
|
||||
invm { yylval.integer = GFX8_MATH_FUNCTION_INVM; return INVM; }
|
||||
intdiv {
|
||||
yylval.integer = BRW_MATH_FUNCTION_INT_DIV_QUOTIENT;
|
||||
return INTDIV;
|
||||
}
|
||||
intdivmod {
|
||||
yylval.integer =
|
||||
BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER;
|
||||
return INTDIVMOD;
|
||||
}
|
||||
intmod {
|
||||
yylval.integer = BRW_MATH_FUNCTION_INT_DIV_REMAINDER;
|
||||
return INTMOD;
|
||||
}
|
||||
log { yylval.integer = BRW_MATH_FUNCTION_LOG; return LOG; }
|
||||
pow { yylval.integer = BRW_MATH_FUNCTION_POW; return POW; }
|
||||
rsq { yylval.integer = BRW_MATH_FUNCTION_RSQ; return RSQ; }
|
||||
rsqrtm { yylval.integer = GFX8_MATH_FUNCTION_RSQRTM; return RSQRTM; }
|
||||
sin { yylval.integer = BRW_MATH_FUNCTION_SIN; return SIN; }
|
||||
sqrt { yylval.integer = BRW_MATH_FUNCTION_SQRT; return SQRT; }
|
||||
sincos { yylval.integer = BRW_MATH_FUNCTION_SINCOS; return SINCOS; }
|
||||
|
||||
/* sync instruction */
|
||||
allrd { yylval.integer = TGL_SYNC_ALLRD; return ALLRD; }
|
||||
allwr { yylval.integer = TGL_SYNC_ALLWR; return ALLWR; }
|
||||
fence { yylval.integer = TGL_SYNC_FENCE; return FENCE; }
|
||||
bar { yylval.integer = TGL_SYNC_BAR; return BAR; }
|
||||
host { yylval.integer = TGL_SYNC_HOST; return HOST; }
|
||||
|
||||
/* shared functions for send instruction */
|
||||
sampler { return SAMPLER; }
|
||||
dp_sampler { return DP_SAMPLER; }
|
||||
gateway { return GATEWAY; }
|
||||
urb { return URB; }
|
||||
thread_spawner { return THREAD_SPAWNER; }
|
||||
render { return RENDER; }
|
||||
const { return CONST; }
|
||||
data { return DATA; }
|
||||
cre { return CRE; }
|
||||
math { return MATH; }
|
||||
read { return READ; }
|
||||
write { return WRITE; }
|
||||
vme { return VME; }
|
||||
"pixel interp" { return PIXEL_INTERP; }
|
||||
"dp data 1" { return DP_DATA_1; }
|
||||
"rt accel" { return RT_ACCEL; }
|
||||
slm { return SLM; }
|
||||
tgm { return TGM; }
|
||||
ugm { return UGM; }
|
||||
|
||||
";" { return SEMICOLON; }
|
||||
":" { return COLON; }
|
||||
"(" { return LPAREN; }
|
||||
")" { return RPAREN; }
|
||||
"{" { return LCURLY; }
|
||||
"}" { return RCURLY; }
|
||||
"[" { return LSQUARE; }
|
||||
"]" { return RSQUARE; }
|
||||
"<" { return LANGLE; }
|
||||
">" { return RANGLE; }
|
||||
"," { return COMMA; }
|
||||
"." { return DOT; }
|
||||
"+" { return PLUS; }
|
||||
"-" { return MINUS; }
|
||||
"~" { return MINUS; }
|
||||
"(abs)" { return ABS; }
|
||||
|
||||
|
||||
"VxH" { return VxH; }
|
||||
<REG>"<" { return LANGLE; }
|
||||
<REG>[0-9][0-9]* {
|
||||
yylval.integer = strtoul(yytext, NULL, 10);
|
||||
return INTEGER;
|
||||
}
|
||||
<REG>">" { return RANGLE; }
|
||||
<REG>"," { return COMMA; }
|
||||
<REG>"." { BEGIN(DOTSEL); return DOT; }
|
||||
<REG>";" { return SEMICOLON; }
|
||||
|
||||
<DOTSEL>"x" { yylval.integer = BRW_CHANNEL_X; return X; }
|
||||
<DOTSEL>"y" { yylval.integer = BRW_CHANNEL_Y; return Y; }
|
||||
<DOTSEL>"z" { yylval.integer = BRW_CHANNEL_Z; return Z; }
|
||||
<DOTSEL>"w" { yylval.integer = BRW_CHANNEL_W; return W; }
|
||||
<DOTSEL>[0-9][0-9]* {
|
||||
yylval.integer = strtoul(yytext, NULL, 10);
|
||||
BEGIN(REG);
|
||||
return INTEGER;
|
||||
}
|
||||
<DOTSEL>. { yyless(0); BEGIN(INITIAL); }
|
||||
<REG>. { yyless(0); BEGIN(INITIAL); }
|
||||
|
||||
/* Access mode */
|
||||
"align1" { return ALIGN1; }
|
||||
"align16" { return ALIGN16; }
|
||||
|
||||
/* Accumulator write control */
|
||||
AccWrEnable { return ACCWREN; }
|
||||
|
||||
/* Mask control (formerly WECtrl/Write Enable Control) */
|
||||
"WE_all" { return WECTRL; }
|
||||
|
||||
/* Compaction control */
|
||||
compacted { return CMPTCTRL; }
|
||||
|
||||
/* Debug control */
|
||||
breakpoint { return BREAKPOINT; }
|
||||
|
||||
/* Dependency control */
|
||||
NoDDClr { return NODDCLR; }
|
||||
NoDDChk { return NODDCHK; }
|
||||
|
||||
/* End of thread */
|
||||
EOT { return EOT; }
|
||||
|
||||
/* Mask control */
|
||||
nomask { return MASK_DISABLE; }
|
||||
|
||||
/* Channel */
|
||||
<CHANNEL>"x" { yylval.integer = BRW_CHANNEL_X; return X; }
|
||||
<CHANNEL>"y" { yylval.integer = BRW_CHANNEL_Y; return Y; }
|
||||
<CHANNEL>"z" { yylval.integer = BRW_CHANNEL_Z; return Z; }
|
||||
<CHANNEL>"w" { yylval.integer = BRW_CHANNEL_W; return W; }
|
||||
<CHANNEL>[0-9][0-9]* {
|
||||
yylval.integer = strtoul(yytext, NULL, 10);
|
||||
return INTEGER;
|
||||
}
|
||||
<CHANNEL>"." { return DOT; }
|
||||
<CHANNEL>. { yyless(0); BEGIN(INITIAL); }
|
||||
|
||||
|
||||
/* Predicate Control */
|
||||
<CHANNEL>".anyv" { yylval.integer = BRW_PREDICATE_ALIGN1_ANYV; return ANYV; }
|
||||
<CHANNEL>".allv" { yylval.integer = BRW_PREDICATE_ALIGN1_ALLV; return ALLV; }
|
||||
<CHANNEL>".any2h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY2H; return ANY2H; }
|
||||
<CHANNEL>".all2h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL2H; return ALL2H; }
|
||||
<CHANNEL>".any4h" { yylval.integer = BRW_PREDICATE_ALIGN16_ANY4H; return ANY4H; }
|
||||
<CHANNEL>".all4h" { yylval.integer = BRW_PREDICATE_ALIGN16_ALL4H; return ALL4H; }
|
||||
<CHANNEL>".any8h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY8H; return ANY8H; }
|
||||
<CHANNEL>".all8h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL8H; return ALL8H; }
|
||||
<CHANNEL>".any16h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY16H; return ANY16H; }
|
||||
<CHANNEL>".all16h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL16H; return ALL16H; }
|
||||
<CHANNEL>".any32h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY32H; return ANY32H; }
|
||||
<CHANNEL>".all32h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL32H; return ALL32H; }
|
||||
|
||||
/* Saturation */
|
||||
".sat" { return SATURATE; }
|
||||
|
||||
/* Thread control */
|
||||
atomic { return ATOMIC; }
|
||||
switch { return SWITCH; }
|
||||
|
||||
/* compression control */
|
||||
compr { return COMPR; }
|
||||
compr4 { return COMPR4; }
|
||||
sechalf { return SECHALF; }
|
||||
|
||||
/* Quarter Control */
|
||||
1[HNQ] { }
|
||||
"2Q" { return QTR_2Q; }
|
||||
"3Q" { return QTR_3Q; }
|
||||
"4Q" { return QTR_4Q; }
|
||||
"2H" { return QTR_2H; }
|
||||
"2N" { return QTR_2N; }
|
||||
"3N" { return QTR_3N; }
|
||||
"4N" { return QTR_4N; }
|
||||
"5N" { return QTR_5N; }
|
||||
"6N" { return QTR_6N; }
|
||||
"7N" { return QTR_7N; }
|
||||
"8N" { return QTR_8N; }
|
||||
|
||||
/* data types */
|
||||
:?B { return TYPE_B; }
|
||||
:?D { return TYPE_D; }
|
||||
:?DF { return TYPE_DF; }
|
||||
:?F { return TYPE_F; }
|
||||
:?HF { return TYPE_HF; }
|
||||
:?NF { return TYPE_NF; }
|
||||
:?Q { return TYPE_Q; }
|
||||
:?UB { return TYPE_UB; }
|
||||
:?UD { return TYPE_UD; }
|
||||
:?UW { return TYPE_UW; }
|
||||
:?UQ { return TYPE_UQ; }
|
||||
:?UV { return TYPE_UV; }
|
||||
:?V { return TYPE_V; }
|
||||
:?VF { return TYPE_VF; }
|
||||
:?W { return TYPE_W; }
|
||||
|
||||
/* Address registers */
|
||||
"a0" { return ADDRREG; }
|
||||
|
||||
/* accumulator registers */
|
||||
"acc"[0-9]+ { yylval.integer = atoi(yytext + 3); return ACCREG; }
|
||||
|
||||
/* channel enable registers */
|
||||
"ce0" { return CHANNELENABLEREG; }
|
||||
|
||||
/* control registers */
|
||||
"cr0" { return CONTROLREG; }
|
||||
|
||||
/* flag registers */
|
||||
"f"[0|1] { BEGIN(CHANNEL); yylval.integer = atoi(yytext + 1); return FLAGREG; }
|
||||
|
||||
/* message control registers */
|
||||
"m" { return MSGREGFILE; }
|
||||
m[0-9]+ { yylval.integer = atoi(yytext + 1); BEGIN(REG); return MSGREG; }
|
||||
|
||||
/* state register */
|
||||
sr[0-9]+ { yylval.integer = atoi(yytext + 2); return STATEREG; }
|
||||
|
||||
/* notification registers */
|
||||
"n0" { BEGIN(REG); return NOTIFYREG; }
|
||||
|
||||
/* IP register */
|
||||
"ip" { return IPREG; }
|
||||
|
||||
/* Thread control register */
|
||||
"tdr0" { return THREADREG; }
|
||||
|
||||
/* performance register */
|
||||
"tm0" { BEGIN(REG); return PERFORMANCEREG; }
|
||||
|
||||
[gr][0-9]+ {
|
||||
yylval.integer = atoi(yytext + 1);
|
||||
BEGIN(REG); return GENREG;
|
||||
}
|
||||
[gr] { return GENREGFILE; }
|
||||
"mask"[0-9]+ { yylval.integer = atoi(yytext + 4); return MASKREG; }
|
||||
|
||||
/* Conditional modifiers */
|
||||
".e" { yylval.integer = BRW_CONDITIONAL_Z; return EQUAL; }
|
||||
".g" { yylval.integer = BRW_CONDITIONAL_G; return GREATER; }
|
||||
".ge" { yylval.integer = BRW_CONDITIONAL_GE; return GREATER_EQUAL; }
|
||||
".l" { yylval.integer = BRW_CONDITIONAL_L; return LESS; }
|
||||
".le" { yylval.integer = BRW_CONDITIONAL_LE; return LESS_EQUAL; }
|
||||
".ne" { yylval.integer = BRW_CONDITIONAL_NZ; return NOT_EQUAL; }
|
||||
".nz" { yylval.integer = BRW_CONDITIONAL_NZ; return NOT_ZERO; }
|
||||
".o" { yylval.integer = BRW_CONDITIONAL_O; return OVERFLOW; }
|
||||
".r" { yylval.integer = BRW_CONDITIONAL_R; return ROUND_INCREMENT; }
|
||||
".u" { yylval.integer = BRW_CONDITIONAL_U; return UNORDERED; }
|
||||
".z" { yylval.integer = BRW_CONDITIONAL_Z; return ZERO; }
|
||||
|
||||
/* Eat up JIP and UIP token, their values will be parsed
|
||||
* in numeric section
|
||||
*/
|
||||
"JIP: " { BEGIN(LABEL); }
|
||||
"UIP: " { BEGIN(LABEL); }
|
||||
"Jump: " { }
|
||||
"Pop: " { }
|
||||
[ \t]+ { }
|
||||
|
||||
"MsgDesc: " { BEGIN(MSGDESC); return MSGDESC_BEGIN; }
|
||||
<MSGDESC>ex_bso { return EX_BSO; }
|
||||
<MSGDESC>src1_len { return SRC1_LEN; }
|
||||
<MSGDESC>"=" { return ASSIGN; }
|
||||
<MSGDESC>[0-9][0-9]* {
|
||||
yylval.integer = strtoul(yytext, NULL, 10);
|
||||
return INTEGER;
|
||||
}
|
||||
<MSGDESC>"{" { yyless(0); BEGIN(INITIAL); return MSGDESC_END; }
|
||||
<MSGDESC>. { }
|
||||
|
||||
"0x"[0-9a-f][0-9a-f]* {
|
||||
yylval.llint = strtoull(yytext + 2, NULL, 16);
|
||||
return LONG;
|
||||
}
|
||||
[0-9][0-9]* {
|
||||
yylval.llint = strtoll(yytext, NULL, 10);
|
||||
return LONG;
|
||||
}
|
||||
|
||||
/* jump label target */
|
||||
[a-zA-Z_][0-9a-zA-Z_]*":" {
|
||||
yylval.string = ralloc_strdup(p->mem_ctx, yytext);
|
||||
/* Stomp the trailing ':' */
|
||||
yylval.string[yyleng - 1] = '\0';
|
||||
return JUMP_LABEL_TARGET;
|
||||
}
|
||||
|
||||
/* jump label */
|
||||
<LABEL>[a-zA-Z_][0-9a-zA-Z_]* {
|
||||
yylval.string = ralloc_strdup(p->mem_ctx, yytext);
|
||||
BEGIN(INITIAL);
|
||||
return JUMP_LABEL;
|
||||
}
|
||||
|
||||
/* SWSB */
|
||||
"@"[1-7] { yylval.integer = atoi(yytext + 1); return REG_DIST_CURRENT; }
|
||||
"F@"[1-7] { yylval.integer = atoi(yytext + 2); return REG_DIST_FLOAT; }
|
||||
"I@"[1-7] { yylval.integer = atoi(yytext + 2); return REG_DIST_INT; }
|
||||
"L@"[1-7] { yylval.integer = atoi(yytext + 2); return REG_DIST_LONG; }
|
||||
"A@"[1-7] { yylval.integer = atoi(yytext + 2); return REG_DIST_ALL; }
|
||||
|
||||
"$"[0-9]* { yylval.integer = atoi(yytext + 1); return SBID_ALLOC; }
|
||||
"$"[0-9]*".src" { yylval.integer = atoi(yytext + 1); return SBID_WAIT_SRC; }
|
||||
"$"[0-9]*".dst" { yylval.integer = atoi(yytext + 1); return SBID_WAIT_DST; }
|
||||
|
||||
\n { yycolumn = 1; }
|
||||
|
||||
. {
|
||||
fprintf(stderr, "%s: %d: %s: at \"%s\"\n",
|
||||
input_filename, yylineno,
|
||||
"unexpected token", lex_text());
|
||||
}
|
||||
%%
|
||||
|
||||
char *
|
||||
lex_text(void)
|
||||
{
|
||||
return yytext;
|
||||
}
|
||||
|
||||
#ifndef yywrap
|
||||
int yywrap()
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
3398
src/intel/compiler/elk/brw_lower_logical_sends.cpp
Normal file
3398
src/intel/compiler/elk/brw_lower_logical_sends.cpp
Normal file
File diff suppressed because it is too large
Load diff
1606
src/intel/compiler/elk/brw_mesh.cpp
Normal file
1606
src/intel/compiler/elk/brw_mesh.cpp
Normal file
File diff suppressed because it is too large
Load diff
2153
src/intel/compiler/elk/brw_nir.c
Normal file
2153
src/intel/compiler/elk/brw_nir.c
Normal file
File diff suppressed because it is too large
Load diff
298
src/intel/compiler/elk/brw_nir.h
Normal file
298
src/intel/compiler/elk/brw_nir.h
Normal file
|
|
@ -0,0 +1,298 @@
|
|||
/*
|
||||
* Copyright © 2015 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_NIR_H
|
||||
#define BRW_NIR_H
|
||||
|
||||
#include "brw_reg.h"
|
||||
#include "compiler/nir/nir.h"
|
||||
#include "brw_compiler.h"
|
||||
#include "nir_builder.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern const struct nir_shader_compiler_options brw_scalar_nir_options;
|
||||
extern const struct nir_shader_compiler_options brw_vector_nir_options;
|
||||
|
||||
int type_size_vec4(const struct glsl_type *type, bool bindless);
|
||||
int type_size_dvec4(const struct glsl_type *type, bool bindless);
|
||||
|
||||
static inline int
|
||||
type_size_scalar_bytes(const struct glsl_type *type, bool bindless)
|
||||
{
|
||||
return glsl_count_dword_slots(type, bindless) * 4;
|
||||
}
|
||||
|
||||
static inline int
|
||||
type_size_vec4_bytes(const struct glsl_type *type, bool bindless)
|
||||
{
|
||||
return type_size_vec4(type, bindless) * 16;
|
||||
}
|
||||
|
||||
/* Flags set in the instr->pass_flags field by i965 analysis passes */
|
||||
enum {
|
||||
BRW_NIR_NON_BOOLEAN = 0x0,
|
||||
|
||||
/* Indicates that the given instruction's destination is a boolean
|
||||
* value but that it needs to be resolved before it can be used.
|
||||
* On Gen <= 5, CMP instructions return a 32-bit value where the bottom
|
||||
* bit represents the actual true/false value of the compare and the top
|
||||
* 31 bits are undefined. In order to use this value, we have to do a
|
||||
* "resolve" operation by replacing the value of the CMP with -(x & 1)
|
||||
* to sign-extend the bottom bit to 0/~0.
|
||||
*/
|
||||
BRW_NIR_BOOLEAN_NEEDS_RESOLVE = 0x1,
|
||||
|
||||
/* Indicates that the given instruction's destination is a boolean
|
||||
* value that has intentionally been left unresolved. Not all boolean
|
||||
* values need to be resolved immediately. For instance, if we have
|
||||
*
|
||||
* CMP r1 r2 r3
|
||||
* CMP r4 r5 r6
|
||||
* AND r7 r1 r4
|
||||
*
|
||||
* We don't have to resolve the result of the two CMP instructions
|
||||
* immediately because the AND still does an AND of the bottom bits.
|
||||
* Instead, we can save ourselves instructions by delaying the resolve
|
||||
* until after the AND. The result of the two CMP instructions is left
|
||||
* as BRW_NIR_BOOLEAN_UNRESOLVED.
|
||||
*/
|
||||
BRW_NIR_BOOLEAN_UNRESOLVED = 0x2,
|
||||
|
||||
/* Indicates a that the given instruction's destination is a boolean
|
||||
* value that does not need a resolve. For instance, if you AND two
|
||||
* values that are BRW_NIR_BOOLEAN_NEEDS_RESOLVE then we know that both
|
||||
* values will be 0/~0 before we get them and the result of the AND is
|
||||
* also guaranteed to be 0/~0 and does not need a resolve.
|
||||
*/
|
||||
BRW_NIR_BOOLEAN_NO_RESOLVE = 0x3,
|
||||
|
||||
/* A mask to mask the boolean status values off of instr->pass_flags */
|
||||
BRW_NIR_BOOLEAN_MASK = 0x3,
|
||||
};
|
||||
|
||||
void brw_nir_analyze_boolean_resolves(nir_shader *nir);
|
||||
|
||||
struct brw_nir_compiler_opts {
|
||||
/* Soft floating point implementation shader */
|
||||
const nir_shader *softfp64;
|
||||
|
||||
/* Whether robust image access is enabled */
|
||||
bool robust_image_access;
|
||||
|
||||
/* Input vertices for TCS stage (0 means dynamic) */
|
||||
unsigned input_vertices;
|
||||
};
|
||||
|
||||
/* UBO surface index can come in 2 flavors :
|
||||
* - nir_intrinsic_resource_intel
|
||||
* - anything else
|
||||
*
|
||||
* In the first case, checking that the surface index is const requires
|
||||
* checking resource_intel::src[1]. In any other case it's a simple
|
||||
* nir_src_is_const().
|
||||
*
|
||||
* This function should only be called on src[0] of load_ubo intrinsics.
|
||||
*/
|
||||
static inline bool
|
||||
brw_nir_ubo_surface_index_is_pushable(nir_src src)
|
||||
{
|
||||
nir_intrinsic_instr *intrin =
|
||||
src.ssa->parent_instr->type == nir_instr_type_intrinsic ?
|
||||
nir_instr_as_intrinsic(src.ssa->parent_instr) : NULL;
|
||||
|
||||
if (intrin && intrin->intrinsic == nir_intrinsic_resource_intel) {
|
||||
return (nir_intrinsic_resource_access_intel(intrin) &
|
||||
nir_resource_intel_pushable);
|
||||
}
|
||||
|
||||
return nir_src_is_const(src);
|
||||
}
|
||||
|
||||
static inline unsigned
|
||||
brw_nir_ubo_surface_index_get_push_block(nir_src src)
|
||||
{
|
||||
if (nir_src_is_const(src))
|
||||
return nir_src_as_uint(src);
|
||||
|
||||
if (!brw_nir_ubo_surface_index_is_pushable(src))
|
||||
return UINT32_MAX;
|
||||
|
||||
assert(src.ssa->parent_instr->type == nir_instr_type_intrinsic);
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src.ssa->parent_instr);
|
||||
assert(intrin->intrinsic == nir_intrinsic_resource_intel);
|
||||
|
||||
return nir_intrinsic_resource_block_intel(intrin);
|
||||
}
|
||||
|
||||
/* This helper return the binding table index of a surface access (any
|
||||
* buffer/image/etc...). It works off the source of one of the intrinsics
|
||||
* (load_ubo, load_ssbo, store_ssbo, load_image, store_image, etc...).
|
||||
*
|
||||
* If the source is constant, then this is the binding table index. If we're
|
||||
* going through a resource_intel intel intrinsic, then we need to check
|
||||
* src[1] of that intrinsic.
|
||||
*/
|
||||
static inline unsigned
|
||||
brw_nir_ubo_surface_index_get_bti(nir_src src)
|
||||
{
|
||||
if (nir_src_is_const(src))
|
||||
return nir_src_as_uint(src);
|
||||
|
||||
assert(src.ssa->parent_instr->type == nir_instr_type_intrinsic);
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src.ssa->parent_instr);
|
||||
if (!intrin || intrin->intrinsic != nir_intrinsic_resource_intel)
|
||||
return UINT32_MAX;
|
||||
|
||||
/* In practice we could even drop this intrinsic because the bindless
|
||||
* access always operate from a base offset coming from a push constant, so
|
||||
* they can never be constant.
|
||||
*/
|
||||
if (nir_intrinsic_resource_access_intel(intrin) &
|
||||
nir_resource_intel_bindless)
|
||||
return UINT32_MAX;
|
||||
|
||||
if (!nir_src_is_const(intrin->src[1]))
|
||||
return UINT32_MAX;
|
||||
|
||||
return nir_src_as_uint(intrin->src[1]);
|
||||
}
|
||||
|
||||
void brw_preprocess_nir(const struct brw_compiler *compiler,
|
||||
nir_shader *nir,
|
||||
const struct brw_nir_compiler_opts *opts);
|
||||
|
||||
void
|
||||
brw_nir_link_shaders(const struct brw_compiler *compiler,
|
||||
nir_shader *producer, nir_shader *consumer);
|
||||
|
||||
bool brw_nir_lower_cs_intrinsics(nir_shader *nir,
|
||||
const struct intel_device_info *devinfo,
|
||||
struct brw_cs_prog_data *prog_data);
|
||||
bool brw_nir_lower_alpha_to_coverage(nir_shader *shader,
|
||||
const struct brw_wm_prog_key *key,
|
||||
const struct brw_wm_prog_data *prog_data);
|
||||
void brw_nir_lower_vs_inputs(nir_shader *nir,
|
||||
bool edgeflag_is_last,
|
||||
const uint8_t *vs_attrib_wa_flags);
|
||||
void brw_nir_lower_vue_inputs(nir_shader *nir,
|
||||
const struct intel_vue_map *vue_map);
|
||||
void brw_nir_lower_tes_inputs(nir_shader *nir, const struct intel_vue_map *vue);
|
||||
void brw_nir_lower_fs_inputs(nir_shader *nir,
|
||||
const struct intel_device_info *devinfo,
|
||||
const struct brw_wm_prog_key *key);
|
||||
void brw_nir_lower_vue_outputs(nir_shader *nir);
|
||||
void brw_nir_lower_tcs_outputs(nir_shader *nir, const struct intel_vue_map *vue,
|
||||
enum tess_primitive_mode tes_primitive_mode);
|
||||
void brw_nir_lower_fs_outputs(nir_shader *nir);
|
||||
|
||||
bool brw_nir_lower_cmat(nir_shader *nir, unsigned subgroup_size);
|
||||
|
||||
bool brw_nir_lower_shading_rate_output(nir_shader *nir);
|
||||
|
||||
bool brw_nir_lower_sparse_intrinsics(nir_shader *nir);
|
||||
|
||||
struct brw_nir_lower_storage_image_opts {
|
||||
const struct intel_device_info *devinfo;
|
||||
|
||||
bool lower_loads;
|
||||
bool lower_stores;
|
||||
bool lower_atomics;
|
||||
bool lower_get_size;
|
||||
};
|
||||
|
||||
bool brw_nir_lower_storage_image(nir_shader *nir,
|
||||
const struct brw_nir_lower_storage_image_opts *opts);
|
||||
|
||||
bool brw_nir_lower_mem_access_bit_sizes(nir_shader *shader,
|
||||
const struct
|
||||
intel_device_info *devinfo);
|
||||
|
||||
void brw_postprocess_nir(nir_shader *nir,
|
||||
const struct brw_compiler *compiler,
|
||||
bool debug_enabled,
|
||||
enum brw_robustness_flags robust_flags);
|
||||
|
||||
bool brw_nir_apply_attribute_workarounds(nir_shader *nir,
|
||||
const uint8_t *attrib_wa_flags);
|
||||
|
||||
bool brw_nir_apply_trig_workarounds(nir_shader *nir);
|
||||
|
||||
bool brw_nir_limit_trig_input_range_workaround(nir_shader *nir);
|
||||
|
||||
void brw_nir_apply_key(nir_shader *nir,
|
||||
const struct brw_compiler *compiler,
|
||||
const struct brw_base_prog_key *key,
|
||||
unsigned max_subgroup_size);
|
||||
|
||||
unsigned brw_nir_api_subgroup_size(const nir_shader *nir,
|
||||
unsigned hw_subgroup_size);
|
||||
|
||||
enum brw_conditional_mod brw_cmod_for_nir_comparison(nir_op op);
|
||||
enum lsc_opcode lsc_aop_for_nir_intrinsic(const nir_intrinsic_instr *atomic);
|
||||
enum brw_reg_type brw_type_for_nir_type(const struct intel_device_info *devinfo,
|
||||
nir_alu_type type);
|
||||
|
||||
bool brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
|
||||
unsigned bit_size,
|
||||
unsigned num_components,
|
||||
nir_intrinsic_instr *low,
|
||||
nir_intrinsic_instr *high,
|
||||
void *data);
|
||||
|
||||
void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
|
||||
nir_shader *nir,
|
||||
struct brw_ubo_range out_ranges[4]);
|
||||
|
||||
void brw_nir_optimize(nir_shader *nir, bool is_scalar,
|
||||
const struct intel_device_info *devinfo);
|
||||
|
||||
nir_shader *brw_nir_create_passthrough_tcs(void *mem_ctx,
|
||||
const struct brw_compiler *compiler,
|
||||
const struct brw_tcs_prog_key *key);
|
||||
|
||||
#define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0
|
||||
#define BRW_NIR_FRAG_OUTPUT_INDEX_MASK INTEL_MASK(0, 0)
|
||||
#define BRW_NIR_FRAG_OUTPUT_LOCATION_SHIFT 1
|
||||
#define BRW_NIR_FRAG_OUTPUT_LOCATION_MASK INTEL_MASK(31, 1)
|
||||
|
||||
bool brw_nir_move_interpolation_to_top(nir_shader *nir);
|
||||
nir_def *brw_nir_load_global_const(nir_builder *b,
|
||||
nir_intrinsic_instr *load_uniform,
|
||||
nir_def *base_addr,
|
||||
unsigned off);
|
||||
|
||||
const struct glsl_type *brw_nir_get_var_type(const struct nir_shader *nir,
|
||||
nir_variable *var);
|
||||
|
||||
void brw_nir_adjust_payload(nir_shader *shader);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* BRW_NIR_H */
|
||||
258
src/intel/compiler/elk/brw_nir_analyze_boolean_resolves.c
Normal file
258
src/intel/compiler/elk/brw_nir_analyze_boolean_resolves.c
Normal file
|
|
@ -0,0 +1,258 @@
|
|||
/*
|
||||
* Copyright © 2015 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_nir.h"
|
||||
|
||||
/*
|
||||
* This file implements an analysis pass that determines when we have to do
|
||||
* a boolean resolve on Gen <= 5. Instructions that need a boolean resolve
|
||||
* will have the booleans portion of the instr->pass_flags field set to
|
||||
* BRW_NIR_BOOLEAN_NEEDS_RESOLVE.
|
||||
*/
|
||||
|
||||
|
||||
/** Returns the resolve status for the given source
|
||||
*
|
||||
* If the source has a parent instruction then the resolve status is the
|
||||
* status of the parent instruction. If the source does not have a parent
|
||||
* instruction then we don't know so we return NON_BOOLEAN.
|
||||
*/
|
||||
static uint8_t
|
||||
get_resolve_status_for_src(nir_src *src)
|
||||
{
|
||||
nir_instr *src_instr = src->ssa->parent_instr;
|
||||
uint8_t resolve_status = src_instr->pass_flags & BRW_NIR_BOOLEAN_MASK;
|
||||
|
||||
/* If the source instruction needs resolve, then from the perspective
|
||||
* of the user, it's a true boolean.
|
||||
*/
|
||||
if (resolve_status == BRW_NIR_BOOLEAN_NEEDS_RESOLVE)
|
||||
resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
|
||||
return resolve_status;
|
||||
}
|
||||
|
||||
/** Marks the given source as needing a resolve
|
||||
*
|
||||
* If the given source corresponds to an unresolved boolean it marks it as
|
||||
* needing a resolve. Otherwise, we leave it alone.
|
||||
*/
|
||||
static bool
|
||||
src_mark_needs_resolve(nir_src *src, void *void_state)
|
||||
{
|
||||
nir_instr *src_instr = src->ssa->parent_instr;
|
||||
uint8_t resolve_status = src_instr->pass_flags & BRW_NIR_BOOLEAN_MASK;
|
||||
|
||||
/* If the source instruction is unresolved, then mark it as needing
|
||||
* to be resolved.
|
||||
*/
|
||||
if (resolve_status == BRW_NIR_BOOLEAN_UNRESOLVED) {
|
||||
src_instr->pass_flags &= ~BRW_NIR_BOOLEAN_MASK;
|
||||
src_instr->pass_flags |= BRW_NIR_BOOLEAN_NEEDS_RESOLVE;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
analyze_boolean_resolves_block(nir_block *block)
|
||||
{
|
||||
nir_foreach_instr(instr, block) {
|
||||
switch (instr->type) {
|
||||
case nir_instr_type_alu: {
|
||||
/* For ALU instructions, the resolve status is handled in a
|
||||
* three-step process.
|
||||
*
|
||||
* 1) Look at the instruction type and sources and determine if it
|
||||
* can be left unresolved.
|
||||
*
|
||||
* 2) Look at the destination and see if we have to resolve
|
||||
* anyway. (This is the case if this instruction is not the
|
||||
* only instruction writing to a given register.)
|
||||
*
|
||||
* 3) If the instruction has a resolve status other than
|
||||
* BOOL_UNRESOLVED or BOOL_NEEDS_RESOLVE then we walk through
|
||||
* the sources and ensure that they are also resolved. This
|
||||
* ensures that we don't end up with any stray unresolved
|
||||
* booleans going into ADDs or something like that.
|
||||
*/
|
||||
|
||||
uint8_t resolve_status;
|
||||
nir_alu_instr *alu = nir_instr_as_alu(instr);
|
||||
switch (alu->op) {
|
||||
case nir_op_b32all_fequal2:
|
||||
case nir_op_b32all_iequal2:
|
||||
case nir_op_b32all_fequal3:
|
||||
case nir_op_b32all_iequal3:
|
||||
case nir_op_b32all_fequal4:
|
||||
case nir_op_b32all_iequal4:
|
||||
case nir_op_b32any_fnequal2:
|
||||
case nir_op_b32any_inequal2:
|
||||
case nir_op_b32any_fnequal3:
|
||||
case nir_op_b32any_inequal3:
|
||||
case nir_op_b32any_fnequal4:
|
||||
case nir_op_b32any_inequal4:
|
||||
/* These are only implemented by the vec4 backend and its
|
||||
* implementation emits resolved booleans. At some point in the
|
||||
* future, this may change and we'll have to remove some of the
|
||||
* above cases.
|
||||
*/
|
||||
resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
|
||||
break;
|
||||
|
||||
case nir_op_mov:
|
||||
case nir_op_inot:
|
||||
/* This is a single-source instruction. Just copy the resolve
|
||||
* status from the source.
|
||||
*/
|
||||
resolve_status = get_resolve_status_for_src(&alu->src[0].src);
|
||||
break;
|
||||
|
||||
case nir_op_b32csel:
|
||||
case nir_op_iand:
|
||||
case nir_op_ior:
|
||||
case nir_op_ixor: {
|
||||
const unsigned first = alu->op == nir_op_b32csel ? 1 : 0;
|
||||
uint8_t src0_status = get_resolve_status_for_src(&alu->src[first + 0].src);
|
||||
uint8_t src1_status = get_resolve_status_for_src(&alu->src[first + 1].src);
|
||||
|
||||
/* src0 of a bcsel is evaluated as a Boolean with the expectation
|
||||
* that it has already been resolved. Mark it as such.
|
||||
*/
|
||||
if (alu->op == nir_op_b32csel)
|
||||
src_mark_needs_resolve(&alu->src[0].src, NULL);
|
||||
|
||||
if (src0_status == src1_status) {
|
||||
resolve_status = src0_status;
|
||||
} else if (src0_status == BRW_NIR_NON_BOOLEAN ||
|
||||
src1_status == BRW_NIR_NON_BOOLEAN) {
|
||||
/* If one of the sources is a non-boolean then the whole
|
||||
* thing is a non-boolean.
|
||||
*/
|
||||
resolve_status = BRW_NIR_NON_BOOLEAN;
|
||||
} else {
|
||||
/* At this point one of them is a true boolean and one is a
|
||||
* boolean that needs a resolve. We could either resolve the
|
||||
* unresolved source or we could resolve here. If we resolve
|
||||
* the unresolved source then we get two resolves for the price
|
||||
* of one. Just set this one to BOOLEAN_NO_RESOLVE and we'll
|
||||
* let the code below force a resolve on the unresolved source.
|
||||
*/
|
||||
resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_bool) {
|
||||
/* This instructions will turn into a CMP when we actually emit
|
||||
* them so the result will have to be resolved before it can be
|
||||
* used.
|
||||
*/
|
||||
resolve_status = BRW_NIR_BOOLEAN_UNRESOLVED;
|
||||
|
||||
/* Even though the destination is allowed to be left
|
||||
* unresolved, the sources are treated as regular integers or
|
||||
* floats so they need to be resolved.
|
||||
*/
|
||||
nir_foreach_src(instr, src_mark_needs_resolve, NULL);
|
||||
} else {
|
||||
resolve_status = BRW_NIR_NON_BOOLEAN;
|
||||
}
|
||||
}
|
||||
|
||||
/* Go ahead allow unresolved booleans. */
|
||||
instr->pass_flags = (instr->pass_flags & ~BRW_NIR_BOOLEAN_MASK) |
|
||||
resolve_status;
|
||||
|
||||
/* Finally, resolve sources if it's needed */
|
||||
switch (resolve_status) {
|
||||
case BRW_NIR_BOOLEAN_NEEDS_RESOLVE:
|
||||
case BRW_NIR_BOOLEAN_UNRESOLVED:
|
||||
/* This instruction is either unresolved or we're doing the
|
||||
* resolve here; leave the sources alone.
|
||||
*/
|
||||
break;
|
||||
|
||||
case BRW_NIR_BOOLEAN_NO_RESOLVE:
|
||||
case BRW_NIR_NON_BOOLEAN:
|
||||
nir_foreach_src(instr, src_mark_needs_resolve, NULL);
|
||||
break;
|
||||
|
||||
default:
|
||||
unreachable("Invalid boolean flag");
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_instr_type_load_const: {
|
||||
nir_load_const_instr *load = nir_instr_as_load_const(instr);
|
||||
|
||||
/* For load_const instructions, it's a boolean exactly when it holds
|
||||
* one of the values NIR_TRUE or NIR_FALSE.
|
||||
*
|
||||
* Since load_const instructions don't have any sources, we don't
|
||||
* have to worry about resolving them.
|
||||
*/
|
||||
instr->pass_flags &= ~BRW_NIR_BOOLEAN_MASK;
|
||||
if (load->value[0].u32 == NIR_TRUE || load->value[0].u32 == NIR_FALSE) {
|
||||
instr->pass_flags |= BRW_NIR_BOOLEAN_NO_RESOLVE;
|
||||
} else {
|
||||
instr->pass_flags |= BRW_NIR_NON_BOOLEAN;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
default:
|
||||
/* Everything else is an unknown non-boolean value and needs to
|
||||
* have all sources resolved.
|
||||
*/
|
||||
instr->pass_flags = (instr->pass_flags & ~BRW_NIR_BOOLEAN_MASK) |
|
||||
BRW_NIR_NON_BOOLEAN;
|
||||
nir_foreach_src(instr, src_mark_needs_resolve, NULL);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
nir_if *following_if = nir_block_get_following_if(block);
|
||||
if (following_if)
|
||||
src_mark_needs_resolve(&following_if->condition, NULL);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
analyze_boolean_resolves_impl(nir_function_impl *impl)
|
||||
{
|
||||
nir_foreach_block(block, impl) {
|
||||
analyze_boolean_resolves_block(block);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
brw_nir_analyze_boolean_resolves(nir_shader *shader)
|
||||
{
|
||||
nir_foreach_function_impl(impl, shader) {
|
||||
analyze_boolean_resolves_impl(impl);
|
||||
}
|
||||
}
|
||||
317
src/intel/compiler/elk/brw_nir_analyze_ubo_ranges.c
Normal file
317
src/intel/compiler/elk/brw_nir_analyze_ubo_ranges.c
Normal file
|
|
@ -0,0 +1,317 @@
|
|||
/*
|
||||
* Copyright © 2015 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_nir.h"
|
||||
#include "compiler/nir/nir.h"
|
||||
#include "util/u_dynarray.h"
|
||||
|
||||
/**
|
||||
* \file brw_nir_analyze_ubo_ranges.c
|
||||
*
|
||||
* This pass decides which portions of UBOs to upload as push constants,
|
||||
* so shaders can access them as part of the thread payload, rather than
|
||||
* having to issue expensive memory reads to pull the data.
|
||||
*
|
||||
* The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different
|
||||
* buffers, in GRF (256-bit/32-byte) units.
|
||||
*
|
||||
* To do this, we examine NIR load_ubo intrinsics, recording the number of
|
||||
* loads at each offset. We track offsets at a 32-byte granularity, so even
|
||||
* fields with a bit of padding between them tend to fall into contiguous
|
||||
* ranges. We build a list of these ranges, tracking their "cost" (number
|
||||
* of registers required) and "benefit" (number of pull loads eliminated
|
||||
* by pushing the range). We then sort the list to obtain the four best
|
||||
* ranges (most benefit for the least cost).
|
||||
*/
|
||||
|
||||
struct ubo_range_entry
|
||||
{
|
||||
struct brw_ubo_range range;
|
||||
int benefit;
|
||||
};
|
||||
|
||||
static int
|
||||
score(const struct ubo_range_entry *entry)
|
||||
{
|
||||
return 2 * entry->benefit - entry->range.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares score for two UBO range entries.
|
||||
*
|
||||
* For a descending qsort().
|
||||
*/
|
||||
static int
|
||||
cmp_ubo_range_entry(const void *va, const void *vb)
|
||||
{
|
||||
const struct ubo_range_entry *a = va;
|
||||
const struct ubo_range_entry *b = vb;
|
||||
|
||||
/* Rank based on scores, descending order */
|
||||
int delta = score(b) - score(a);
|
||||
|
||||
/* Then use the UBO block index as a tie-breaker, descending order */
|
||||
if (delta == 0)
|
||||
delta = b->range.block - a->range.block;
|
||||
|
||||
/* Finally use the start offset as a second tie-breaker, ascending order */
|
||||
if (delta == 0)
|
||||
delta = a->range.start - b->range.start;
|
||||
|
||||
return delta;
|
||||
}
|
||||
|
||||
struct ubo_block_info
|
||||
{
|
||||
/* Each bit in the offsets bitfield represents a 32-byte section of data.
|
||||
* If it's set to one, there is interesting UBO data at that offset. If
|
||||
* not, there's a "hole" - padding between data - or just nothing at all.
|
||||
*/
|
||||
uint64_t offsets;
|
||||
uint8_t uses[64];
|
||||
};
|
||||
|
||||
struct ubo_analysis_state
|
||||
{
|
||||
struct hash_table *blocks;
|
||||
bool uses_regular_uniforms;
|
||||
};
|
||||
|
||||
static struct ubo_block_info *
|
||||
get_block_info(struct ubo_analysis_state *state, int block)
|
||||
{
|
||||
uint32_t hash = block + 1;
|
||||
void *key = (void *) (uintptr_t) hash;
|
||||
|
||||
struct hash_entry *entry =
|
||||
_mesa_hash_table_search_pre_hashed(state->blocks, hash, key);
|
||||
|
||||
if (entry)
|
||||
return (struct ubo_block_info *) entry->data;
|
||||
|
||||
struct ubo_block_info *info =
|
||||
rzalloc(state->blocks, struct ubo_block_info);
|
||||
_mesa_hash_table_insert_pre_hashed(state->blocks, hash, key, info);
|
||||
|
||||
return info;
|
||||
}
|
||||
|
||||
static void
|
||||
analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block)
|
||||
{
|
||||
nir_foreach_instr(instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_load_uniform:
|
||||
case nir_intrinsic_image_deref_load:
|
||||
case nir_intrinsic_image_deref_store:
|
||||
case nir_intrinsic_image_deref_atomic:
|
||||
case nir_intrinsic_image_deref_atomic_swap:
|
||||
case nir_intrinsic_image_deref_size:
|
||||
state->uses_regular_uniforms = true;
|
||||
continue;
|
||||
|
||||
case nir_intrinsic_load_ubo:
|
||||
break; /* Fall through to the analysis below */
|
||||
|
||||
default:
|
||||
continue; /* Not a uniform or UBO intrinsic */
|
||||
}
|
||||
|
||||
if (brw_nir_ubo_surface_index_is_pushable(intrin->src[0]) &&
|
||||
nir_src_is_const(intrin->src[1])) {
|
||||
const int block = brw_nir_ubo_surface_index_get_push_block(intrin->src[0]);
|
||||
const unsigned byte_offset = nir_src_as_uint(intrin->src[1]);
|
||||
const int offset = byte_offset / 32;
|
||||
|
||||
/* Avoid shifting by larger than the width of our bitfield, as this
|
||||
* is undefined in C. Even if we require multiple bits to represent
|
||||
* the entire value, it's OK to record a partial value - the backend
|
||||
* is capable of falling back to pull loads for later components of
|
||||
* vectors, as it has to shrink ranges for other reasons anyway.
|
||||
*/
|
||||
if (offset >= 64)
|
||||
continue;
|
||||
|
||||
/* The value might span multiple 32-byte chunks. */
|
||||
const int bytes = nir_intrinsic_dest_components(intrin) *
|
||||
(intrin->def.bit_size / 8);
|
||||
const int start = ROUND_DOWN_TO(byte_offset, 32);
|
||||
const int end = ALIGN(byte_offset + bytes, 32);
|
||||
const int chunks = (end - start) / 32;
|
||||
|
||||
/* TODO: should we count uses in loops as higher benefit? */
|
||||
|
||||
struct ubo_block_info *info = get_block_info(state, block);
|
||||
info->offsets |= ((1ull << chunks) - 1) << offset;
|
||||
info->uses[offset]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
print_ubo_entry(FILE *file,
|
||||
const struct ubo_range_entry *entry,
|
||||
struct ubo_analysis_state *state)
|
||||
{
|
||||
struct ubo_block_info *info = get_block_info(state, entry->range.block);
|
||||
|
||||
fprintf(file,
|
||||
"block %2d, start %2d, length %2d, bits = %"PRIx64", "
|
||||
"benefit %2d, cost %2d, score = %2d\n",
|
||||
entry->range.block, entry->range.start, entry->range.length,
|
||||
info->offsets, entry->benefit, entry->range.length, score(entry));
|
||||
}
|
||||
|
||||
void
|
||||
brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
|
||||
nir_shader *nir,
|
||||
struct brw_ubo_range out_ranges[4])
|
||||
{
|
||||
void *mem_ctx = ralloc_context(NULL);
|
||||
|
||||
struct ubo_analysis_state state = {
|
||||
.uses_regular_uniforms = false,
|
||||
.blocks =
|
||||
_mesa_hash_table_create(mem_ctx, NULL, _mesa_key_pointer_equal),
|
||||
};
|
||||
|
||||
/* Compute shaders use push constants to get the subgroup ID so it's
|
||||
* best to just assume some system values are pushed.
|
||||
*/
|
||||
if (nir->info.stage == MESA_SHADER_COMPUTE)
|
||||
state.uses_regular_uniforms = true;
|
||||
|
||||
/* Walk the IR, recording how many times each UBO block/offset is used. */
|
||||
nir_foreach_function_impl(impl, nir) {
|
||||
nir_foreach_block(block, impl) {
|
||||
analyze_ubos_block(&state, block);
|
||||
}
|
||||
}
|
||||
|
||||
/* Find ranges: a block, starting 32-byte offset, and length. */
|
||||
struct util_dynarray ranges;
|
||||
util_dynarray_init(&ranges, mem_ctx);
|
||||
|
||||
hash_table_foreach(state.blocks, entry) {
|
||||
const int b = entry->hash - 1;
|
||||
const struct ubo_block_info *info = entry->data;
|
||||
uint64_t offsets = info->offsets;
|
||||
|
||||
/* Walk through the offsets bitfield, finding contiguous regions of
|
||||
* set bits:
|
||||
*
|
||||
* 0000000001111111111111000000000000111111111111110000000011111100
|
||||
* ^^^^^^^^^^^^^ ^^^^^^^^^^^^^^ ^^^^^^
|
||||
*
|
||||
* Each of these will become a UBO range.
|
||||
*/
|
||||
while (offsets != 0) {
|
||||
/* Find the first 1 in the offsets bitfield. This represents the
|
||||
* start of a range of interesting UBO data. Make it zero-indexed.
|
||||
*/
|
||||
int first_bit = ffsll(offsets) - 1;
|
||||
|
||||
/* Find the first 0 bit in offsets beyond first_bit. To find the
|
||||
* first zero bit, we find the first 1 bit in the complement. In
|
||||
* order to ignore bits before first_bit, we mask off those bits.
|
||||
*/
|
||||
int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1;
|
||||
|
||||
if (first_hole == -1) {
|
||||
/* If we didn't find a hole, then set it to the end of the
|
||||
* bitfield. There are no more ranges to process.
|
||||
*/
|
||||
first_hole = 64;
|
||||
offsets = 0;
|
||||
} else {
|
||||
/* We've processed all bits before first_hole. Mask them off. */
|
||||
offsets &= ~((1ull << first_hole) - 1);
|
||||
}
|
||||
|
||||
struct ubo_range_entry *entry =
|
||||
util_dynarray_grow(&ranges, struct ubo_range_entry, 1);
|
||||
|
||||
entry->range.block = b;
|
||||
entry->range.start = first_bit;
|
||||
/* first_hole is one beyond the end, so we don't need to add 1 */
|
||||
entry->range.length = first_hole - first_bit;
|
||||
entry->benefit = 0;
|
||||
|
||||
for (int i = 0; i < entry->range.length; i++)
|
||||
entry->benefit += info->uses[first_bit + i];
|
||||
}
|
||||
}
|
||||
|
||||
int nr_entries = ranges.size / sizeof(struct ubo_range_entry);
|
||||
|
||||
if (0) {
|
||||
util_dynarray_foreach(&ranges, struct ubo_range_entry, entry) {
|
||||
print_ubo_entry(stderr, entry, &state);
|
||||
}
|
||||
}
|
||||
|
||||
/* TODO: Consider combining ranges.
|
||||
*
|
||||
* We can only push 3-4 ranges via 3DSTATE_CONSTANT_XS. If there are
|
||||
* more ranges, and two are close by with only a small hole, it may be
|
||||
* worth combining them. The holes will waste register space, but the
|
||||
* benefit of removing pulls may outweigh that cost.
|
||||
*/
|
||||
|
||||
/* Sort the list so the most beneficial ranges are at the front. */
|
||||
if (nr_entries > 0) {
|
||||
qsort(ranges.data, nr_entries, sizeof(struct ubo_range_entry),
|
||||
cmp_ubo_range_entry);
|
||||
}
|
||||
|
||||
struct ubo_range_entry *entries = ranges.data;
|
||||
|
||||
/* Return the top 4 or so. We drop by one if regular uniforms are in
|
||||
* use, assuming one push buffer will be dedicated to those. We may
|
||||
* also only get 3 on Haswell if we can't write INSTPM.
|
||||
*
|
||||
* The backend may need to shrink these ranges to ensure that they
|
||||
* don't exceed the maximum push constant limits. It can simply drop
|
||||
* the tail of the list, as that's the least valuable portion. We
|
||||
* unfortunately can't truncate it here, because we don't know what
|
||||
* the backend is planning to do with regular uniforms.
|
||||
*/
|
||||
const int max_ubos = (compiler->constant_buffer_0_is_relative ? 3 : 4) -
|
||||
state.uses_regular_uniforms;
|
||||
nr_entries = MIN2(nr_entries, max_ubos);
|
||||
|
||||
for (int i = 0; i < nr_entries; i++) {
|
||||
out_ranges[i] = entries[i].range;
|
||||
}
|
||||
for (int i = nr_entries; i < 4; i++) {
|
||||
out_ranges[i].block = 0;
|
||||
out_ranges[i].start = 0;
|
||||
out_ranges[i].length = 0;
|
||||
}
|
||||
|
||||
ralloc_free(ranges.mem_ctx);
|
||||
}
|
||||
132
src/intel/compiler/elk/brw_nir_attribute_workarounds.c
Normal file
132
src/intel/compiler/elk/brw_nir_attribute_workarounds.c
Normal file
|
|
@ -0,0 +1,132 @@
|
|||
/*
|
||||
* Copyright © 2016 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "compiler/nir/nir_builder.h"
|
||||
#include "brw_nir.h"
|
||||
|
||||
/**
|
||||
* Prior to Haswell, the hardware can't natively support GL_FIXED or
|
||||
* 2_10_10_10_REV vertex formats. This pass inserts extra shader code
|
||||
* to produce the correct values.
|
||||
*/
|
||||
|
||||
/* Per-instruction callback for brw_nir_apply_attribute_workarounds.
 *
 * Matches nir_intrinsic_load_input instructions whose attribute (indexed by
 * nir_intrinsic_base) has a non-zero workaround-flags byte in cb_data, and
 * inserts fix-up code immediately after the load: GL_FIXED rescale, 2101010
 * sign recovery, BGRA swizzle, (un)signed normalization, and integer-to-float
 * scaling, in that order.  All uses of the original load (other than the
 * fix-up chain itself) are rewritten to the fixed-up value.
 *
 * Returns true iff any code was inserted.
 */
static bool
apply_attr_wa_instr(nir_builder *b, nir_instr *instr, void *cb_data)
{
   /* cb_data is the per-attribute workaround flag array passed through
    * nir_shader_instructions_pass by the public entry point.
    */
   const uint8_t *attrib_wa_flags = cb_data;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   if (intrin->intrinsic != nir_intrinsic_load_input)
      return false;

   uint8_t wa_flags = attrib_wa_flags[nir_intrinsic_base(intrin)];
   if (wa_flags == 0)
      return false;

   /* Emit the fix-up code right after the load so it can consume the
    * loaded value.
    */
   b->cursor = nir_after_instr(instr);

   nir_def *val = &intrin->def;

   /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
    * come in as floating point conversions of the integer values.
    * The low bits of wa_flags hold the number of components to rescale.
    */
   if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
      nir_def *scaled =
         nir_fmul_imm(b, val, 1.0f / 65536.0f);
      nir_def *comps[4];
      for (int i = 0; i < val->num_components; i++) {
         bool rescale = i < (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK);
         comps[i] = nir_channel(b, rescale ? scaled : val, i);
      }
      val = nir_vec(b, comps, val->num_components);
   }

   /* Do sign recovery for 2101010 formats if required. */
   if (wa_flags & BRW_ATTRIB_WA_SIGN) {
      /* sign recovery shift: <22, 22, 22, 30> (10-bit XYZ, 2-bit W) */
      nir_def *shift = nir_imm_ivec4(b, 22, 22, 22, 30);
      val = nir_ishr(b, nir_ishl(b, val, shift), shift);
   }

   /* Apply BGRA swizzle if required. */
   if (wa_flags & BRW_ATTRIB_WA_BGRA) {
      val = nir_swizzle(b, val, (unsigned[4]){2,1,0,3}, 4);
   }

   if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
      /* ES 3.0 has different rules for converting signed normalized
       * fixed-point numbers than desktop GL.
       */
      if (wa_flags & BRW_ATTRIB_WA_SIGN) {
         /* According to equation 2.2 of the ES 3.0 specification,
          * signed normalization conversion is done by:
          *
          * f = c / (2^(b-1)-1)
          *
          * OpenGL 4.2+ uses this equation as well.  Since most contexts
          * promote to the new higher version, and this is what Haswell+
          * hardware does anyway, we just always use this formula.
          */
         nir_def *es3_normalize_factor =
            nir_imm_vec4(b, 1.0f / ((1 << 9) - 1), 1.0f / ((1 << 9) - 1),
                            1.0f / ((1 << 9) - 1), 1.0f / ((1 << 1) - 1));
         val = nir_fmax(b,
                        nir_fmul(b, nir_i2f32(b, val), es3_normalize_factor),
                        nir_imm_float(b, -1.0f));
      } else {
         /* The following equation is from the OpenGL 3.2 specification:
          *
          * 2.1 unsigned normalization
          * f = c/(2^n-1)
          */
         nir_def *normalize_factor =
            nir_imm_vec4(b, 1.0f / ((1 << 10) - 1), 1.0f / ((1 << 10) - 1),
                            1.0f / ((1 << 10) - 1), 1.0f / ((1 << 2) - 1));

         val = nir_fmul(b, nir_u2f32(b, val), normalize_factor);
      }
   }

   /* Non-normalized (scaled) formats: convert the raw integer to float,
    * honoring signedness.
    */
   if (wa_flags & BRW_ATTRIB_WA_SCALE) {
      val = (wa_flags & BRW_ATTRIB_WA_SIGN) ? nir_i2f32(b, val)
                                            : nir_u2f32(b, val);
   }

   /* Redirect all uses after the fix-up chain to the corrected value;
    * the fix-up instructions themselves keep reading the raw load.
    */
   nir_def_rewrite_uses_after(&intrin->def, val,
                              val->parent_instr);

   return true;
}
|
||||
|
||||
bool
|
||||
brw_nir_apply_attribute_workarounds(nir_shader *shader,
|
||||
const uint8_t *attrib_wa_flags)
|
||||
{
|
||||
return nir_shader_instructions_pass(shader, apply_attr_wa_instr,
|
||||
nir_metadata_block_index |
|
||||
nir_metadata_dominance,
|
||||
(void *)attrib_wa_flags);
|
||||
}
|
||||
192
src/intel/compiler/elk/brw_nir_lower_alpha_to_coverage.c
Normal file
192
src/intel/compiler/elk/brw_nir_lower_alpha_to_coverage.c
Normal file
|
|
@ -0,0 +1,192 @@
|
|||
/*
|
||||
* Copyright © 2019 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "compiler/nir/nir_builder.h"
|
||||
#include "brw_nir.h"
|
||||
|
||||
/**
|
||||
* We need to compute alpha to coverage dithering manually in shader
|
||||
* and replace sample mask store with the bitwise-AND of sample mask and
|
||||
* alpha to coverage dithering.
|
||||
*
|
||||
* The following formula is used to compute final sample mask:
|
||||
* m = int(16.0 * clamp(src0_alpha, 0.0, 1.0))
|
||||
* dither_mask = 0x1111 * ((0xfea80 >> (m & ~3)) & 0xf) |
|
||||
* 0x0808 * (m & 2) | 0x0100 * (m & 1)
|
||||
* sample_mask = sample_mask & dither_mask
|
||||
*
|
||||
* It gives a number of ones proportional to the alpha for 2, 4, 8 or 16
|
||||
* least significant bits of the result:
|
||||
* 0.0000 0000000000000000
|
||||
* 0.0625 0000000100000000
|
||||
* 0.1250 0001000000010000
|
||||
* 0.1875 0001000100010000
|
||||
* 0.2500 1000100010001000
|
||||
* 0.3125 1000100110001000
|
||||
* 0.3750 1001100010011000
|
||||
* 0.4375 1001100110011000
|
||||
* 0.5000 1010101010101010
|
||||
* 0.5625 1010101110101010
|
||||
* 0.6250 1011101010111010
|
||||
* 0.6875 1011101110111010
|
||||
* 0.7500 1110111011101110
|
||||
* 0.8125 1110111111101110
|
||||
* 0.8750 1111111011111110
|
||||
* 0.9375 1111111111111110
|
||||
* 1.0000 1111111111111111
|
||||
*/
|
||||
static nir_def *
|
||||
build_dither_mask(nir_builder *b, nir_def *color)
|
||||
{
|
||||
assert(color->num_components == 4);
|
||||
nir_def *alpha = nir_channel(b, color, 3);
|
||||
|
||||
nir_def *m =
|
||||
nir_f2i32(b, nir_fmul_imm(b, nir_fsat(b, alpha), 16.0));
|
||||
|
||||
nir_def *part_a =
|
||||
nir_iand_imm(b, nir_ushr(b, nir_imm_int(b, 0xfea80),
|
||||
nir_iand_imm(b, m, ~3)),
|
||||
0xf);
|
||||
|
||||
nir_def *part_b = nir_iand_imm(b, m, 2);
|
||||
nir_def *part_c = nir_iand_imm(b, m, 1);
|
||||
|
||||
return nir_ior(b, nir_imul_imm(b, part_a, 0x1111),
|
||||
nir_ior(b, nir_imul_imm(b, part_b, 0x0808),
|
||||
nir_imul_imm(b, part_c, 0x0100)));
|
||||
}
|
||||
|
||||
/* Lower alpha-to-coverage by folding a computed dither mask into the
 * shader's gl_SampleMask write (see the dithering table in the comment
 * above build_dither_mask).
 *
 * Requires the fragment shader to write both a sample mask and a color
 * (FRAG_RESULT_COLOR or FRAG_RESULT_DATA0); otherwise the pass bails and
 * preserves all metadata.  When key->alpha_to_coverage == BRW_SOMETIMES,
 * the dithering is made dynamic via a pushed MSAA-flags uniform.
 *
 * Returns true iff the shader was modified.
 */
bool
brw_nir_lower_alpha_to_coverage(nir_shader *shader,
                                const struct brw_wm_prog_key *key,
                                const struct brw_wm_prog_data *prog_data)
{
   assert(shader->info.stage == MESA_SHADER_FRAGMENT);
   assert(key->alpha_to_coverage != BRW_NEVER);

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   /* Bail early if shader_info says the required outputs aren't written. */
   const uint64_t outputs_written = shader->info.outputs_written;
   if (!(outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) ||
       !(outputs_written & (BITFIELD64_BIT(FRAG_RESULT_COLOR) |
                            BITFIELD64_BIT(FRAG_RESULT_DATA0))))
      goto skip;

   /* Locate the (unique) sample-mask and color0 stores, and remember
    * their relative order so the mask store can be moved if needed.
    */
   nir_intrinsic_instr *sample_mask_write = NULL;
   nir_intrinsic_instr *color0_write = NULL;
   bool sample_mask_write_first = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_store_output)
            continue;

         /* We call nir_lower_io_to_temporaries to lower FS outputs to
          * temporaries with a copy at the end so this should be the last
          * block in the shader.
          */
         assert(block->cf_node.parent == &impl->cf_node);
         assert(nir_cf_node_is_last(&block->cf_node));

         /* See store_output in fs_visitor::nir_emit_fs_intrinsic */
         const unsigned store_offset = nir_src_as_uint(intrin->src[1]);
         const unsigned driver_location = nir_intrinsic_base(intrin) +
            SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);

         /* Extract the FRAG_RESULT */
         const unsigned location =
            GET_FIELD(driver_location, BRW_NIR_FRAG_OUTPUT_LOCATION);

         if (location == FRAG_RESULT_SAMPLE_MASK) {
            assert(sample_mask_write == NULL);
            sample_mask_write = intrin;
            sample_mask_write_first = (color0_write == NULL);
         }

         if (location == FRAG_RESULT_COLOR ||
             location == FRAG_RESULT_DATA0) {
            assert(color0_write == NULL);
            color0_write = intrin;
         }
      }
   }

   /* It's possible that shader_info may be out-of-date and the writes to
    * either gl_SampleMask or the first color value may have been removed.
    * This can happen if, for instance a nir_undef is written to the
    * color value.  In that case, just bail and don't do anything rather
    * than crashing.
    */
   if (color0_write == NULL || sample_mask_write == NULL)
      goto skip;

   /* It's possible that the color value isn't actually a vec4.  In this case,
    * assuming an alpha of 1.0 and letting the sample mask pass through
    * unaltered seems like the kindest thing to do to apps.
    */
   nir_def *color0 = color0_write->src[0].ssa;
   if (color0->num_components < 4)
      goto skip;

   nir_def *sample_mask = sample_mask_write->src[0].ssa;

   if (sample_mask_write_first) {
      /* If the sample mask write comes before the write to color0, we need
       * to move it because it's going to use the value from color0 to
       * compute the sample mask.
       */
      nir_instr_remove(&sample_mask_write->instr);
      nir_instr_insert(nir_after_instr(&color0_write->instr),
                       &sample_mask_write->instr);
   }

   nir_builder b = nir_builder_at(nir_before_instr(&sample_mask_write->instr));

   /* Combine dither_mask and the gl_SampleMask value */
   nir_def *dither_mask = build_dither_mask(&b, color0);
   dither_mask = nir_iand(&b, sample_mask, dither_mask);

   if (key->alpha_to_coverage == BRW_SOMETIMES) {
      /* Alpha-to-coverage may be toggled at run time; select between the
       * dithered and original mask based on the pushed MSAA flags.
       */
      nir_def *push_flags =
         nir_load_uniform(&b, 1, 32, nir_imm_int(&b, prog_data->msaa_flags_param * 4));
      nir_def *alpha_to_coverage =
         nir_test_mask(&b, push_flags, INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE);
      dither_mask = nir_bcsel(&b, alpha_to_coverage,
                              dither_mask, sample_mask_write->src[0].ssa);
   }

   nir_src_rewrite(&sample_mask_write->src[0], dither_mask);

   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);
   return true;

skip:
   nir_metadata_preserve(impl, nir_metadata_all);
   return false;
}
|
||||
818
src/intel/compiler/elk/brw_nir_lower_cooperative_matrix.c
Normal file
818
src/intel/compiler/elk/brw_nir_lower_cooperative_matrix.c
Normal file
|
|
@ -0,0 +1,818 @@
|
|||
/*
|
||||
* Copyright 2023 Intel Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
/**
|
||||
* \file brw_nir_lower_cooperative_matrix.c
|
||||
* Lower cooperative matrix to subgroup operations.
|
||||
*
|
||||
* All supported matrix types are assumed to have either 8 rows or 8
|
||||
* columns. The other dimension of the matrix is typically 8 times the number
|
||||
* of data elements that can be stored in a 32-bit dword. Matrix data is
|
||||
* indexed by a combination of an array element and a subgroup invocation ID.
|
||||
*
|
||||
* Two layouts for matrix data are used. In the first layout,
|
||||
* subgroupShuffle(slice[N], ...) accesses row N of the matrix. This will be
|
||||
* called row-major hereafter. In the other layout,
|
||||
* subgroupShuffle(slice[...], M) accesses column M of the matrix. This will
|
||||
* be called column-major hereafter. In cases where a single 32-bit value is
|
||||
* stored in each entry, these layouts are identical.
|
||||
*
|
||||
* The subtle difference arises when multiple values are packed into a single
|
||||
* 32-bit dword. If two 16-bit values are packed in a single 32-bit value in
|
||||
* column-major, subgroupShuffle(slice[0], 1) holds matrix entries m[1][1] and
|
||||
* m[2][1] (in m[row][column] notation). In row-major, that same shuffle holds
|
||||
* m[0][2] and m[0][3].
|
||||
*
|
||||
* There is an alternate way to think about the matrix layouts. Every matrix
|
||||
* size supported by the Intel driver is either Sx8 (e.g., 16x8 for float16 B
|
||||
* matrix) or Sx8T (e.g., 8x32 for int8 A matrix). The A matrix and B matrix
|
||||
* layouts are such that a single 8 dword register hold an entire row of the
|
||||
* matrix.
|
||||
*
|
||||
* Consider a matrix stored starting in register g32. In an A matrix, the
|
||||
* packed dwords of g32 contain only the data for a single row of the
|
||||
* matrix. g32 is row 0, g33 is row 1, etc. In a B matrix, the packed dwords
|
||||
* of g(32+N).X contain only the data for a single column of the
|
||||
* matrix. g[32:40].0 is column 0, g[32:40].1 is column 1, etc.
|
||||
*
|
||||
* This leads to some shenanigans in \c lower_cmat_load_store.
|
||||
*
|
||||
* In the common case, A, C, and result matrices are stored row major while B
|
||||
* matrices are stored column major. This arrangement facilitates efficient
|
||||
* dot product operations using DPAS or DP4A instructions.
|
||||
*
|
||||
* Future optimizations are possible when row and column major are
|
||||
* flipped. That is, efficient dot products are also possible when A, C, and
|
||||
* result matrices are column major while B is row major.
|
||||
*/
|
||||
|
||||
#include "brw_nir.h"
|
||||
|
||||
struct lower_cmat_state {
|
||||
nir_shader *shader;
|
||||
|
||||
struct hash_table *slice_coop_types;
|
||||
|
||||
struct hash_table *vars_to_slice;
|
||||
|
||||
unsigned subgroup_size;
|
||||
};
|
||||
|
||||
static void
|
||||
print_coop_types(struct lower_cmat_state *state)
|
||||
{
|
||||
fprintf(stderr, "--- Slices to Cooperative Matrix type table\n");
|
||||
hash_table_foreach(state->slice_coop_types, e) {
|
||||
nir_variable *var = (void *)e->key;
|
||||
const struct glsl_type *t = e->data;
|
||||
fprintf(stderr, "%p: %s -> %s\n", var, var->name, glsl_get_type_name(t));
|
||||
}
|
||||
fprintf(stderr, "\n\n");
|
||||
}
|
||||
|
||||
static const struct glsl_type *
|
||||
get_coop_type_for_slice(struct lower_cmat_state *state, nir_deref_instr *deref)
|
||||
{
|
||||
nir_variable *var = nir_deref_instr_get_variable(deref);
|
||||
struct hash_entry *entry = _mesa_hash_table_search(state->slice_coop_types, var);
|
||||
|
||||
assert(entry != NULL);
|
||||
|
||||
return entry->data;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_cmat_filter(const nir_instr *instr, const void *_state)
|
||||
{
|
||||
if (instr->type == nir_instr_type_deref) {
|
||||
nir_deref_instr *deref = nir_instr_as_deref(instr);
|
||||
return glsl_type_is_cmat(deref->type);
|
||||
}
|
||||
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
return false;
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_cmat_construct:
|
||||
case nir_intrinsic_cmat_load:
|
||||
case nir_intrinsic_cmat_store:
|
||||
case nir_intrinsic_cmat_length:
|
||||
case nir_intrinsic_cmat_muladd:
|
||||
case nir_intrinsic_cmat_unary_op:
|
||||
case nir_intrinsic_cmat_binary_op:
|
||||
case nir_intrinsic_cmat_scalar_op:
|
||||
case nir_intrinsic_cmat_bitcast:
|
||||
case nir_intrinsic_cmat_insert:
|
||||
case nir_intrinsic_cmat_extract:
|
||||
case nir_intrinsic_cmat_copy:
|
||||
return true;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get number of matrix elements packed in each component of the slice.
|
||||
*/
|
||||
static unsigned
|
||||
get_packing_factor(const struct glsl_cmat_description desc,
|
||||
const struct glsl_type *slice_type)
|
||||
{
|
||||
const struct glsl_type *slice_element_type = glsl_without_array(slice_type);
|
||||
|
||||
assert(!glsl_type_is_cmat(slice_type));
|
||||
|
||||
assert(glsl_get_bit_size(slice_element_type) >= glsl_base_type_get_bit_size(desc.element_type));
|
||||
assert(glsl_get_bit_size(slice_element_type) % glsl_base_type_get_bit_size(desc.element_type) == 0);
|
||||
|
||||
return glsl_get_bit_size(slice_element_type) / glsl_base_type_get_bit_size(desc.element_type);
|
||||
}
|
||||
|
||||
/* Derive the (vector) slice type used to store one invocation's share of a
 * cooperative matrix described by desc, for the current subgroup size.
 * Small matrix elements are packed into 32-bit components; the packing
 * factor is clamped so each row still fills a full GRF (see the file
 * comment for the A/B layout rationale).
 */
static const struct glsl_type *
get_slice_type_from_desc(const struct lower_cmat_state *state,
                         const struct glsl_cmat_description desc)
{
   enum glsl_base_type base_type;

   /* Number of matrix elements stored by each subgroup invocation.  If the
    * data is packed, the slice size will be less than this.
    */
   const unsigned elements_per_invocation =
      (desc.rows * desc.cols) / state->subgroup_size;

   assert(elements_per_invocation > 0);

   /* Start with the densest packing that fits in a 32-bit component. */
   const unsigned element_bits = 32;
   const unsigned bits = glsl_base_type_get_bit_size(desc.element_type);
   unsigned packing_factor = MIN2(elements_per_invocation,
                                  element_bits / bits);

   /* Adjust the packing factor so that each row of the matrix fills an
    * entire GRF.
    *
    * The in-register layout of B matrices is different, so those are handled
    * more like column major (for row major matrices).  See the file comment
    * for more details.
    */
   const unsigned actual_cols = desc.use != GLSL_CMAT_USE_B ? desc.cols : desc.rows;
   while ((actual_cols / packing_factor) < 8) {
      assert(packing_factor > 1);
      packing_factor /= 2;
   }

   /* Pick the slice component base type: float stays float; all other
    * element types become a packed (u)int of packing_factor * bits.
    */
   switch (desc.element_type) {
   case GLSL_TYPE_FLOAT:
      base_type = GLSL_TYPE_FLOAT;
      break;
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_FLOAT16:
   case GLSL_TYPE_UINT8:
   case GLSL_TYPE_UINT16:
      base_type = glsl_get_base_type(glsl_uintN_t_type(packing_factor * bits));
      break;
   case GLSL_TYPE_INT:
   case GLSL_TYPE_INT8:
   case GLSL_TYPE_INT16:
      base_type = glsl_get_base_type(glsl_intN_t_type(packing_factor * bits));
      break;
   default:
      unreachable("Invalid cooperative matrix element type.");
   }

   /* Number of (packed) slice vector components per invocation. */
   unsigned len = elements_per_invocation / packing_factor;

   /* Supported matrix sizes are designed to fill either 4 or 8 SIMD8
    * registers.  That means:
    *
    *                4 registers    8 registers
    *    SIMD32      len = 1        len = 2
    *    SIMD16      len = 2        len = 4
    *    SIMD8       len = 4        len = 8
    *
    * If configurations are added that result in other values of len, at the
    * very least this assertion will need to be updated.  The only value of
    * len that makes sense to add would be 16, and that would be a lot of
    * registers.
    */
   assert(len == 1 || len == 2 || len == 4 || len == 8);

   const struct glsl_type *slice_type = glsl_vector_type(base_type, len);

   /* Sanity check: the chosen slice type round-trips to the same
    * packing factor.
    */
   assert(packing_factor == get_packing_factor(desc, slice_type));

   return slice_type;
}
|
||||
|
||||
static const struct glsl_type *
|
||||
get_slice_type(const struct lower_cmat_state *state,
|
||||
const struct glsl_type *type)
|
||||
{
|
||||
if (glsl_type_is_array(type)) {
|
||||
const struct glsl_type *slice_type =
|
||||
get_slice_type(state, glsl_get_array_element(type));
|
||||
|
||||
return glsl_array_type(slice_type, glsl_array_size(type), 0);
|
||||
}
|
||||
|
||||
assert(glsl_type_is_cmat(type));
|
||||
|
||||
return get_slice_type_from_desc(state,
|
||||
*glsl_get_cmat_description(type));
|
||||
}
|
||||
|
||||
static nir_deref_instr *
|
||||
create_local_slice(struct lower_cmat_state *state, nir_builder *b,
|
||||
const struct glsl_type *mat_type, const char *name)
|
||||
{
|
||||
const struct glsl_type *slice_type = get_slice_type(state, mat_type);
|
||||
nir_variable *slice_var = nir_local_variable_create(b->impl, slice_type, name);
|
||||
_mesa_hash_table_insert(state->slice_coop_types, slice_var, (void *)mat_type);
|
||||
return nir_build_deref_var(b, slice_var);
|
||||
}
|
||||
|
||||
static void
|
||||
lower_cmat_load_store(nir_builder *b, nir_intrinsic_instr *intrin,
|
||||
struct lower_cmat_state *state)
|
||||
{
|
||||
const bool load = intrin->intrinsic == nir_intrinsic_cmat_load;
|
||||
const unsigned mat_src = load ? 0 : 1;
|
||||
const unsigned ptr_src = load ? 1 : 0;
|
||||
|
||||
nir_deref_instr *slice = nir_src_as_deref(intrin->src[mat_src]);
|
||||
const struct glsl_type *mat_type = get_coop_type_for_slice(state, slice);
|
||||
const struct glsl_cmat_description *desc = glsl_get_cmat_description(mat_type);
|
||||
|
||||
nir_def *results[NIR_MAX_VEC_COMPONENTS];
|
||||
const unsigned num_components = glsl_get_vector_elements(slice->type);
|
||||
const unsigned packing_factor = get_packing_factor(*desc, slice->type);
|
||||
|
||||
nir_deref_instr *pointer = nir_src_as_deref(intrin->src[ptr_src]);
|
||||
|
||||
if ((nir_intrinsic_matrix_layout(intrin) == GLSL_MATRIX_LAYOUT_ROW_MAJOR) ==
|
||||
(desc->use != GLSL_CMAT_USE_B)) {
|
||||
nir_def *stride = nir_udiv_imm(b, intrin->src[2].ssa, packing_factor);
|
||||
|
||||
const struct glsl_type *element_type =
|
||||
glsl_scalar_type(glsl_get_base_type(slice->type));
|
||||
|
||||
pointer = nir_build_deref_cast(b, &pointer->def, pointer->modes,
|
||||
element_type,
|
||||
glsl_get_bit_size(element_type) / 8);
|
||||
|
||||
nir_def *invocation = nir_load_subgroup_invocation(b);
|
||||
nir_def *base_offset;
|
||||
nir_def *step;
|
||||
|
||||
if (desc->use != GLSL_CMAT_USE_B) {
|
||||
base_offset = nir_iadd(b,
|
||||
nir_imul(b,
|
||||
nir_udiv_imm(b, invocation, 8),
|
||||
stride),
|
||||
nir_umod_imm(b, invocation, 8));
|
||||
|
||||
step = nir_imul_imm(b, stride, state->subgroup_size / 8);
|
||||
} else {
|
||||
base_offset = nir_iadd(b,
|
||||
nir_imul(b,
|
||||
nir_umod_imm(b, invocation, 8),
|
||||
stride),
|
||||
nir_udiv_imm(b, invocation, 8));
|
||||
|
||||
step = nir_imm_int(b, state->subgroup_size / 8);
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < num_components; i++) {
|
||||
nir_def *offset = nir_imul_imm(b, step, i);
|
||||
|
||||
nir_deref_instr *memory_deref =
|
||||
nir_build_deref_ptr_as_array(b, pointer,
|
||||
nir_i2iN(b,
|
||||
nir_iadd(b,
|
||||
base_offset,
|
||||
offset),
|
||||
pointer->def.bit_size));
|
||||
|
||||
if (load) {
|
||||
results[i] = nir_load_deref(b, memory_deref);
|
||||
} else {
|
||||
nir_def *src = nir_channel(b, nir_load_deref(b, slice), i);
|
||||
nir_store_deref(b, memory_deref, src, 0x1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
nir_def *stride = intrin->src[2].ssa;
|
||||
|
||||
const struct glsl_type *element_type = glsl_scalar_type(desc->element_type);
|
||||
const unsigned element_bits = glsl_base_type_get_bit_size(desc->element_type);
|
||||
const unsigned element_stride = element_bits / 8;
|
||||
|
||||
pointer = nir_build_deref_cast(b, &pointer->def, pointer->modes, element_type,
|
||||
element_stride);
|
||||
|
||||
nir_def *invocation_div_8 = nir_udiv_imm(b, nir_load_subgroup_invocation(b), 8);
|
||||
nir_def *invocation_mod_8 = nir_umod_imm(b, nir_load_subgroup_invocation(b), 8);
|
||||
|
||||
nir_def *packed_stride = nir_imul_imm(b, stride, packing_factor);
|
||||
|
||||
for (unsigned i = 0; i < num_components; i++) {
|
||||
const unsigned i_offset = i * (state->subgroup_size / 8);
|
||||
nir_def *v[4];
|
||||
|
||||
for (unsigned j = 0; j < packing_factor; j++) {
|
||||
nir_def *j_offset = nir_imul_imm(b, stride, j);
|
||||
nir_def *offset;
|
||||
|
||||
if (desc->use != GLSL_CMAT_USE_B) {
|
||||
offset = nir_iadd(b,
|
||||
nir_iadd(b,
|
||||
nir_imul(b,
|
||||
invocation_mod_8,
|
||||
packed_stride),
|
||||
invocation_div_8),
|
||||
nir_iadd_imm(b, j_offset, i_offset));
|
||||
} else {
|
||||
offset = nir_iadd(b,
|
||||
nir_iadd(b,
|
||||
nir_imul(b,
|
||||
invocation_div_8,
|
||||
packed_stride),
|
||||
invocation_mod_8),
|
||||
nir_iadd(b,
|
||||
nir_imul_imm(b,
|
||||
packed_stride,
|
||||
i_offset),
|
||||
j_offset));
|
||||
}
|
||||
|
||||
nir_deref_instr *memory_deref =
|
||||
nir_build_deref_ptr_as_array(b, pointer,
|
||||
nir_i2iN(b,
|
||||
offset,
|
||||
pointer->def.bit_size));
|
||||
|
||||
if (load) {
|
||||
v[j] = nir_load_deref(b, memory_deref);
|
||||
} else {
|
||||
nir_def *src = nir_channel(b, nir_load_deref(b, slice), i);
|
||||
|
||||
nir_def *v =
|
||||
nir_channel(b, nir_unpack_bits(b, src, element_bits), j);
|
||||
|
||||
nir_store_deref(b, memory_deref, v, 0x1);
|
||||
}
|
||||
}
|
||||
|
||||
if (load) {
|
||||
results[i] = nir_pack_bits(b, nir_vec(b, v, packing_factor),
|
||||
packing_factor * element_bits);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (load)
|
||||
nir_store_deref(b, slice, nir_vec(b, results, num_components),
|
||||
nir_component_mask(num_components));
|
||||
}
|
||||
|
||||
/* Lower nir_intrinsic_cmat_unary_op: apply an ALU op element-wise from a
 * source slice to a destination slice, handling the case where the two
 * slices pack a different number of matrix elements per 32-bit component
 * (e.g. float32 -> packed float16 conversions).
 */
static void
lower_cmat_unary_op(nir_builder *b, nir_intrinsic_instr *intrin,
                    struct lower_cmat_state *state)
{
   nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
   nir_deref_instr *src_slice = nir_src_as_deref(intrin->src[1]);
   nir_def *results[NIR_MAX_VEC_COMPONENTS];
   const unsigned num_components = glsl_get_vector_elements(dst_slice->type);

   const struct glsl_type *dst_mat_type =
      get_coop_type_for_slice(state, dst_slice);
   const struct glsl_type *src_mat_type =
      get_coop_type_for_slice(state, src_slice);

   const struct glsl_cmat_description dst_desc =
      *glsl_get_cmat_description(dst_mat_type);

   const struct glsl_cmat_description src_desc =
      *glsl_get_cmat_description(src_mat_type);

   const unsigned dst_bits = glsl_base_type_bit_size(dst_desc.element_type);
   const unsigned src_bits = glsl_base_type_bit_size(src_desc.element_type);

   /* The type of the returned slice may be different from the type of the
    * input slice.
    */
   const unsigned dst_packing_factor =
      get_packing_factor(dst_desc, dst_slice->type);

   const unsigned src_packing_factor =
      get_packing_factor(src_desc, src_slice->type);

   const nir_op op = nir_intrinsic_alu_op(intrin);

   /* There are three possible cases:
    *
    * 1. dst_packing_factor == src_packing_factor. This is the common case,
    *    and handling it is straightforward.
    *
    * 2. dst_packing_factor > src_packing_factor. This occurs when converting
    *    a float32_t matrix slice to a packed float16_t slice. Loop over the
    *    size of the destination slice, but read multiple entries from the
    *    source slice on each iteration.
    *
    * 3. dst_packing_factor < src_packing_factor. This occurs when converting
    *    a packed int8_t matrix slice to an int32_t slice. Loop over the size
    *    of the source slice, but write multiple entries to the destination
    *    slice on each iteration.
    *
    * Handle all cases by iterating over the total (non-packed) number of
    * elements in the slice. When dst_packing_factor values have been
    * calculated, store them.
    */
   assert((dst_packing_factor * glsl_get_vector_elements(dst_slice->type)) ==
          (src_packing_factor * glsl_get_vector_elements(src_slice->type)));

   /* Stores at most dst_packing_factor partial results. */
   nir_def *v[4];
   assert(dst_packing_factor <= 4);

   for (unsigned i = 0; i < num_components * dst_packing_factor; i++) {
      /* i walks the flat (unpacked) element index; derive the packed
       * component index and the channel within it for both sides.
       */
      const unsigned dst_chan_index = i % dst_packing_factor;
      const unsigned src_chan_index = i % src_packing_factor;
      const unsigned dst_index = i / dst_packing_factor;
      const unsigned src_index = i / src_packing_factor;

      /* Unpack the source component and select the element to convert. */
      nir_def *src =
         nir_channel(b,
                     nir_unpack_bits(b,
                                     nir_channel(b,
                                                 nir_load_deref(b, src_slice),
                                                 src_index),
                                     src_bits),
                     src_chan_index);

      v[dst_chan_index] = nir_build_alu1(b, op, src);

      /* Once a full destination component's worth of elements has been
       * produced, pack and emit it.
       */
      if (dst_chan_index == (dst_packing_factor - 1)) {
         results[dst_index] =
            nir_pack_bits(b, nir_vec(b, v, dst_packing_factor),
                          dst_packing_factor * dst_bits);
      }
   }

   nir_store_deref(b, dst_slice, nir_vec(b, results, num_components),
                   nir_component_mask(num_components));
}
|
||||
|
||||
static void
|
||||
lower_cmat_binary_op(nir_builder *b, nir_intrinsic_instr *intrin,
|
||||
struct lower_cmat_state *state)
|
||||
{
|
||||
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
|
||||
nir_deref_instr *src_a_slice = nir_src_as_deref(intrin->src[1]);
|
||||
nir_deref_instr *src_b_slice = nir_src_as_deref(intrin->src[2]);
|
||||
|
||||
nir_def *src_a = nir_load_deref(b, src_a_slice);
|
||||
nir_def *src_b = nir_load_deref(b, src_b_slice);
|
||||
nir_def *results[NIR_MAX_VEC_COMPONENTS];
|
||||
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
|
||||
|
||||
const struct glsl_type *dst_mat_type = get_coop_type_for_slice(state, dst_slice);
|
||||
ASSERTED const struct glsl_type *src_a_mat_type = get_coop_type_for_slice(state, src_a_slice);
|
||||
ASSERTED const struct glsl_type *src_b_mat_type = get_coop_type_for_slice(state, src_b_slice);
|
||||
|
||||
const struct glsl_cmat_description desc =
|
||||
*glsl_get_cmat_description(dst_mat_type);
|
||||
|
||||
assert(dst_mat_type == src_a_mat_type);
|
||||
assert(dst_mat_type == src_b_mat_type);
|
||||
|
||||
const unsigned bits = glsl_base_type_bit_size(desc.element_type);
|
||||
const unsigned packing_factor = get_packing_factor(desc, dst_slice->type);
|
||||
|
||||
for (unsigned i = 0; i < num_components; i++) {
|
||||
nir_def *val_a = nir_channel(b, src_a, i);
|
||||
nir_def *val_b = nir_channel(b, src_b, i);
|
||||
|
||||
results[i] =
|
||||
nir_pack_bits(b, nir_build_alu2(b, nir_intrinsic_alu_op(intrin),
|
||||
nir_unpack_bits(b, val_a, bits),
|
||||
nir_unpack_bits(b, val_b, bits)),
|
||||
packing_factor * bits);
|
||||
}
|
||||
|
||||
nir_store_deref(b, dst_slice, nir_vec(b, results, num_components),
|
||||
nir_component_mask(num_components));
|
||||
}
|
||||
|
||||
/* Lower nir_intrinsic_cmat_scalar_op: combine every element of a
 * cooperative-matrix slice with a single scalar value using the ALU op
 * stored on the intrinsic, writing the result into the destination slice.
 *
 * src[0] = destination slice deref, src[1] = source slice deref,
 * src[2] = the scalar operand (already at element bit size).
 */
static void
lower_cmat_scalar_op(nir_builder *b, nir_intrinsic_instr *intrin,
                     struct lower_cmat_state *state)
{
   nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
   nir_deref_instr *src_slice = nir_src_as_deref(intrin->src[1]);
   nir_def *scalar = intrin->src[2].ssa;

   nir_def *src = nir_load_deref(b, src_slice);
   nir_def *results[NIR_MAX_VEC_COMPONENTS];
   const unsigned num_components = glsl_get_vector_elements(dst_slice->type);

   /* Source and destination must be slices of the same matrix type; the
    * variables are only consumed by the assert in debug builds.
    */
   ASSERTED const struct glsl_type *dst_mat_type = get_coop_type_for_slice(state, dst_slice);
   ASSERTED const struct glsl_type *src_mat_type = get_coop_type_for_slice(state, src_slice);
   assert(dst_mat_type == src_mat_type);

   const struct glsl_cmat_description desc =
      *glsl_get_cmat_description(dst_mat_type);

   const unsigned bits = glsl_base_type_bit_size(desc.element_type);
   const unsigned packing_factor = get_packing_factor(desc, dst_slice->type);

   /* Unpack each slice component to element granularity, apply the op with
    * the scalar broadcast across all unpacked elements, then pack back to
    * the slice's storage width.
    */
   for (unsigned i = 0; i < num_components; i++) {
      nir_def *val = nir_channel(b, src, i);

      results[i] =
         nir_pack_bits(b, nir_build_alu2(b, nir_intrinsic_alu_op(intrin),
                                         nir_unpack_bits(b, val, bits),
                                         scalar),
                       packing_factor * bits);
   }

   nir_store_deref(b, dst_slice, nir_vec(b, results, num_components),
                   nir_component_mask(num_components));
}
|
||||
|
||||
static nir_deref_instr *
|
||||
lower_cmat_deref(nir_builder *b, nir_deref_instr *deref,
|
||||
struct lower_cmat_state *state)
|
||||
{
|
||||
nir_deref_instr *parent = nir_deref_instr_parent(deref);
|
||||
if (parent) {
|
||||
assert(deref->deref_type == nir_deref_type_array);
|
||||
parent = lower_cmat_deref(b, parent, state);
|
||||
return nir_build_deref_array(b, parent, deref->arr.index.ssa);
|
||||
} else {
|
||||
assert(deref->deref_type == nir_deref_type_var);
|
||||
assert(deref->var);
|
||||
assert(glsl_type_is_cmat(glsl_without_array(deref->var->type)));
|
||||
|
||||
struct hash_entry *entry = _mesa_hash_table_search(state->vars_to_slice, deref->var);
|
||||
assert(entry);
|
||||
return nir_build_deref_var(b, (nir_variable *)entry->data);
|
||||
}
|
||||
}
|
||||
|
||||
static nir_def *
|
||||
lower_cmat_instr(nir_builder *b, nir_instr *instr, void *_state)
|
||||
{
|
||||
struct lower_cmat_state *state = _state;
|
||||
|
||||
if (instr->type == nir_instr_type_deref) {
|
||||
nir_deref_instr *deref = lower_cmat_deref(b, nir_instr_as_deref(instr), state);
|
||||
return &deref->def;
|
||||
}
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_cmat_load:
|
||||
case nir_intrinsic_cmat_store:
|
||||
lower_cmat_load_store(b, intrin, state);
|
||||
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
|
||||
|
||||
case nir_intrinsic_cmat_construct: {
|
||||
nir_deref_instr *slice = nir_src_as_deref(intrin->src[0]);
|
||||
nir_def *src = intrin->src[1].ssa;
|
||||
|
||||
const struct glsl_type *mat_type = get_coop_type_for_slice(state, slice);
|
||||
const struct glsl_cmat_description desc =
|
||||
*glsl_get_cmat_description(mat_type);
|
||||
const unsigned packing_factor = get_packing_factor(desc, slice->type);
|
||||
|
||||
if (packing_factor > 1) {
|
||||
src = nir_pack_bits(b, nir_replicate(b, src, packing_factor),
|
||||
packing_factor * glsl_base_type_get_bit_size(desc.element_type));
|
||||
}
|
||||
|
||||
const unsigned num_components = glsl_get_vector_elements(slice->type);
|
||||
|
||||
nir_store_deref(b, slice, nir_replicate(b, src, num_components),
|
||||
nir_component_mask(num_components));
|
||||
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
|
||||
}
|
||||
|
||||
case nir_intrinsic_cmat_unary_op:
|
||||
lower_cmat_unary_op(b, intrin, state);
|
||||
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
|
||||
|
||||
case nir_intrinsic_cmat_binary_op:
|
||||
lower_cmat_binary_op(b, intrin, state);
|
||||
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
|
||||
|
||||
case nir_intrinsic_cmat_scalar_op:
|
||||
lower_cmat_scalar_op(b, intrin, state);
|
||||
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
|
||||
|
||||
case nir_intrinsic_cmat_length: {
|
||||
const struct glsl_cmat_description desc = nir_intrinsic_cmat_desc(intrin);
|
||||
const struct glsl_type *mat_type = glsl_cmat_type(&desc);
|
||||
const struct glsl_type *slice_type = get_slice_type(state, mat_type);
|
||||
return nir_imm_intN_t(b, (get_packing_factor(desc, slice_type) *
|
||||
glsl_get_vector_elements(slice_type)), 32);
|
||||
}
|
||||
|
||||
case nir_intrinsic_cmat_muladd: {
|
||||
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
|
||||
nir_deref_instr *A_slice = nir_src_as_deref(intrin->src[1]);
|
||||
nir_deref_instr *B_slice = nir_src_as_deref(intrin->src[2]);
|
||||
nir_deref_instr *accum_slice = nir_src_as_deref(intrin->src[3]);
|
||||
|
||||
const struct glsl_type *dst_mat_type = get_coop_type_for_slice(state, dst_slice);
|
||||
const struct glsl_cmat_description dst_desc = *glsl_get_cmat_description(dst_mat_type);
|
||||
|
||||
const struct glsl_type *src_mat_type = get_coop_type_for_slice(state, A_slice);
|
||||
const struct glsl_cmat_description src_desc = *glsl_get_cmat_description(src_mat_type);
|
||||
|
||||
const unsigned packing_factor = get_packing_factor(dst_desc, dst_slice->type);
|
||||
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
|
||||
|
||||
nir_def *result =
|
||||
nir_dpas_intel(b,
|
||||
packing_factor * glsl_base_type_get_bit_size(dst_desc.element_type),
|
||||
nir_load_deref(b, A_slice),
|
||||
nir_load_deref(b, B_slice),
|
||||
nir_load_deref(b, accum_slice),
|
||||
.dest_type = nir_get_nir_type_for_glsl_base_type(dst_desc.element_type),
|
||||
.src_type = nir_get_nir_type_for_glsl_base_type(src_desc.element_type),
|
||||
.saturate = nir_intrinsic_saturate(intrin),
|
||||
.cmat_signed_mask = nir_intrinsic_cmat_signed_mask(intrin),
|
||||
.systolic_depth = 8,
|
||||
.repeat_count = 8);
|
||||
|
||||
nir_store_deref(b, dst_slice, result,
|
||||
nir_component_mask(num_components));
|
||||
|
||||
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
|
||||
}
|
||||
|
||||
case nir_intrinsic_cmat_bitcast: {
|
||||
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
|
||||
nir_deref_instr *src_slice = nir_src_as_deref(intrin->src[1]);
|
||||
|
||||
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
|
||||
|
||||
assert(glsl_get_vector_elements(src_slice->type) == num_components);
|
||||
|
||||
nir_store_deref(b, dst_slice, nir_load_deref(b, src_slice),
|
||||
nir_component_mask(num_components));
|
||||
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
|
||||
}
|
||||
|
||||
case nir_intrinsic_cmat_copy:
|
||||
nir_copy_deref(b,
|
||||
nir_src_as_deref(intrin->src[0]),
|
||||
nir_src_as_deref(intrin->src[1]));
|
||||
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
|
||||
|
||||
case nir_intrinsic_cmat_insert: {
|
||||
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
|
||||
nir_def *scalar = intrin->src[1].ssa;
|
||||
nir_deref_instr *src_slice = nir_src_as_deref(intrin->src[2]);
|
||||
const nir_src dst_index = intrin->src[3];
|
||||
|
||||
const struct glsl_type *dst_mat_type = get_coop_type_for_slice(state, dst_slice);
|
||||
ASSERTED const struct glsl_type *src_mat_type = get_coop_type_for_slice(state, src_slice);
|
||||
assert(dst_mat_type == src_mat_type);
|
||||
|
||||
const struct glsl_cmat_description desc =
|
||||
*glsl_get_cmat_description(dst_mat_type);
|
||||
|
||||
const unsigned bits = glsl_base_type_bit_size(desc.element_type);
|
||||
const unsigned packing_factor = get_packing_factor(desc, dst_slice->type);
|
||||
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
|
||||
|
||||
nir_def *slice_index = nir_udiv_imm(b, dst_index.ssa, packing_factor);
|
||||
nir_def *vector_index = nir_umod_imm(b, dst_index.ssa, packing_factor);
|
||||
nir_def *results[NIR_MAX_VEC_COMPONENTS];
|
||||
|
||||
const int slice_constant_index = nir_src_is_const(dst_index)
|
||||
? nir_src_as_uint(dst_index) / packing_factor
|
||||
: -1;
|
||||
|
||||
for (unsigned i = 0; i < num_components; i++) {
|
||||
nir_def *val = nir_channel(b, nir_load_deref(b, src_slice), i);
|
||||
nir_def *insert;
|
||||
|
||||
if (slice_constant_index < 0 || slice_constant_index == i) {
|
||||
if (packing_factor == 1) {
|
||||
insert = scalar;
|
||||
} else {
|
||||
nir_def *unpacked = nir_unpack_bits(b, val, bits);
|
||||
nir_def *v = nir_vector_insert(b, unpacked, scalar, vector_index);
|
||||
|
||||
insert = nir_pack_bits(b, v, bits * packing_factor);
|
||||
}
|
||||
} else {
|
||||
insert = val;
|
||||
}
|
||||
|
||||
results[i] = slice_constant_index < 0
|
||||
? nir_bcsel(b, nir_ieq_imm(b, slice_index, i), insert, val)
|
||||
: insert;
|
||||
}
|
||||
|
||||
nir_store_deref(b, dst_slice, nir_vec(b, results, num_components),
|
||||
nir_component_mask(num_components));
|
||||
|
||||
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
|
||||
}
|
||||
|
||||
case nir_intrinsic_cmat_extract: {
|
||||
nir_deref_instr *slice = nir_src_as_deref(intrin->src[0]);
|
||||
const struct glsl_type *mat_type = get_coop_type_for_slice(state, slice);
|
||||
nir_def *index = intrin->src[1].ssa;
|
||||
|
||||
const struct glsl_cmat_description desc =
|
||||
*glsl_get_cmat_description(mat_type);
|
||||
|
||||
const unsigned bits = glsl_base_type_bit_size(desc.element_type);
|
||||
const unsigned packing_factor = get_packing_factor(desc, slice->type);
|
||||
|
||||
nir_def *src =
|
||||
nir_vector_extract(b, nir_load_deref(b, slice),
|
||||
nir_udiv_imm(b, index, packing_factor));
|
||||
|
||||
if (packing_factor == 1) {
|
||||
return src;
|
||||
} else {
|
||||
return nir_vector_extract(b,
|
||||
nir_unpack_bits(b, src, bits),
|
||||
nir_umod_imm(b, index, packing_factor));
|
||||
}
|
||||
|
||||
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
|
||||
}
|
||||
|
||||
default:
|
||||
unreachable("invalid cooperative matrix intrinsic");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
create_slice_var(struct lower_cmat_state *state, nir_variable *var,
|
||||
nir_function_impl *impl)
|
||||
{
|
||||
// TODO: without array
|
||||
const struct glsl_type *mat_type = glsl_without_array(var->type);
|
||||
|
||||
assert(glsl_type_is_cmat(mat_type));
|
||||
assert((!impl && var->data.mode == nir_var_shader_temp) ||
|
||||
( impl && var->data.mode == nir_var_function_temp));
|
||||
|
||||
const struct glsl_type *slice_type = get_slice_type(state, var->type);
|
||||
const char *slice_name = ralloc_asprintf(state->shader, "%s_slice", var->name);
|
||||
nir_variable *slice_var = impl ?
|
||||
nir_local_variable_create(impl, slice_type, slice_name) :
|
||||
nir_variable_create(state->shader, var->data.mode, slice_type, slice_name);
|
||||
|
||||
_mesa_hash_table_insert(state->vars_to_slice, var, slice_var);
|
||||
_mesa_hash_table_insert(state->slice_coop_types, slice_var, (void *)mat_type);
|
||||
}
|
||||
|
||||
/* Entry point: lower all cooperative-matrix variables and intrinsics in
 * `shader` to operations on flat per-invocation vector "slices".
 *
 * First pass: create a slice variable for every cmat variable (shader
 * temps and function temps) and build the var -> slice map used by the
 * instruction lowering.  Second pass: rewrite every cmat instruction via
 * lower_cmat_filter / lower_cmat_instr.
 *
 * Returns true if any instruction was lowered.
 */
bool
brw_nir_lower_cmat(nir_shader *shader, unsigned subgroup_size)
{
   /* Temporary context owns both hash tables; freed before returning. */
   void *temp_ctx = ralloc_context(NULL);

   struct lower_cmat_state state = {
      .shader = shader,
      .slice_coop_types = _mesa_pointer_hash_table_create(temp_ctx),
      .vars_to_slice = _mesa_pointer_hash_table_create(temp_ctx),
      .subgroup_size = subgroup_size,
   };

   /* Create a slice array for each variable and add a map from the original
    * variable back to it, so it can be reached during lowering.
    *
    * TODO: Cooperative matrix inside struct?
    */
   nir_foreach_variable_in_shader(var, shader) {
      if (glsl_type_is_cmat(glsl_without_array(var->type)))
         create_slice_var(&state, var, NULL);
   }
   nir_foreach_function(func, shader) {
      nir_foreach_function_temp_variable(var, func->impl) {
         if (glsl_type_is_cmat(glsl_without_array(var->type)))
            create_slice_var(&state, var, func->impl);
      }
   }

   bool progress = nir_shader_lower_instructions(shader,
                                                 lower_cmat_filter,
                                                 lower_cmat_instr,
                                                 &state);

   ralloc_free(temp_ctx);

   return progress;
}
|
||||
362
src/intel/compiler/elk/brw_nir_lower_cs_intrinsics.c
Normal file
362
src/intel/compiler/elk/brw_nir_lower_cs_intrinsics.c
Normal file
|
|
@ -0,0 +1,362 @@
|
|||
/*
|
||||
* Copyright (c) 2016 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_nir.h"
|
||||
#include "compiler/nir/nir_builder.h"
|
||||
|
||||
/* Shared state for the compute-intrinsics lowering pass. */
struct lower_intrinsics_state {
   nir_shader *nir;              /* shader being lowered */
   nir_function_impl *impl;      /* impl currently being processed */
   bool progress;                /* set when any intrinsic was rewritten */
   /* When true, the hardware provides gl_LocalInvocationID directly, so
    * load_local_invocation_id is left alone and only the index is derived
    * from it.
    */
   bool hw_generated_local_id;
   nir_builder builder;          /* builder rebuilt per impl */
};
|
||||
|
||||
/* Compute gl_LocalInvocationIndex and gl_LocalInvocationID from the
 * subgroup id and subgroup invocation, choosing an invocation layout based
 * on the shader's derivative-group mode and (heuristically) its resource
 * usage.  On return *local_index and *local_id hold the computed defs.
 *
 * NOTE(review): *local_index is tested before being written, so callers
 * must pass a pointer to NULL (or an already-computed value) on entry.
 */
static void
compute_local_index_id(nir_builder *b,
                       nir_shader *nir,
                       nir_def **local_index,
                       nir_def **local_id)
{
   nir_def *subgroup_id = nir_load_subgroup_id(b);

   /* Linear position of this invocation within the workgroup:
    * subgroup_id * simd_width + subgroup_invocation.
    */
   nir_def *thread_local_id =
      nir_imul(b, subgroup_id, nir_load_simd_width_intel(b));
   nir_def *channel = nir_load_subgroup_invocation(b);
   nir_def *linear = nir_iadd(b, channel, thread_local_id);

   nir_def *size_x;
   nir_def *size_y;
   if (nir->info.workgroup_size_variable) {
      nir_def *size_xyz = nir_load_workgroup_size(b);
      size_x = nir_channel(b, size_xyz, 0);
      size_y = nir_channel(b, size_xyz, 1);
   } else {
      size_x = nir_imm_int(b, nir->info.workgroup_size[0]);
      size_y = nir_imm_int(b, nir->info.workgroup_size[1]);
   }
   nir_def *size_xy = nir_imul(b, size_x, size_y);

   /* The local invocation index and ID must respect the following
    *
    *    gl_LocalInvocationID.x =
    *       gl_LocalInvocationIndex % gl_WorkGroupSize.x;
    *    gl_LocalInvocationID.y =
    *       (gl_LocalInvocationIndex / gl_WorkGroupSize.x) %
    *       gl_WorkGroupSize.y;
    *    gl_LocalInvocationID.z =
    *       (gl_LocalInvocationIndex /
    *        (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
    *       gl_WorkGroupSize.z;
    *
    * However, the final % gl_WorkGroupSize.z does nothing unless we
    * accidentally end up with a gl_LocalInvocationIndex that is too
    * large so it can safely be omitted.
    */

   nir_def *id_x, *id_y, *id_z;
   switch (nir->info.cs.derivative_group) {
   case DERIVATIVE_GROUP_NONE:
      /* No derivative constraints: pick a layout based on whether the
       * shader looks buffer-heavy (linear) or image-heavy (tiled).
       */
      if (nir->info.num_images == 0 &&
          nir->info.num_textures == 0) {
         /* X-major lid order. Optimal for linear accesses only,
          * which are usually buffers. X,Y ordering will look like:
          * (0,0) (1,0) (2,0) ... (size_x-1,0) (0,1) (1,1) ...
          */
         id_x = nir_umod(b, linear, size_x);
         id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
         *local_index = linear;
      } else if (!nir->info.workgroup_size_variable &&
                 nir->info.workgroup_size[1] % 4 == 0) {
         /* 1x4 block X-major lid order. Same as X-major except increments in
          * blocks of width=1 height=4. Always optimal for tileY and usually
          * optimal for linear accesses.
          *   x = (linear / 4) % size_x
          *   y = ((linear % 4) + (linear / 4 / size_x) * 4) % size_y
          * X,Y ordering will look like: (0,0) (0,1) (0,2) (0,3) (1,0) (1,1)
          * (1,2) (1,3) (2,0) ... (size_x-1,3) (0,4) (0,5) (0,6) (0,7) (1,4) ...
          */
         const unsigned height = 4;
         nir_def *block = nir_udiv_imm(b, linear, height);
         id_x = nir_umod(b, block, size_x);
         id_y = nir_umod(b,
                         nir_iadd(b,
                                  nir_umod_imm(b, linear, height),
                                  nir_imul_imm(b,
                                               nir_udiv(b, block, size_x),
                                               height)),
                         size_y);
      } else {
         /* Y-major lid order. Optimal for tileY accesses only,
          * which are usually images. X,Y ordering will look like:
          * (0,0) (0,1) (0,2) ... (0,size_y-1) (1,0) (1,1) ...
          */
         id_y = nir_umod(b, linear, size_y);
         id_x = nir_umod(b, nir_udiv(b, linear, size_y), size_x);
      }

      id_z = nir_udiv(b, linear, size_xy);
      *local_id = nir_vec3(b, id_x, id_y, id_z);
      /* Only the X-major path produced the index directly; for the other
       * layouts derive it from the ID so the invariant above holds.
       */
      if (!*local_index) {
         *local_index = nir_iadd(b, nir_iadd(b, id_x,
                                             nir_imul(b, id_y, size_x)),
                                 nir_imul(b, id_z, size_xy));
      }
      break;
   case DERIVATIVE_GROUP_LINEAR:
      /* For linear, just set the local invocation index linearly,
       * and calculate local invocation ID from that.
       */
      id_x = nir_umod(b, linear, size_x);
      id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
      id_z = nir_udiv(b, linear, size_xy);
      *local_id = nir_vec3(b, id_x, id_y, id_z);
      *local_index = linear;
      break;
   case DERIVATIVE_GROUP_QUADS: {
      /* For quads, first we figure out the 2x2 grid the invocation
       * belongs to -- treating extra Z layers as just more rows.
       * Then map that into local invocation ID (trivial) and local
       * invocation index.  Skipping Z simplify index calculation.
       */

      nir_def *one = nir_imm_int(b, 1);
      nir_def *double_size_x = nir_ishl(b, size_x, one);

      /* ID within a pair of rows, where each group of 4 is 2x2 quad. */
      nir_def *row_pair_id = nir_umod(b, linear, double_size_x);
      nir_def *y_row_pairs = nir_udiv(b, linear, double_size_x);

      /* Bit 0 of row_pair_id selects x parity within the quad; higher bits
       * (shifted down, low bit masked off) select the quad column.
       */
      nir_def *x =
         nir_ior(b,
                 nir_iand(b, row_pair_id, one),
                 nir_iand(b, nir_ishr(b, row_pair_id, one),
                          nir_imm_int(b, 0xfffffffe)));
      nir_def *y =
         nir_ior(b,
                 nir_ishl(b, y_row_pairs, one),
                 nir_iand(b, nir_ishr(b, row_pair_id, one), one));

      *local_id = nir_vec3(b, x,
                           nir_umod(b, y, size_y),
                           nir_udiv(b, y, size_y));
      *local_index = nir_iadd(b, x, nir_imul(b, y, size_x));
      break;
   }
   default:
      unreachable("invalid derivative group");
   }
}
|
||||
|
||||
static bool
|
||||
lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
|
||||
nir_block *block)
|
||||
{
|
||||
bool progress = false;
|
||||
nir_builder *b = &state->builder;
|
||||
nir_shader *nir = state->nir;
|
||||
|
||||
/* Reuse calculated values inside the block. */
|
||||
nir_def *local_index = NULL;
|
||||
nir_def *local_id = NULL;
|
||||
|
||||
nir_foreach_instr_safe(instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
|
||||
nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
|
||||
|
||||
b->cursor = nir_after_instr(&intrinsic->instr);
|
||||
|
||||
nir_def *sysval;
|
||||
switch (intrinsic->intrinsic) {
|
||||
case nir_intrinsic_load_local_invocation_id:
|
||||
if (state->hw_generated_local_id)
|
||||
continue;
|
||||
|
||||
FALLTHROUGH;
|
||||
case nir_intrinsic_load_local_invocation_index: {
|
||||
if (!local_index && !nir->info.workgroup_size_variable) {
|
||||
const uint16_t *ws = nir->info.workgroup_size;
|
||||
if (ws[0] * ws[1] * ws[2] == 1) {
|
||||
nir_def *zero = nir_imm_int(b, 0);
|
||||
local_index = zero;
|
||||
local_id = nir_replicate(b, zero, 3);
|
||||
}
|
||||
}
|
||||
|
||||
if (!local_index) {
|
||||
if (nir->info.stage == MESA_SHADER_TASK ||
|
||||
nir->info.stage == MESA_SHADER_MESH) {
|
||||
/* Will be lowered by nir_emit_task_mesh_intrinsic() using
|
||||
* information from the payload.
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
if (state->hw_generated_local_id) {
|
||||
nir_def *local_id_vec = nir_load_local_invocation_id(b);
|
||||
nir_def *local_id[3] = { nir_channel(b, local_id_vec, 0),
|
||||
nir_channel(b, local_id_vec, 1),
|
||||
nir_channel(b, local_id_vec, 2) };
|
||||
nir_def *size_x = nir_imm_int(b, nir->info.workgroup_size[0]);
|
||||
nir_def *size_y = nir_imm_int(b, nir->info.workgroup_size[1]);
|
||||
|
||||
sysval = nir_imul(b, local_id[2], nir_imul(b, size_x, size_y));
|
||||
sysval = nir_iadd(b, sysval, nir_imul(b, local_id[1], size_x));
|
||||
sysval = nir_iadd(b, sysval, local_id[0]);
|
||||
local_index = sysval;
|
||||
break;
|
||||
}
|
||||
|
||||
/* First time we are using those, so let's calculate them. */
|
||||
assert(!local_id);
|
||||
compute_local_index_id(b, nir, &local_index, &local_id);
|
||||
}
|
||||
|
||||
assert(local_id);
|
||||
assert(local_index);
|
||||
if (intrinsic->intrinsic == nir_intrinsic_load_local_invocation_id)
|
||||
sysval = local_id;
|
||||
else
|
||||
sysval = local_index;
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_num_subgroups: {
|
||||
nir_def *size;
|
||||
if (state->nir->info.workgroup_size_variable) {
|
||||
nir_def *size_xyz = nir_load_workgroup_size(b);
|
||||
nir_def *size_x = nir_channel(b, size_xyz, 0);
|
||||
nir_def *size_y = nir_channel(b, size_xyz, 1);
|
||||
nir_def *size_z = nir_channel(b, size_xyz, 2);
|
||||
size = nir_imul(b, nir_imul(b, size_x, size_y), size_z);
|
||||
} else {
|
||||
size = nir_imm_int(b, nir->info.workgroup_size[0] *
|
||||
nir->info.workgroup_size[1] *
|
||||
nir->info.workgroup_size[2]);
|
||||
}
|
||||
|
||||
/* Calculate the equivalent of DIV_ROUND_UP. */
|
||||
nir_def *simd_width = nir_load_simd_width_intel(b);
|
||||
sysval =
|
||||
nir_udiv(b, nir_iadd_imm(b, nir_iadd(b, size, simd_width), -1),
|
||||
simd_width);
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
continue;
|
||||
}
|
||||
|
||||
if (intrinsic->def.bit_size == 64)
|
||||
sysval = nir_u2u64(b, sysval);
|
||||
|
||||
nir_def_rewrite_uses(&intrinsic->def, sysval);
|
||||
nir_instr_remove(&intrinsic->instr);
|
||||
|
||||
state->progress = true;
|
||||
}
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
static void
|
||||
lower_cs_intrinsics_convert_impl(struct lower_intrinsics_state *state)
|
||||
{
|
||||
state->builder = nir_builder_create(state->impl);
|
||||
|
||||
nir_foreach_block(block, state->impl) {
|
||||
lower_cs_intrinsics_convert_block(state, block);
|
||||
}
|
||||
|
||||
nir_metadata_preserve(state->impl,
|
||||
nir_metadata_block_index | nir_metadata_dominance);
|
||||
}
|
||||
|
||||
/* Entry point: lower compute-shader system-value intrinsics for Intel HW.
 *
 * Decides whether the hardware can generate gl_LocalInvocationID itself
 * (verx10 >= 125, fixed power-of-two workgroup dims, no quad derivatives),
 * fills in prog_data->walk_order / generate_local_id accordingly, then
 * rewrites the remaining intrinsics in every function impl.
 *
 * Returns true if any intrinsic was lowered.
 */
bool
brw_nir_lower_cs_intrinsics(nir_shader *nir,
                            const struct intel_device_info *devinfo,
                            struct brw_cs_prog_data *prog_data)
{
   assert(gl_shader_stage_uses_workgroup(nir->info.stage));

   struct lower_intrinsics_state state = {
      .nir = nir,
      .hw_generated_local_id = false,
   };

   /* Constraints from NV_compute_shader_derivatives. */
   if (gl_shader_stage_is_compute(nir->info.stage) &&
       !nir->info.workgroup_size_variable) {
      if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
         assert(nir->info.workgroup_size[0] % 2 == 0);
         assert(nir->info.workgroup_size[1] % 2 == 0);
      } else if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_LINEAR) {
         ASSERTED unsigned workgroup_size =
            nir->info.workgroup_size[0] *
            nir->info.workgroup_size[1] *
            nir->info.workgroup_size[2];
         assert(workgroup_size % 4 == 0);
      }
   }

   /* Hardware-generated local IDs: only for plain compute with a fixed,
    * power-of-two X/Y workgroup size and no quad derivative groups (the
    * HW walk order cannot produce the quad layout).
    */
   if (devinfo->verx10 >= 125 && prog_data &&
       nir->info.stage == MESA_SHADER_COMPUTE &&
       nir->info.cs.derivative_group != DERIVATIVE_GROUP_QUADS &&
       !nir->info.workgroup_size_variable &&
       util_is_power_of_two_nonzero(nir->info.workgroup_size[0]) &&
       util_is_power_of_two_nonzero(nir->info.workgroup_size[1])) {

      state.hw_generated_local_id = true;

      /* TODO: more heuristics about 1D/SLM access vs. 2D access */
      bool linear =
         BITSET_TEST(nir->info.system_values_read,
                     SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) ||
         (nir->info.workgroup_size[1] == 1 &&
          nir->info.workgroup_size[2] == 1) ||
         (nir->info.num_images == 0 && nir->info.num_textures == 0);

      prog_data->walk_order =
         linear ? INTEL_WALK_ORDER_XYZ : INTEL_WALK_ORDER_YXZ;

      /* nir_lower_compute_system_values will replace any references to
       * SYSTEM_VALUE_LOCAL_INVOCATION_ID vector components with zero for
       * any dimension where the workgroup size is 1, so we can skip
       * generating those.  However, the hardware can only generate
       * X, XY, or XYZ - it can't skip earlier components.
       */
      prog_data->generate_local_id =
         (nir->info.workgroup_size[0] > 1 ? WRITEMASK_X : 0) |
         (nir->info.workgroup_size[1] > 1 ? WRITEMASK_XY : 0) |
         (nir->info.workgroup_size[2] > 1 ? WRITEMASK_XYZ : 0);
   }

   nir_foreach_function_impl(impl, nir) {
      state.impl = impl;
      lower_cs_intrinsics_convert_impl(&state);
   }

   return state.progress;
}
|
||||
273
src/intel/compiler/elk/brw_nir_lower_intersection_shader.c
Normal file
273
src/intel/compiler/elk/brw_nir_lower_intersection_shader.c
Normal file
|
|
@ -0,0 +1,273 @@
|
|||
/*
|
||||
* Copyright (c) 2020 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_nir_rt.h"
|
||||
#include "brw_nir_rt_builder.h"
|
||||
|
||||
/* Prepare a cloned any-hit shader for inlining into an intersection
 * shader: give its entrypoint the (commit_ptr, hit_t, hit_kind) parameter
 * list, rewrite hit-query intrinsics in terms of those parameters, turn
 * halts into returns, and lower returns so the impl can be inlined.
 *
 * Returns the (mutated) entrypoint impl of `any_hit`.
 */
static nir_function_impl *
lower_any_hit_for_intersection(nir_shader *any_hit)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(any_hit);

   /* Any-hit shaders need three parameters */
   assert(impl->function->num_params == 0);
   nir_parameter params[] = {
      {
         /* A pointer to a boolean value for whether or not the hit was
          * accepted.
          */
         .num_components = 1,
         .bit_size = 32,
      },
      {
         /* The hit T value */
         .num_components = 1,
         .bit_size = 32,
      },
      {
         /* The hit kind */
         .num_components = 1,
         .bit_size = 32,
      },
   };
   impl->function->num_params = ARRAY_SIZE(params);
   impl->function->params =
      ralloc_array(any_hit, nir_parameter, ARRAY_SIZE(params));
   memcpy(impl->function->params, params, sizeof(params));

   nir_builder build = nir_builder_at(nir_before_impl(impl));
   nir_builder *b = &build;

   nir_def *commit_ptr = nir_load_param(b, 0);
   nir_def *hit_t = nir_load_param(b, 1);
   nir_def *hit_kind = nir_load_param(b, 2);

   /* View the commit pointer as a dereferenceable bool so the shader can
    * store accept/reject decisions through it.
    */
   nir_deref_instr *commit =
      nir_build_deref_cast(b, commit_ptr, nir_var_function_temp,
                           glsl_bool_type(), 0);

   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         switch (instr->type) {
         case nir_instr_type_intrinsic: {
            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            switch (intrin->intrinsic) {
            case nir_intrinsic_ignore_ray_intersection:
               b->cursor = nir_instr_remove(&intrin->instr);
               /* We put the newly emitted code inside a dummy if because it's
                * going to contain a jump instruction and we don't want to
                * deal with that mess here.  It'll get dealt with by our
                * control-flow optimization passes.
                */
               nir_store_deref(b, commit, nir_imm_false(b), 0x1);
               nir_push_if(b, nir_imm_true(b));
               nir_jump(b, nir_jump_return);
               nir_pop_if(b, NULL);
               break;

            case nir_intrinsic_terminate_ray:
               /* The "normal" handling of terminateRay works fine in
                * intersection shaders.
                */
               break;

            case nir_intrinsic_load_ray_t_max:
               /* The candidate hit T is passed in as a parameter. */
               nir_def_rewrite_uses(&intrin->def,
                                    hit_t);
               nir_instr_remove(&intrin->instr);
               break;

            case nir_intrinsic_load_ray_hit_kind:
               /* The candidate hit kind is passed in as a parameter. */
               nir_def_rewrite_uses(&intrin->def,
                                    hit_kind);
               nir_instr_remove(&intrin->instr);
               break;

            default:
               break;
            }
            break;
         }

         case nir_instr_type_jump: {
            /* Stomp any halts to returns since they only return from the
             * any-hit shader and not necessarily from the intersection
             * shader.  This is safe to do because we've already asserted
             * that we only have the one function.
             */
            nir_jump_instr *jump = nir_instr_as_jump(instr);
            if (jump->type == nir_jump_halt)
               jump->type = nir_jump_return;
            break;
         }

         default:
            break;
         }
      }
   }

   nir_validate_shader(any_hit, "after initial any-hit lowering");

   /* Flatten returns into predicated control flow so the impl can be
    * spliced into the intersection shader.
    */
   nir_lower_returns_impl(impl);

   nir_validate_shader(any_hit, "after lowering returns");

   return impl;
}
|
||||
|
||||
void
|
||||
brw_nir_lower_intersection_shader(nir_shader *intersection,
|
||||
const nir_shader *any_hit,
|
||||
const struct intel_device_info *devinfo)
|
||||
{
|
||||
void *dead_ctx = ralloc_context(intersection);
|
||||
|
||||
nir_function_impl *any_hit_impl = NULL;
|
||||
struct hash_table *any_hit_var_remap = NULL;
|
||||
if (any_hit) {
|
||||
nir_shader *any_hit_tmp = nir_shader_clone(dead_ctx, any_hit);
|
||||
NIR_PASS_V(any_hit_tmp, nir_opt_dce);
|
||||
any_hit_impl = lower_any_hit_for_intersection(any_hit_tmp);
|
||||
any_hit_var_remap = _mesa_pointer_hash_table_create(dead_ctx);
|
||||
}
|
||||
|
||||
nir_function_impl *impl = nir_shader_get_entrypoint(intersection);
|
||||
|
||||
nir_builder build = nir_builder_at(nir_before_impl(impl));
|
||||
nir_builder *b = &build;
|
||||
|
||||
nir_def *t_addr = brw_nir_rt_mem_hit_addr(b, false /* committed */);
|
||||
nir_variable *commit =
|
||||
nir_local_variable_create(impl, glsl_bool_type(), "ray_commit");
|
||||
nir_store_var(b, commit, nir_imm_false(b), 0x1);
|
||||
|
||||
assert(impl->end_block->predecessors->entries == 1);
|
||||
set_foreach(impl->end_block->predecessors, block_entry) {
|
||||
struct nir_block *block = (void *)block_entry->key;
|
||||
b->cursor = nir_after_block_before_jump(block);
|
||||
nir_push_if(b, nir_load_var(b, commit));
|
||||
{
|
||||
/* Set the "valid" bit in mem_hit */
|
||||
nir_def *ray_addr = brw_nir_rt_mem_hit_addr(b, false /* committed */);
|
||||
nir_def *flags_dw_addr = nir_iadd_imm(b, ray_addr, 12);
|
||||
nir_store_global(b, flags_dw_addr, 4,
|
||||
nir_ior(b, nir_load_global(b, flags_dw_addr, 4, 1, 32),
|
||||
nir_imm_int(b, 1 << 16)), 0x1 /* write_mask */);
|
||||
|
||||
nir_accept_ray_intersection(b);
|
||||
}
|
||||
nir_push_else(b, NULL);
|
||||
{
|
||||
nir_ignore_ray_intersection(b);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
break;
|
||||
}
|
||||
|
||||
nir_foreach_block_safe(block, impl) {
|
||||
nir_foreach_instr_safe(instr, block) {
|
||||
switch (instr->type) {
|
||||
case nir_instr_type_intrinsic: {
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_report_ray_intersection: {
|
||||
b->cursor = nir_instr_remove(&intrin->instr);
|
||||
nir_def *hit_t = intrin->src[0].ssa;
|
||||
nir_def *hit_kind = intrin->src[1].ssa;
|
||||
nir_def *min_t = nir_load_ray_t_min(b);
|
||||
|
||||
struct brw_nir_rt_mem_ray_defs ray_def;
|
||||
brw_nir_rt_load_mem_ray(b, &ray_def, BRW_RT_BVH_LEVEL_WORLD);
|
||||
|
||||
struct brw_nir_rt_mem_hit_defs hit_in = {};
|
||||
brw_nir_rt_load_mem_hit(b, &hit_in, false);
|
||||
|
||||
nir_def *max_t = ray_def.t_far;
|
||||
|
||||
/* bool commit_tmp = false; */
|
||||
nir_variable *commit_tmp =
|
||||
nir_local_variable_create(impl, glsl_bool_type(),
|
||||
"commit_tmp");
|
||||
nir_store_var(b, commit_tmp, nir_imm_false(b), 0x1);
|
||||
|
||||
nir_push_if(b, nir_iand(b, nir_fge(b, hit_t, min_t),
|
||||
nir_fge(b, max_t, hit_t)));
|
||||
{
|
||||
/* Any-hit defaults to commit */
|
||||
nir_store_var(b, commit_tmp, nir_imm_true(b), 0x1);
|
||||
|
||||
if (any_hit_impl != NULL) {
|
||||
nir_push_if(b, nir_inot(b, nir_load_leaf_opaque_intel(b)));
|
||||
{
|
||||
nir_def *params[] = {
|
||||
&nir_build_deref_var(b, commit_tmp)->def,
|
||||
hit_t,
|
||||
hit_kind,
|
||||
};
|
||||
nir_inline_function_impl(b, any_hit_impl, params,
|
||||
any_hit_var_remap);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
}
|
||||
|
||||
nir_push_if(b, nir_load_var(b, commit_tmp));
|
||||
{
|
||||
nir_store_var(b, commit, nir_imm_true(b), 0x1);
|
||||
|
||||
nir_def *ray_addr =
|
||||
brw_nir_rt_mem_ray_addr(b, brw_nir_rt_stack_addr(b), BRW_RT_BVH_LEVEL_WORLD);
|
||||
|
||||
nir_store_global(b, nir_iadd_imm(b, ray_addr, 16 + 12), 4, hit_t, 0x1);
|
||||
nir_store_global(b, t_addr, 4,
|
||||
nir_vec2(b, nir_fmin(b, hit_t, hit_in.t), hit_kind),
|
||||
0x3);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
|
||||
nir_def *accepted = nir_load_var(b, commit_tmp);
|
||||
nir_def_rewrite_uses(&intrin->def,
|
||||
accepted);
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
nir_metadata_preserve(impl, nir_metadata_none);
|
||||
|
||||
/* We did some inlining; have to re-index SSA defs */
|
||||
nir_index_ssa_defs(impl);
|
||||
|
||||
ralloc_free(dead_ctx);
|
||||
}
|
||||
567
src/intel/compiler/elk/brw_nir_lower_ray_queries.c
Normal file
567
src/intel/compiler/elk/brw_nir_lower_ray_queries.c
Normal file
|
|
@ -0,0 +1,567 @@
|
|||
/*
|
||||
* Copyright (c) 2021 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_nir_rt.h"
|
||||
#include "brw_nir_rt_builder.h"
|
||||
|
||||
#include "nir_deref.h"
|
||||
|
||||
#include "util/macros.h"
|
||||
|
||||
struct lowering_state {
|
||||
const struct intel_device_info *devinfo;
|
||||
|
||||
nir_function_impl *impl;
|
||||
|
||||
struct hash_table *queries;
|
||||
uint32_t n_queries;
|
||||
|
||||
struct brw_nir_rt_globals_defs globals;
|
||||
nir_def *rq_globals;
|
||||
};
|
||||
|
||||
struct brw_ray_query {
|
||||
nir_variable *opaque_var;
|
||||
nir_variable *internal_var;
|
||||
uint32_t id;
|
||||
};
|
||||
|
||||
#define SIZEOF_QUERY_STATE (sizeof(uint32_t))
|
||||
|
||||
static bool
|
||||
need_spill_fill(struct lowering_state *state)
|
||||
{
|
||||
return state->n_queries > 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* This pass converts opaque RayQuery structures from SPIRV into a vec3 where
|
||||
* the first 2 elements store a global address for the query and the third
|
||||
* element is an incremented counter on the number of executed
|
||||
* nir_intrinsic_rq_proceed.
|
||||
*/
|
||||
|
||||
static void
|
||||
register_opaque_var(nir_variable *opaque_var, struct lowering_state *state)
|
||||
{
|
||||
struct hash_entry *entry = _mesa_hash_table_search(state->queries, opaque_var);
|
||||
assert(entry == NULL);
|
||||
|
||||
struct brw_ray_query *rq = rzalloc(state->queries, struct brw_ray_query);
|
||||
rq->opaque_var = opaque_var;
|
||||
rq->id = state->n_queries;
|
||||
|
||||
unsigned aoa_size = glsl_get_aoa_size(opaque_var->type);
|
||||
state->n_queries += MAX2(1, aoa_size);
|
||||
|
||||
_mesa_hash_table_insert(state->queries, opaque_var, rq);
|
||||
}
|
||||
|
||||
static void
|
||||
create_internal_var(struct brw_ray_query *rq, struct lowering_state *state)
|
||||
{
|
||||
const struct glsl_type *opaque_type = rq->opaque_var->type;
|
||||
const struct glsl_type *internal_type = glsl_uint16_t_type();
|
||||
|
||||
while (glsl_type_is_array(opaque_type)) {
|
||||
assert(!glsl_type_is_unsized_array(opaque_type));
|
||||
internal_type = glsl_array_type(internal_type,
|
||||
glsl_array_size(opaque_type),
|
||||
0);
|
||||
opaque_type = glsl_get_array_element(opaque_type);
|
||||
}
|
||||
|
||||
rq->internal_var = nir_local_variable_create(state->impl,
|
||||
internal_type,
|
||||
NULL);
|
||||
}
|
||||
|
||||
|
||||
|
||||
static nir_def *
|
||||
get_ray_query_shadow_addr(nir_builder *b,
|
||||
nir_deref_instr *deref,
|
||||
struct lowering_state *state,
|
||||
nir_deref_instr **out_state_deref)
|
||||
{
|
||||
nir_deref_path path;
|
||||
nir_deref_path_init(&path, deref, NULL);
|
||||
assert(path.path[0]->deref_type == nir_deref_type_var);
|
||||
|
||||
nir_variable *opaque_var = nir_deref_instr_get_variable(path.path[0]);
|
||||
struct hash_entry *entry = _mesa_hash_table_search(state->queries, opaque_var);
|
||||
assert(entry);
|
||||
|
||||
struct brw_ray_query *rq = entry->data;
|
||||
|
||||
/* Base address in the shadow memory of the variable associated with this
|
||||
* ray query variable.
|
||||
*/
|
||||
nir_def *base_addr =
|
||||
nir_iadd_imm(b, state->globals.resume_sbt_addr,
|
||||
brw_rt_ray_queries_shadow_stack_size(state->devinfo) * rq->id);
|
||||
|
||||
bool spill_fill = need_spill_fill(state);
|
||||
*out_state_deref = nir_build_deref_var(b, rq->internal_var);
|
||||
|
||||
if (!spill_fill)
|
||||
return NULL;
|
||||
|
||||
/* Just emit code and let constant-folding go to town */
|
||||
nir_deref_instr **p = &path.path[1];
|
||||
for (; *p; p++) {
|
||||
if ((*p)->deref_type == nir_deref_type_array) {
|
||||
nir_def *index = (*p)->arr.index.ssa;
|
||||
|
||||
/**/
|
||||
*out_state_deref = nir_build_deref_array(b, *out_state_deref, index);
|
||||
|
||||
/**/
|
||||
uint64_t size = MAX2(1, glsl_get_aoa_size((*p)->type)) *
|
||||
brw_rt_ray_queries_shadow_stack_size(state->devinfo);
|
||||
|
||||
nir_def *mul = nir_amul_imm(b, nir_i2i64(b, index), size);
|
||||
|
||||
base_addr = nir_iadd(b, base_addr, mul);
|
||||
} else {
|
||||
unreachable("Unsupported deref type");
|
||||
}
|
||||
}
|
||||
|
||||
nir_deref_path_finish(&path);
|
||||
|
||||
/* Add the lane offset to the shadow memory address */
|
||||
nir_def *lane_offset =
|
||||
nir_imul_imm(
|
||||
b,
|
||||
nir_iadd(
|
||||
b,
|
||||
nir_imul(
|
||||
b,
|
||||
brw_load_btd_dss_id(b),
|
||||
brw_nir_rt_load_num_simd_lanes_per_dss(b, state->devinfo)),
|
||||
brw_nir_rt_sync_stack_id(b)),
|
||||
BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
|
||||
|
||||
return nir_iadd(b, base_addr, nir_i2i64(b, lane_offset));
|
||||
}
|
||||
|
||||
static void
|
||||
update_trace_ctrl_level(nir_builder *b,
|
||||
nir_deref_instr *state_deref,
|
||||
nir_def **out_old_ctrl,
|
||||
nir_def **out_old_level,
|
||||
nir_def *new_ctrl,
|
||||
nir_def *new_level)
|
||||
{
|
||||
nir_def *old_value = nir_load_deref(b, state_deref);
|
||||
nir_def *old_ctrl = nir_ishr_imm(b, old_value, 2);
|
||||
nir_def *old_level = nir_iand_imm(b, old_value, 0x3);
|
||||
|
||||
if (out_old_ctrl)
|
||||
*out_old_ctrl = old_ctrl;
|
||||
if (out_old_level)
|
||||
*out_old_level = old_level;
|
||||
|
||||
if (new_ctrl)
|
||||
new_ctrl = nir_i2i16(b, new_ctrl);
|
||||
if (new_level)
|
||||
new_level = nir_i2i16(b, new_level);
|
||||
|
||||
if (new_ctrl || new_level) {
|
||||
if (!new_ctrl)
|
||||
new_ctrl = old_ctrl;
|
||||
if (!new_level)
|
||||
new_level = old_level;
|
||||
|
||||
nir_def *new_value = nir_ior(b, nir_ishl_imm(b, new_ctrl, 2), new_level);
|
||||
nir_store_deref(b, state_deref, new_value, 0x1);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
fill_query(nir_builder *b,
|
||||
nir_def *hw_stack_addr,
|
||||
nir_def *shadow_stack_addr,
|
||||
nir_def *ctrl)
|
||||
{
|
||||
brw_nir_memcpy_global(b, hw_stack_addr, 64, shadow_stack_addr, 64,
|
||||
BRW_RT_SIZEOF_RAY_QUERY);
|
||||
}
|
||||
|
||||
static void
|
||||
spill_query(nir_builder *b,
|
||||
nir_def *hw_stack_addr,
|
||||
nir_def *shadow_stack_addr)
|
||||
{
|
||||
brw_nir_memcpy_global(b, shadow_stack_addr, 64, hw_stack_addr, 64,
|
||||
BRW_RT_SIZEOF_RAY_QUERY);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
lower_ray_query_intrinsic(nir_builder *b,
|
||||
nir_intrinsic_instr *intrin,
|
||||
struct lowering_state *state)
|
||||
{
|
||||
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
|
||||
|
||||
b->cursor = nir_instr_remove(&intrin->instr);
|
||||
|
||||
nir_deref_instr *ctrl_level_deref;
|
||||
nir_def *shadow_stack_addr =
|
||||
get_ray_query_shadow_addr(b, deref, state, &ctrl_level_deref);
|
||||
nir_def *hw_stack_addr =
|
||||
brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr, state->devinfo);
|
||||
nir_def *stack_addr = shadow_stack_addr ? shadow_stack_addr : hw_stack_addr;
|
||||
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_rq_initialize: {
|
||||
nir_def *as_addr = intrin->src[1].ssa;
|
||||
nir_def *ray_flags = intrin->src[2].ssa;
|
||||
/* From the SPIR-V spec:
|
||||
*
|
||||
* "Only the 8 least-significant bits of Cull Mask are used by
|
||||
* this instruction - other bits are ignored.
|
||||
*
|
||||
* Only the 16 least-significant bits of Miss Index are used by
|
||||
* this instruction - other bits are ignored."
|
||||
*/
|
||||
nir_def *cull_mask = nir_iand_imm(b, intrin->src[3].ssa, 0xff);
|
||||
nir_def *ray_orig = intrin->src[4].ssa;
|
||||
nir_def *ray_t_min = intrin->src[5].ssa;
|
||||
nir_def *ray_dir = intrin->src[6].ssa;
|
||||
nir_def *ray_t_max = intrin->src[7].ssa;
|
||||
|
||||
nir_def *root_node_ptr =
|
||||
brw_nir_rt_acceleration_structure_to_root_node(b, as_addr);
|
||||
|
||||
struct brw_nir_rt_mem_ray_defs ray_defs = {
|
||||
.root_node_ptr = root_node_ptr,
|
||||
.ray_flags = nir_u2u16(b, ray_flags),
|
||||
.ray_mask = cull_mask,
|
||||
.orig = ray_orig,
|
||||
.t_near = ray_t_min,
|
||||
.dir = ray_dir,
|
||||
.t_far = ray_t_max,
|
||||
};
|
||||
|
||||
nir_def *ray_addr =
|
||||
brw_nir_rt_mem_ray_addr(b, stack_addr, BRW_RT_BVH_LEVEL_WORLD);
|
||||
|
||||
brw_nir_rt_query_mark_init(b, stack_addr);
|
||||
brw_nir_rt_store_mem_ray_query_at_addr(b, ray_addr, &ray_defs);
|
||||
|
||||
update_trace_ctrl_level(b, ctrl_level_deref,
|
||||
NULL, NULL,
|
||||
nir_imm_int(b, GEN_RT_TRACE_RAY_INITAL),
|
||||
nir_imm_int(b, BRW_RT_BVH_LEVEL_WORLD));
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_rq_proceed: {
|
||||
nir_def *not_done =
|
||||
nir_inot(b, brw_nir_rt_query_done(b, stack_addr));
|
||||
nir_def *not_done_then, *not_done_else;
|
||||
|
||||
nir_push_if(b, not_done);
|
||||
{
|
||||
nir_def *ctrl, *level;
|
||||
update_trace_ctrl_level(b, ctrl_level_deref,
|
||||
&ctrl, &level,
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
/* Mark the query as done because handing it over to the HW for
|
||||
* processing. If the HW make any progress, it will write back some
|
||||
* data and as a side effect, clear the "done" bit. If no progress is
|
||||
* made, HW does not write anything back and we can use this bit to
|
||||
* detect that.
|
||||
*/
|
||||
brw_nir_rt_query_mark_done(b, stack_addr);
|
||||
|
||||
if (shadow_stack_addr)
|
||||
fill_query(b, hw_stack_addr, shadow_stack_addr, ctrl);
|
||||
|
||||
nir_trace_ray_intel(b, state->rq_globals, level, ctrl, .synchronous = true);
|
||||
|
||||
struct brw_nir_rt_mem_hit_defs hit_in = {};
|
||||
brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, hw_stack_addr, false);
|
||||
|
||||
if (shadow_stack_addr)
|
||||
spill_query(b, hw_stack_addr, shadow_stack_addr);
|
||||
|
||||
update_trace_ctrl_level(b, ctrl_level_deref,
|
||||
NULL, NULL,
|
||||
nir_imm_int(b, GEN_RT_TRACE_RAY_CONTINUE),
|
||||
hit_in.bvh_level);
|
||||
|
||||
not_done_then = nir_inot(b, hit_in.done);
|
||||
}
|
||||
nir_push_else(b, NULL);
|
||||
{
|
||||
not_done_else = nir_imm_false(b);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
not_done = nir_if_phi(b, not_done_then, not_done_else);
|
||||
nir_def_rewrite_uses(&intrin->def, not_done);
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_rq_confirm_intersection: {
|
||||
brw_nir_memcpy_global(b,
|
||||
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, true), 16,
|
||||
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, false), 16,
|
||||
BRW_RT_SIZEOF_HIT_INFO);
|
||||
update_trace_ctrl_level(b, ctrl_level_deref,
|
||||
NULL, NULL,
|
||||
nir_imm_int(b, GEN_RT_TRACE_RAY_COMMIT),
|
||||
nir_imm_int(b, BRW_RT_BVH_LEVEL_OBJECT));
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_rq_generate_intersection: {
|
||||
brw_nir_rt_generate_hit_addr(b, stack_addr, intrin->src[1].ssa);
|
||||
update_trace_ctrl_level(b, ctrl_level_deref,
|
||||
NULL, NULL,
|
||||
nir_imm_int(b, GEN_RT_TRACE_RAY_COMMIT),
|
||||
nir_imm_int(b, BRW_RT_BVH_LEVEL_OBJECT));
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_rq_terminate: {
|
||||
brw_nir_rt_query_mark_done(b, stack_addr);
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_rq_load: {
|
||||
const bool committed = nir_intrinsic_committed(intrin);
|
||||
|
||||
struct brw_nir_rt_mem_ray_defs world_ray_in = {};
|
||||
struct brw_nir_rt_mem_ray_defs object_ray_in = {};
|
||||
struct brw_nir_rt_mem_hit_defs hit_in = {};
|
||||
brw_nir_rt_load_mem_ray_from_addr(b, &world_ray_in, stack_addr,
|
||||
BRW_RT_BVH_LEVEL_WORLD);
|
||||
brw_nir_rt_load_mem_ray_from_addr(b, &object_ray_in, stack_addr,
|
||||
BRW_RT_BVH_LEVEL_OBJECT);
|
||||
brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr, committed);
|
||||
|
||||
nir_def *sysval = NULL;
|
||||
switch (nir_intrinsic_ray_query_value(intrin)) {
|
||||
case nir_ray_query_value_intersection_type:
|
||||
if (committed) {
|
||||
/* Values we want to generate :
|
||||
*
|
||||
* RayQueryCommittedIntersectionNoneEXT = 0U <= hit_in.valid == false
|
||||
* RayQueryCommittedIntersectionTriangleEXT = 1U <= hit_in.leaf_type == BRW_RT_BVH_NODE_TYPE_QUAD (4)
|
||||
* RayQueryCommittedIntersectionGeneratedEXT = 2U <= hit_in.leaf_type == BRW_RT_BVH_NODE_TYPE_PROCEDURAL (3)
|
||||
*/
|
||||
sysval =
|
||||
nir_bcsel(b, nir_ieq_imm(b, hit_in.leaf_type, 4),
|
||||
nir_imm_int(b, 1), nir_imm_int(b, 2));
|
||||
sysval =
|
||||
nir_bcsel(b, hit_in.valid,
|
||||
sysval, nir_imm_int(b, 0));
|
||||
} else {
|
||||
/* 0 -> triangle, 1 -> AABB */
|
||||
sysval =
|
||||
nir_b2i32(b,
|
||||
nir_ieq_imm(b, hit_in.leaf_type,
|
||||
BRW_RT_BVH_NODE_TYPE_PROCEDURAL));
|
||||
}
|
||||
break;
|
||||
|
||||
case nir_ray_query_value_intersection_t:
|
||||
sysval = hit_in.t;
|
||||
break;
|
||||
|
||||
case nir_ray_query_value_intersection_instance_custom_index: {
|
||||
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
|
||||
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
|
||||
sysval = leaf.instance_id;
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_ray_query_value_intersection_instance_id: {
|
||||
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
|
||||
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
|
||||
sysval = leaf.instance_index;
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_ray_query_value_intersection_instance_sbt_index: {
|
||||
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
|
||||
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
|
||||
sysval = leaf.contribution_to_hit_group_index;
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_ray_query_value_intersection_geometry_index: {
|
||||
nir_def *geometry_index_dw =
|
||||
nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
|
||||
1, 32);
|
||||
sysval = nir_iand_imm(b, geometry_index_dw, BITFIELD_MASK(29));
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_ray_query_value_intersection_primitive_index:
|
||||
sysval = brw_nir_rt_load_primitive_id_from_hit(b, NULL /* is_procedural */, &hit_in);
|
||||
break;
|
||||
|
||||
case nir_ray_query_value_intersection_barycentrics:
|
||||
sysval = hit_in.tri_bary;
|
||||
break;
|
||||
|
||||
case nir_ray_query_value_intersection_front_face:
|
||||
sysval = hit_in.front_face;
|
||||
break;
|
||||
|
||||
case nir_ray_query_value_intersection_object_ray_direction:
|
||||
sysval = world_ray_in.dir;
|
||||
break;
|
||||
|
||||
case nir_ray_query_value_intersection_object_ray_origin:
|
||||
sysval = world_ray_in.orig;
|
||||
break;
|
||||
|
||||
case nir_ray_query_value_intersection_object_to_world: {
|
||||
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
|
||||
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
|
||||
sysval = leaf.object_to_world[nir_intrinsic_column(intrin)];
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_ray_query_value_intersection_world_to_object: {
|
||||
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
|
||||
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
|
||||
sysval = leaf.world_to_object[nir_intrinsic_column(intrin)];
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_ray_query_value_intersection_candidate_aabb_opaque:
|
||||
sysval = hit_in.front_face;
|
||||
break;
|
||||
|
||||
case nir_ray_query_value_tmin:
|
||||
sysval = world_ray_in.t_near;
|
||||
break;
|
||||
|
||||
case nir_ray_query_value_flags:
|
||||
sysval = nir_u2u32(b, world_ray_in.ray_flags);
|
||||
break;
|
||||
|
||||
case nir_ray_query_value_world_ray_direction:
|
||||
sysval = world_ray_in.dir;
|
||||
break;
|
||||
|
||||
case nir_ray_query_value_world_ray_origin:
|
||||
sysval = world_ray_in.orig;
|
||||
break;
|
||||
|
||||
case nir_ray_query_value_intersection_triangle_vertex_positions: {
|
||||
struct brw_nir_rt_bvh_primitive_leaf_positions_defs pos;
|
||||
brw_nir_rt_load_bvh_primitive_leaf_positions(b, &pos, hit_in.prim_leaf_ptr);
|
||||
sysval = pos.positions[nir_intrinsic_column(intrin)];
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
unreachable("Invalid ray query");
|
||||
}
|
||||
|
||||
assert(sysval);
|
||||
nir_def_rewrite_uses(&intrin->def, sysval);
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
unreachable("Invalid intrinsic");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
lower_ray_query_impl(nir_function_impl *impl, struct lowering_state *state)
|
||||
{
|
||||
nir_builder _b, *b = &_b;
|
||||
_b = nir_builder_at(nir_before_impl(impl));
|
||||
|
||||
state->rq_globals = nir_load_ray_query_global_intel(b);
|
||||
|
||||
brw_nir_rt_load_globals_addr(b, &state->globals, state->rq_globals);
|
||||
|
||||
nir_foreach_block_safe(block, impl) {
|
||||
nir_foreach_instr_safe(instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
if (intrin->intrinsic != nir_intrinsic_rq_initialize &&
|
||||
intrin->intrinsic != nir_intrinsic_rq_terminate &&
|
||||
intrin->intrinsic != nir_intrinsic_rq_proceed &&
|
||||
intrin->intrinsic != nir_intrinsic_rq_generate_intersection &&
|
||||
intrin->intrinsic != nir_intrinsic_rq_confirm_intersection &&
|
||||
intrin->intrinsic != nir_intrinsic_rq_load)
|
||||
continue;
|
||||
|
||||
lower_ray_query_intrinsic(b, intrin, state);
|
||||
}
|
||||
}
|
||||
|
||||
nir_metadata_preserve(impl, nir_metadata_none);
|
||||
}
|
||||
|
||||
bool
|
||||
brw_nir_lower_ray_queries(nir_shader *shader,
|
||||
const struct intel_device_info *devinfo)
|
||||
{
|
||||
assert(exec_list_length(&shader->functions) == 1);
|
||||
|
||||
struct lowering_state state = {
|
||||
.devinfo = devinfo,
|
||||
.impl = nir_shader_get_entrypoint(shader),
|
||||
.queries = _mesa_pointer_hash_table_create(NULL),
|
||||
};
|
||||
|
||||
/* Map all query variable to internal type variables */
|
||||
nir_foreach_function_temp_variable(var, state.impl)
|
||||
register_opaque_var(var, &state);
|
||||
hash_table_foreach(state.queries, entry)
|
||||
create_internal_var(entry->data, &state);
|
||||
|
||||
bool progress = state.n_queries > 0;
|
||||
|
||||
if (progress) {
|
||||
lower_ray_query_impl(state.impl, &state);
|
||||
|
||||
nir_remove_dead_derefs(shader);
|
||||
nir_remove_dead_variables(shader,
|
||||
nir_var_shader_temp | nir_var_function_temp,
|
||||
NULL);
|
||||
|
||||
nir_metadata_preserve(state.impl, nir_metadata_none);
|
||||
}
|
||||
|
||||
ralloc_free(state.queries);
|
||||
|
||||
return progress;
|
||||
}
|
||||
386
src/intel/compiler/elk/brw_nir_lower_rt_intrinsics.c
Normal file
386
src/intel/compiler/elk/brw_nir_lower_rt_intrinsics.c
Normal file
|
|
@ -0,0 +1,386 @@
|
|||
/*
|
||||
* Copyright (c) 2020 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_nir_rt.h"
|
||||
#include "brw_nir_rt_builder.h"
|
||||
|
||||
static nir_def *
|
||||
build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
|
||||
{
|
||||
switch (b->shader->info.stage) {
|
||||
case MESA_SHADER_ANY_HIT:
|
||||
/* Any-hit shaders are always compiled into intersection shaders for
|
||||
* procedural geometry. If we got here in an any-hit shader, it's for
|
||||
* triangles.
|
||||
*/
|
||||
return nir_imm_false(b);
|
||||
|
||||
case MESA_SHADER_INTERSECTION:
|
||||
return nir_imm_true(b);
|
||||
|
||||
default:
|
||||
return nir_ieq_imm(b, hit->leaf_type,
|
||||
BRW_RT_BVH_NODE_TYPE_PROCEDURAL);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
lower_rt_intrinsics_impl(nir_function_impl *impl,
|
||||
const struct intel_device_info *devinfo)
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
nir_builder build = nir_builder_at(nir_before_impl(impl));
|
||||
nir_builder *b = &build;
|
||||
|
||||
struct brw_nir_rt_globals_defs globals;
|
||||
brw_nir_rt_load_globals(b, &globals);
|
||||
|
||||
nir_def *hotzone_addr = brw_nir_rt_sw_hotzone_addr(b, devinfo);
|
||||
nir_def *hotzone = nir_load_global(b, hotzone_addr, 16, 4, 32);
|
||||
|
||||
gl_shader_stage stage = b->shader->info.stage;
|
||||
struct brw_nir_rt_mem_ray_defs world_ray_in = {};
|
||||
struct brw_nir_rt_mem_ray_defs object_ray_in = {};
|
||||
struct brw_nir_rt_mem_hit_defs hit_in = {};
|
||||
switch (stage) {
|
||||
case MESA_SHADER_ANY_HIT:
|
||||
case MESA_SHADER_CLOSEST_HIT:
|
||||
case MESA_SHADER_INTERSECTION:
|
||||
brw_nir_rt_load_mem_hit(b, &hit_in,
|
||||
stage == MESA_SHADER_CLOSEST_HIT);
|
||||
brw_nir_rt_load_mem_ray(b, &object_ray_in,
|
||||
BRW_RT_BVH_LEVEL_OBJECT);
|
||||
FALLTHROUGH;
|
||||
|
||||
case MESA_SHADER_MISS:
|
||||
brw_nir_rt_load_mem_ray(b, &world_ray_in,
|
||||
BRW_RT_BVH_LEVEL_WORLD);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
nir_def *thread_stack_base_addr = brw_nir_rt_sw_stack_addr(b, devinfo);
|
||||
nir_def *stack_base_offset = nir_channel(b, hotzone, 0);
|
||||
nir_def *stack_base_addr =
|
||||
nir_iadd(b, thread_stack_base_addr, nir_u2u64(b, stack_base_offset));
|
||||
ASSERTED bool seen_scratch_base_ptr_load = false;
|
||||
ASSERTED bool found_resume = false;
|
||||
|
||||
nir_foreach_block(block, impl) {
|
||||
nir_foreach_instr_safe(instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
|
||||
b->cursor = nir_after_instr(&intrin->instr);
|
||||
|
||||
nir_def *sysval = NULL;
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_load_scratch_base_ptr:
|
||||
assert(nir_intrinsic_base(intrin) == 1);
|
||||
seen_scratch_base_ptr_load = true;
|
||||
sysval = stack_base_addr;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_btd_stack_push_intel: {
|
||||
int32_t stack_size = nir_intrinsic_stack_size(intrin);
|
||||
if (stack_size > 0) {
|
||||
nir_def *child_stack_offset =
|
||||
nir_iadd_imm(b, stack_base_offset, stack_size);
|
||||
nir_store_global(b, hotzone_addr, 16, child_stack_offset, 0x1);
|
||||
}
|
||||
nir_instr_remove(instr);
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_rt_resume:
|
||||
/* This is the first "interesting" instruction */
|
||||
assert(block == nir_start_block(impl));
|
||||
assert(!seen_scratch_base_ptr_load);
|
||||
found_resume = true;
|
||||
|
||||
int32_t stack_size = nir_intrinsic_stack_size(intrin);
|
||||
if (stack_size > 0) {
|
||||
stack_base_offset =
|
||||
nir_iadd_imm(b, stack_base_offset, -stack_size);
|
||||
nir_store_global(b, hotzone_addr, 16, stack_base_offset, 0x1);
|
||||
stack_base_addr = nir_iadd(b, thread_stack_base_addr,
|
||||
nir_u2u64(b, stack_base_offset));
|
||||
}
|
||||
nir_instr_remove(instr);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_uniform: {
|
||||
/* We don't want to lower this in the launch trampoline. */
|
||||
if (stage == MESA_SHADER_COMPUTE)
|
||||
break;
|
||||
|
||||
sysval = brw_nir_load_global_const(b, intrin,
|
||||
nir_load_btd_global_arg_addr_intel(b),
|
||||
BRW_RT_PUSH_CONST_OFFSET);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_ray_launch_id:
|
||||
sysval = nir_channels(b, hotzone, 0xe);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_launch_size:
|
||||
sysval = globals.launch_size;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_world_origin:
|
||||
sysval = world_ray_in.orig;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_world_direction:
|
||||
sysval = world_ray_in.dir;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_object_origin:
|
||||
sysval = object_ray_in.orig;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_object_direction:
|
||||
sysval = object_ray_in.dir;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_t_min:
|
||||
/* It shouldn't matter which we pull this from */
|
||||
sysval = world_ray_in.t_near;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_t_max:
|
||||
if (stage == MESA_SHADER_MISS)
|
||||
sysval = world_ray_in.t_far;
|
||||
else
|
||||
sysval = hit_in.t;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_primitive_id:
|
||||
sysval = brw_nir_rt_load_primitive_id_from_hit(b,
|
||||
build_leaf_is_procedural(b, &hit_in),
|
||||
&hit_in);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_instance_id: {
|
||||
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
|
||||
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
|
||||
sysval = leaf.instance_index;
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_ray_object_to_world: {
|
||||
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
|
||||
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
|
||||
sysval = leaf.object_to_world[nir_intrinsic_column(intrin)];
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_ray_world_to_object: {
|
||||
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
|
||||
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
|
||||
sysval = leaf.world_to_object[nir_intrinsic_column(intrin)];
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_ray_hit_kind: {
|
||||
nir_def *tri_hit_kind =
|
||||
nir_bcsel(b, hit_in.front_face,
|
||||
nir_imm_int(b, BRW_RT_HIT_KIND_FRONT_FACE),
|
||||
nir_imm_int(b, BRW_RT_HIT_KIND_BACK_FACE));
|
||||
sysval = nir_bcsel(b, build_leaf_is_procedural(b, &hit_in),
|
||||
hit_in.aabb_hit_kind, tri_hit_kind);
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_ray_flags:
|
||||
/* We need to fetch the original ray flags we stored in the
|
||||
* leaf pointer, because the actual ray flags we get here
|
||||
* will include any flags passed on the pipeline at creation
|
||||
* time, and the spec for IncomingRayFlagsKHR says:
|
||||
* Setting pipeline flags on the raytracing pipeline must not
|
||||
* cause any corresponding flags to be set in variables with
|
||||
* this decoration.
|
||||
*/
|
||||
sysval = nir_u2u32(b, world_ray_in.inst_leaf_ptr);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_cull_mask:
|
||||
sysval = nir_u2u32(b, world_ray_in.ray_mask);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_geometry_index: {
|
||||
nir_def *geometry_index_dw =
|
||||
nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
|
||||
1, 32);
|
||||
sysval = nir_iand_imm(b, geometry_index_dw, BITFIELD_MASK(29));
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_ray_instance_custom_index: {
|
||||
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
|
||||
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
|
||||
sysval = leaf.instance_id;
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_shader_record_ptr:
|
||||
/* We can't handle this intrinsic in resume shaders because the
|
||||
* handle we get there won't be from the original SBT. The shader
|
||||
* call lowering/splitting pass should have ensured that this
|
||||
* value was spilled from the initial shader and unspilled in any
|
||||
* resume shaders that need it.
|
||||
*/
|
||||
assert(!found_resume);
|
||||
sysval = nir_load_btd_local_arg_addr_intel(b);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_base_mem_addr_intel:
|
||||
sysval = globals.base_mem_addr;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_hw_stack_size_intel:
|
||||
sysval = nir_imul_imm(b, globals.hw_stack_size, 64);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_sw_stack_size_intel:
|
||||
sysval = nir_imul_imm(b, globals.sw_stack_size, 64);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
|
||||
sysval = globals.num_dss_rt_stacks;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_hit_sbt_addr_intel:
|
||||
sysval = globals.hit_sbt_addr;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_hit_sbt_stride_intel:
|
||||
sysval = globals.hit_sbt_stride;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_miss_sbt_addr_intel:
|
||||
sysval = globals.miss_sbt_addr;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_miss_sbt_stride_intel:
|
||||
sysval = globals.miss_sbt_stride;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_callable_sbt_addr_intel:
|
||||
sysval = globals.call_sbt_addr;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_callable_sbt_stride_intel:
|
||||
sysval = globals.call_sbt_stride;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_btd_resume_sbt_addr_intel:
|
||||
sysval = nir_pack_64_2x32_split(b,
|
||||
nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW),
|
||||
nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH));
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_leaf_procedural_intel:
|
||||
sysval = build_leaf_is_procedural(b, &hit_in);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ray_triangle_vertex_positions: {
|
||||
struct brw_nir_rt_bvh_primitive_leaf_positions_defs pos;
|
||||
brw_nir_rt_load_bvh_primitive_leaf_positions(b, &pos, hit_in.prim_leaf_ptr);
|
||||
sysval = pos.positions[nir_intrinsic_column(intrin)];
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_leaf_opaque_intel: {
|
||||
if (stage == MESA_SHADER_INTERSECTION) {
|
||||
/* In intersection shaders, the opaque bit is passed to us in
|
||||
* the front_face bit.
|
||||
*/
|
||||
sysval = hit_in.front_face;
|
||||
} else {
|
||||
nir_def *flags_dw =
|
||||
nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
|
||||
1, 32);
|
||||
sysval = nir_i2b(b, nir_iand_imm(b, flags_dw, 1u << 30));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
continue;
|
||||
}
|
||||
|
||||
progress = true;
|
||||
|
||||
if (sysval) {
|
||||
nir_def_rewrite_uses(&intrin->def,
|
||||
sysval);
|
||||
nir_instr_remove(&intrin->instr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
nir_metadata_preserve(impl,
|
||||
progress ?
|
||||
nir_metadata_none :
|
||||
(nir_metadata_block_index |
|
||||
nir_metadata_dominance));
|
||||
}
|
||||
|
||||
/** Lower ray-tracing system values and intrinsics
|
||||
*
|
||||
* In most 3D shader stages, intrinsics are a fairly thin wrapper around
|
||||
* hardware functionality and system values represent magic bits that come
|
||||
* into the shader from FF hardware. Ray-tracing, however, looks a bit more
|
||||
* like the OpenGL 1.0 world where the underlying hardware is simple and most
|
||||
* of the API implementation is software.
|
||||
*
|
||||
* In particular, most things that are treated as system values (or built-ins
|
||||
* in SPIR-V) don't get magically dropped into registers for us. Instead, we
|
||||
* have to fetch them from the relevant data structures shared with the
|
||||
* ray-tracing hardware. Most come from either the RT_DISPATCH_GLOBALS or
|
||||
* from one of the MemHit data structures. Some, such as primitive_id require
|
||||
* us to fetch the leaf address from the MemHit struct and then manually read
|
||||
* the data out of the BVH. Instead of trying to emit all this code deep in
|
||||
* the back-end where we can't effectively optimize it, we lower it all to
|
||||
* global memory access in NIR.
|
||||
*
|
||||
* Once this pass is complete, the only real system values left are the two
|
||||
* argument pointer system values for BTD dispatch: btd_local_arg_addr and
|
||||
* btd_global_arg_addr.
|
||||
*/
|
||||
void
|
||||
brw_nir_lower_rt_intrinsics(nir_shader *nir,
|
||||
const struct intel_device_info *devinfo)
|
||||
{
|
||||
nir_foreach_function_impl(impl, nir) {
|
||||
lower_rt_intrinsics_impl(impl, devinfo);
|
||||
}
|
||||
}
|
||||
329
src/intel/compiler/elk/brw_nir_lower_shader_calls.c
Normal file
329
src/intel/compiler/elk/brw_nir_lower_shader_calls.c
Normal file
|
|
@ -0,0 +1,329 @@
|
|||
/*
|
||||
* Copyright © 2020 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_nir_rt.h"
|
||||
#include "brw_nir_rt_builder.h"
|
||||
#include "nir_phi_builder.h"
|
||||
|
||||
UNUSED static bool
|
||||
no_load_scratch_base_ptr_intrinsic(nir_shader *shader)
|
||||
{
|
||||
nir_foreach_function_impl(impl, shader) {
|
||||
nir_foreach_block(block, impl) {
|
||||
nir_foreach_instr(instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
if (intrin->intrinsic == nir_intrinsic_load_scratch_base_ptr)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Insert the appropriate return instruction at the end of the shader */
|
||||
void
|
||||
brw_nir_lower_shader_returns(nir_shader *shader)
|
||||
{
|
||||
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
|
||||
|
||||
/* Reserve scratch space at the start of the shader's per-thread scratch
|
||||
* space for the return BINDLESS_SHADER_RECORD address and data payload.
|
||||
* When a shader is called, the calling shader will write the return BSR
|
||||
* address in this region of the callee's scratch space.
|
||||
*
|
||||
* We could also put it at the end of the caller's scratch space. However,
|
||||
* doing this way means that a shader never accesses its caller's scratch
|
||||
* space unless given an explicit pointer (such as for ray payloads). It
|
||||
* also makes computing the address easier given that we want to apply an
|
||||
* alignment to the scratch offset to ensure we can make alignment
|
||||
* assumptions in the called shader.
|
||||
*
|
||||
* This isn't needed for ray-gen shaders because they end the thread and
|
||||
* never return to the calling trampoline shader.
|
||||
*/
|
||||
assert(no_load_scratch_base_ptr_intrinsic(shader));
|
||||
if (shader->info.stage != MESA_SHADER_RAYGEN)
|
||||
shader->scratch_size += BRW_BTD_STACK_CALLEE_DATA_SIZE;
|
||||
|
||||
nir_builder b = nir_builder_create(impl);
|
||||
|
||||
set_foreach(impl->end_block->predecessors, block_entry) {
|
||||
struct nir_block *block = (void *)block_entry->key;
|
||||
b.cursor = nir_after_block_before_jump(block);
|
||||
|
||||
switch (shader->info.stage) {
|
||||
case MESA_SHADER_RAYGEN:
|
||||
/* A raygen shader is always the root of the shader call tree. When
|
||||
* it ends, we retire the bindless stack ID and no further shaders
|
||||
* will be executed.
|
||||
*/
|
||||
assert(impl->end_block->predecessors->entries == 1);
|
||||
brw_nir_btd_retire(&b);
|
||||
break;
|
||||
|
||||
case MESA_SHADER_ANY_HIT:
|
||||
/* The default action of an any-hit shader is to accept the ray
|
||||
* intersection. Any-hit shaders may have more than one exit. Only
|
||||
* the final "normal" exit will actually need to accept the
|
||||
* intersection as any others should come from nir_jump_halt
|
||||
* instructions inserted after ignore_ray_intersection or
|
||||
* terminate_ray or the like. However, inserting an accept after
|
||||
* the ignore or terminate is safe because it'll get deleted later.
|
||||
*/
|
||||
nir_accept_ray_intersection(&b);
|
||||
break;
|
||||
|
||||
case MESA_SHADER_CALLABLE:
|
||||
case MESA_SHADER_MISS:
|
||||
case MESA_SHADER_CLOSEST_HIT:
|
||||
/* Callable, miss, and closest-hit shaders don't take any special
|
||||
* action at the end. They simply return back to the previous shader
|
||||
* in the call stack.
|
||||
*/
|
||||
assert(impl->end_block->predecessors->entries == 1);
|
||||
brw_nir_btd_return(&b);
|
||||
break;
|
||||
|
||||
case MESA_SHADER_INTERSECTION:
|
||||
/* This will be handled by brw_nir_lower_intersection_shader */
|
||||
break;
|
||||
|
||||
default:
|
||||
unreachable("Invalid callable shader stage");
|
||||
}
|
||||
}
|
||||
|
||||
nir_metadata_preserve(impl, nir_metadata_block_index |
|
||||
nir_metadata_dominance);
|
||||
}
|
||||
|
||||
static void
|
||||
store_resume_addr(nir_builder *b, nir_intrinsic_instr *call)
|
||||
{
|
||||
uint32_t call_idx = nir_intrinsic_call_idx(call);
|
||||
uint32_t offset = nir_intrinsic_stack_size(call);
|
||||
|
||||
/* First thing on the called shader's stack is the resume address
|
||||
* followed by a pointer to the payload.
|
||||
*/
|
||||
nir_def *resume_record_addr =
|
||||
nir_iadd_imm(b, nir_load_btd_resume_sbt_addr_intel(b),
|
||||
call_idx * BRW_BTD_RESUME_SBT_STRIDE);
|
||||
/* By the time we get here, any remaining shader/function memory
|
||||
* pointers have been lowered to SSA values.
|
||||
*/
|
||||
nir_def *payload_addr =
|
||||
nir_get_shader_call_payload_src(call)->ssa;
|
||||
brw_nir_rt_store_scratch(b, offset, BRW_BTD_STACK_ALIGN,
|
||||
nir_vec2(b, resume_record_addr, payload_addr),
|
||||
0xf /* write_mask */);
|
||||
|
||||
nir_btd_stack_push_intel(b, offset);
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_shader_trace_ray_instr(struct nir_builder *b, nir_instr *instr, void *data)
|
||||
{
|
||||
struct brw_bs_prog_key *key = data;
|
||||
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
return false;
|
||||
|
||||
/* Leave nir_intrinsic_rt_resume to be lowered by
|
||||
* brw_nir_lower_rt_intrinsics()
|
||||
*/
|
||||
nir_intrinsic_instr *call = nir_instr_as_intrinsic(instr);
|
||||
if (call->intrinsic != nir_intrinsic_rt_trace_ray)
|
||||
return false;
|
||||
|
||||
b->cursor = nir_instr_remove(instr);
|
||||
|
||||
store_resume_addr(b, call);
|
||||
|
||||
nir_def *as_addr = call->src[0].ssa;
|
||||
nir_def *ray_flags = call->src[1].ssa;
|
||||
/* From the SPIR-V spec:
|
||||
*
|
||||
* "Only the 8 least-significant bits of Cull Mask are used by this
|
||||
* instruction - other bits are ignored.
|
||||
*
|
||||
* Only the 4 least-significant bits of SBT Offset and SBT Stride are
|
||||
* used by this instruction - other bits are ignored.
|
||||
*
|
||||
* Only the 16 least-significant bits of Miss Index are used by this
|
||||
* instruction - other bits are ignored."
|
||||
*/
|
||||
nir_def *cull_mask = nir_iand_imm(b, call->src[2].ssa, 0xff);
|
||||
nir_def *sbt_offset = nir_iand_imm(b, call->src[3].ssa, 0xf);
|
||||
nir_def *sbt_stride = nir_iand_imm(b, call->src[4].ssa, 0xf);
|
||||
nir_def *miss_index = nir_iand_imm(b, call->src[5].ssa, 0xffff);
|
||||
nir_def *ray_orig = call->src[6].ssa;
|
||||
nir_def *ray_t_min = call->src[7].ssa;
|
||||
nir_def *ray_dir = call->src[8].ssa;
|
||||
nir_def *ray_t_max = call->src[9].ssa;
|
||||
|
||||
nir_def *root_node_ptr =
|
||||
brw_nir_rt_acceleration_structure_to_root_node(b, as_addr);
|
||||
|
||||
/* The hardware packet requires an address to the first element of the
|
||||
* hit SBT.
|
||||
*
|
||||
* In order to calculate this, we must multiply the "SBT Offset"
|
||||
* provided to OpTraceRay by the SBT stride provided for the hit SBT in
|
||||
* the call to vkCmdTraceRay() and add that to the base address of the
|
||||
* hit SBT. This stride is not to be confused with the "SBT Stride"
|
||||
* provided to OpTraceRay which is in units of this stride. It's a
|
||||
* rather terrible overload of the word "stride". The hardware docs
|
||||
* calls the SPIR-V stride value the "shader index multiplier" which is
|
||||
* a much more sane name.
|
||||
*/
|
||||
nir_def *hit_sbt_stride_B =
|
||||
nir_load_ray_hit_sbt_stride_intel(b);
|
||||
nir_def *hit_sbt_offset_B =
|
||||
nir_imul(b, sbt_offset, nir_u2u32(b, hit_sbt_stride_B));
|
||||
nir_def *hit_sbt_addr =
|
||||
nir_iadd(b, nir_load_ray_hit_sbt_addr_intel(b),
|
||||
nir_u2u64(b, hit_sbt_offset_B));
|
||||
|
||||
/* The hardware packet takes an address to the miss BSR. */
|
||||
nir_def *miss_sbt_stride_B =
|
||||
nir_load_ray_miss_sbt_stride_intel(b);
|
||||
nir_def *miss_sbt_offset_B =
|
||||
nir_imul(b, miss_index, nir_u2u32(b, miss_sbt_stride_B));
|
||||
nir_def *miss_sbt_addr =
|
||||
nir_iadd(b, nir_load_ray_miss_sbt_addr_intel(b),
|
||||
nir_u2u64(b, miss_sbt_offset_B));
|
||||
|
||||
struct brw_nir_rt_mem_ray_defs ray_defs = {
|
||||
.root_node_ptr = root_node_ptr,
|
||||
/* Combine the shader value given to traceRayEXT() with the pipeline
|
||||
* creation value VkPipelineCreateFlags.
|
||||
*/
|
||||
.ray_flags = nir_ior_imm(b, nir_u2u16(b, ray_flags), key->pipeline_ray_flags),
|
||||
.ray_mask = cull_mask,
|
||||
.hit_group_sr_base_ptr = hit_sbt_addr,
|
||||
.hit_group_sr_stride = nir_u2u16(b, hit_sbt_stride_B),
|
||||
.miss_sr_ptr = miss_sbt_addr,
|
||||
.orig = ray_orig,
|
||||
.t_near = ray_t_min,
|
||||
.dir = ray_dir,
|
||||
.t_far = ray_t_max,
|
||||
.shader_index_multiplier = sbt_stride,
|
||||
/* The instance leaf pointer is unused in the top level BVH traversal
|
||||
* since we always start from the root node. We can reuse that field to
|
||||
* store the ray_flags handed to traceRayEXT(). This will be reloaded
|
||||
* when the shader accesses gl_IncomingRayFlagsEXT (see
|
||||
* nir_intrinsic_load_ray_flags brw_nir_lower_rt_intrinsic.c)
|
||||
*/
|
||||
.inst_leaf_ptr = nir_u2u64(b, ray_flags),
|
||||
};
|
||||
brw_nir_rt_store_mem_ray(b, &ray_defs, BRW_RT_BVH_LEVEL_WORLD);
|
||||
|
||||
nir_trace_ray_intel(b,
|
||||
nir_load_btd_global_arg_addr_intel(b),
|
||||
nir_imm_int(b, BRW_RT_BVH_LEVEL_WORLD),
|
||||
nir_imm_int(b, GEN_RT_TRACE_RAY_INITAL),
|
||||
.synchronous = false);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_shader_call_instr(struct nir_builder *b, nir_intrinsic_instr *call,
|
||||
void *data)
|
||||
{
|
||||
if (call->intrinsic != nir_intrinsic_rt_execute_callable)
|
||||
return false;
|
||||
|
||||
b->cursor = nir_instr_remove(&call->instr);
|
||||
|
||||
store_resume_addr(b, call);
|
||||
|
||||
nir_def *sbt_offset32 =
|
||||
nir_imul(b, call->src[0].ssa,
|
||||
nir_u2u32(b, nir_load_callable_sbt_stride_intel(b)));
|
||||
nir_def *sbt_addr =
|
||||
nir_iadd(b, nir_load_callable_sbt_addr_intel(b),
|
||||
nir_u2u64(b, sbt_offset32));
|
||||
brw_nir_btd_spawn(b, sbt_addr);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
brw_nir_lower_shader_calls(nir_shader *shader, struct brw_bs_prog_key *key)
|
||||
{
|
||||
bool a = nir_shader_instructions_pass(shader,
|
||||
lower_shader_trace_ray_instr,
|
||||
nir_metadata_none,
|
||||
key);
|
||||
bool b = nir_shader_intrinsics_pass(shader, lower_shader_call_instr,
|
||||
nir_metadata_block_index |
|
||||
nir_metadata_dominance,
|
||||
NULL);
|
||||
return a || b;
|
||||
}
|
||||
|
||||
/** Creates a trivial return shader
|
||||
*
|
||||
* In most cases this shader doesn't actually do anything. It just needs to
|
||||
* return to the caller.
|
||||
*
|
||||
* By default, our HW has the ability to handle the fact that a shader is not
|
||||
* available and will execute the next following shader in the tracing call.
|
||||
* For instance, a RAYGEN shader traces a ray, the tracing generates a hit,
|
||||
* but there is no ANYHIT shader available. The HW should follow up by
|
||||
* execution the CLOSESTHIT shader.
|
||||
*
|
||||
* This default behavior can be changed through the RT_CTRL register
|
||||
* (privileged access) and when NULL shader checks are disabled, the HW will
|
||||
* instead call the call stack handler (this shader). This is what i915 is
|
||||
* doing as part of Wa_14013202645.
|
||||
*
|
||||
* In order to ensure the call to the CLOSESTHIT shader, this shader needs to
|
||||
* commit the ray and will not proceed with the BTD return. Similarly when the
|
||||
* same thing happen with the INTERSECTION shader, we should just carry on the
|
||||
* ray traversal with the continue operation.
|
||||
*
|
||||
*/
|
||||
nir_shader *
|
||||
brw_nir_create_trivial_return_shader(const struct brw_compiler *compiler,
|
||||
void *mem_ctx)
|
||||
{
|
||||
const nir_shader_compiler_options *nir_options =
|
||||
compiler->nir_options[MESA_SHADER_CALLABLE];
|
||||
|
||||
nir_builder _b = nir_builder_init_simple_shader(MESA_SHADER_CALLABLE,
|
||||
nir_options,
|
||||
"RT Trivial Return");
|
||||
nir_builder *b = &_b;
|
||||
|
||||
ralloc_steal(mem_ctx, b->shader);
|
||||
nir_shader *nir = b->shader;
|
||||
|
||||
NIR_PASS_V(nir, brw_nir_lower_shader_returns);
|
||||
|
||||
return nir;
|
||||
}
|
||||
765
src/intel/compiler/elk/brw_nir_lower_storage_image.c
Normal file
765
src/intel/compiler/elk/brw_nir_lower_storage_image.c
Normal file
|
|
@ -0,0 +1,765 @@
|
|||
/*
|
||||
* Copyright © 2018 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "isl/isl.h"
|
||||
|
||||
#include "brw_nir.h"
|
||||
#include "compiler/nir/nir_builder.h"
|
||||
#include "compiler/nir/nir_format_convert.h"
|
||||
|
||||
static nir_def *
|
||||
_load_image_param(nir_builder *b, nir_deref_instr *deref, unsigned offset)
|
||||
{
|
||||
nir_intrinsic_instr *load =
|
||||
nir_intrinsic_instr_create(b->shader,
|
||||
nir_intrinsic_image_deref_load_param_intel);
|
||||
load->src[0] = nir_src_for_ssa(&deref->def);
|
||||
nir_intrinsic_set_base(load, offset / 4);
|
||||
|
||||
switch (offset) {
|
||||
case ISL_IMAGE_PARAM_OFFSET_OFFSET:
|
||||
case ISL_IMAGE_PARAM_SWIZZLING_OFFSET:
|
||||
load->num_components = 2;
|
||||
break;
|
||||
case ISL_IMAGE_PARAM_TILING_OFFSET:
|
||||
case ISL_IMAGE_PARAM_SIZE_OFFSET:
|
||||
load->num_components = 3;
|
||||
break;
|
||||
case ISL_IMAGE_PARAM_STRIDE_OFFSET:
|
||||
load->num_components = 4;
|
||||
break;
|
||||
default:
|
||||
unreachable("Invalid param offset");
|
||||
}
|
||||
nir_def_init(&load->instr, &load->def, load->num_components, 32);
|
||||
|
||||
nir_builder_instr_insert(b, &load->instr);
|
||||
return &load->def;
|
||||
}
|
||||
|
||||
#define load_image_param(b, d, o) \
|
||||
_load_image_param(b, d, ISL_IMAGE_PARAM_##o##_OFFSET)
|
||||
|
||||
static nir_def *
|
||||
image_coord_is_in_bounds(nir_builder *b, nir_deref_instr *deref,
|
||||
nir_def *coord)
|
||||
{
|
||||
nir_def *size = load_image_param(b, deref, SIZE);
|
||||
nir_def *cmp = nir_ilt(b, coord, size);
|
||||
|
||||
unsigned coord_comps = glsl_get_sampler_coordinate_components(deref->type);
|
||||
nir_def *in_bounds = nir_imm_true(b);
|
||||
for (unsigned i = 0; i < coord_comps; i++)
|
||||
in_bounds = nir_iand(b, in_bounds, nir_channel(b, cmp, i));
|
||||
|
||||
return in_bounds;
|
||||
}
|
||||
|
||||
/** Calculate the offset in memory of the texel given by \p coord.
|
||||
*
|
||||
* This is meant to be used with untyped surface messages to access a tiled
|
||||
* surface, what involves taking into account the tiling and swizzling modes
|
||||
* of the surface manually so it will hopefully not happen very often.
|
||||
*
|
||||
* The tiling algorithm implemented here matches either the X or Y tiling
|
||||
* layouts supported by the hardware depending on the tiling coefficients
|
||||
* passed to the program as uniforms. See Volume 1 Part 2 Section 4.5
|
||||
* "Address Tiling Function" of the IVB PRM for an in-depth explanation of
|
||||
* the hardware tiling format.
|
||||
*/
|
||||
static nir_def *
|
||||
image_address(nir_builder *b, const struct intel_device_info *devinfo,
|
||||
nir_deref_instr *deref, nir_def *coord)
|
||||
{
|
||||
if (glsl_get_sampler_dim(deref->type) == GLSL_SAMPLER_DIM_1D &&
|
||||
glsl_sampler_type_is_array(deref->type)) {
|
||||
/* It's easier if 1D arrays are treated like 2D arrays */
|
||||
coord = nir_vec3(b, nir_channel(b, coord, 0),
|
||||
nir_imm_int(b, 0),
|
||||
nir_channel(b, coord, 1));
|
||||
} else {
|
||||
unsigned dims = glsl_get_sampler_coordinate_components(deref->type);
|
||||
coord = nir_trim_vector(b, coord, dims);
|
||||
}
|
||||
|
||||
nir_def *offset = load_image_param(b, deref, OFFSET);
|
||||
nir_def *tiling = load_image_param(b, deref, TILING);
|
||||
nir_def *stride = load_image_param(b, deref, STRIDE);
|
||||
|
||||
/* Shift the coordinates by the fixed surface offset. It may be non-zero
|
||||
* if the image is a single slice of a higher-dimensional surface, or if a
|
||||
* non-zero mipmap level of the surface is bound to the pipeline. The
|
||||
* offset needs to be applied here rather than at surface state set-up time
|
||||
* because the desired slice-level may start mid-tile, so simply shifting
|
||||
* the surface base address wouldn't give a well-formed tiled surface in
|
||||
* the general case.
|
||||
*/
|
||||
nir_def *xypos = (coord->num_components == 1) ?
|
||||
nir_vec2(b, coord, nir_imm_int(b, 0)) :
|
||||
nir_trim_vector(b, coord, 2);
|
||||
xypos = nir_iadd(b, xypos, offset);
|
||||
|
||||
/* The layout of 3-D textures in memory is sort-of like a tiling
|
||||
* format. At each miplevel, the slices are arranged in rows of
|
||||
* 2^level slices per row. The slice row is stored in tmp.y and
|
||||
* the slice within the row is stored in tmp.x.
|
||||
*
|
||||
* The layout of 2-D array textures and cubemaps is much simpler:
|
||||
* Depending on whether the ARYSPC_LOD0 layout is in use it will be
|
||||
* stored in memory as an array of slices, each one being a 2-D
|
||||
* arrangement of miplevels, or as a 2D arrangement of miplevels,
|
||||
* each one being an array of slices. In either case the separation
|
||||
* between slices of the same LOD is equal to the qpitch value
|
||||
* provided as stride.w.
|
||||
*
|
||||
* This code can be made to handle either 2D arrays and 3D textures
|
||||
* by passing in the miplevel as tile.z for 3-D textures and 0 in
|
||||
* tile.z for 2-D array textures.
|
||||
*
|
||||
* See Volume 1 Part 1 of the Gfx7 PRM, sections 6.18.4.7 "Surface
|
||||
* Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
|
||||
* of the hardware 3D texture and 2D array layouts.
|
||||
*/
|
||||
if (coord->num_components > 2) {
|
||||
/* Decompose z into a major (tmp.y) and a minor (tmp.x)
|
||||
* index.
|
||||
*/
|
||||
nir_def *z = nir_channel(b, coord, 2);
|
||||
nir_def *z_x = nir_ubfe(b, z, nir_imm_int(b, 0),
|
||||
nir_channel(b, tiling, 2));
|
||||
nir_def *z_y = nir_ushr(b, z, nir_channel(b, tiling, 2));
|
||||
|
||||
/* Take into account the horizontal (tmp.x) and vertical (tmp.y)
|
||||
* slice offset.
|
||||
*/
|
||||
xypos = nir_iadd(b, xypos, nir_imul(b, nir_vec2(b, z_x, z_y),
|
||||
nir_channels(b, stride, 0xc)));
|
||||
}
|
||||
|
||||
nir_def *addr;
|
||||
if (coord->num_components > 1) {
|
||||
/* Calculate the major/minor x and y indices. In order to
|
||||
* accommodate both X and Y tiling, the Y-major tiling format is
|
||||
* treated as being a bunch of narrow X-tiles placed next to each
|
||||
* other. This means that the tile width for Y-tiling is actually
|
||||
* the width of one sub-column of the Y-major tile where each 4K
|
||||
* tile has 8 512B sub-columns.
|
||||
*
|
||||
* The major Y value is the row of tiles in which the pixel lives.
|
||||
* The major X value is the tile sub-column in which the pixel
|
||||
* lives; for X tiling, this is the same as the tile column, for Y
|
||||
* tiling, each tile has 8 sub-columns. The minor X and Y indices
|
||||
* are the position within the sub-column.
|
||||
*/
|
||||
|
||||
/* Calculate the minor x and y indices. */
|
||||
nir_def *minor = nir_ubfe(b, xypos, nir_imm_int(b, 0),
|
||||
nir_trim_vector(b, tiling, 2));
|
||||
nir_def *major = nir_ushr(b, xypos, nir_trim_vector(b, tiling, 2));
|
||||
|
||||
/* Calculate the texel index from the start of the tile row and the
|
||||
* vertical coordinate of the row.
|
||||
* Equivalent to:
|
||||
* tmp.x = (major.x << tile.y << tile.x) +
|
||||
* (minor.y << tile.x) + minor.x
|
||||
* tmp.y = major.y << tile.y
|
||||
*/
|
||||
nir_def *idx_x, *idx_y;
|
||||
idx_x = nir_ishl(b, nir_channel(b, major, 0), nir_channel(b, tiling, 1));
|
||||
idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 1));
|
||||
idx_x = nir_ishl(b, idx_x, nir_channel(b, tiling, 0));
|
||||
idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 0));
|
||||
idx_y = nir_ishl(b, nir_channel(b, major, 1), nir_channel(b, tiling, 1));
|
||||
|
||||
/* Add it to the start of the tile row. */
|
||||
nir_def *idx;
|
||||
idx = nir_imul(b, idx_y, nir_channel(b, stride, 1));
|
||||
idx = nir_iadd(b, idx, idx_x);
|
||||
|
||||
/* Multiply by the Bpp value. */
|
||||
addr = nir_imul(b, idx, nir_channel(b, stride, 0));
|
||||
|
||||
if (devinfo->ver < 8 && devinfo->platform != INTEL_PLATFORM_BYT) {
|
||||
/* Take into account the two dynamically specified shifts. Both are
|
||||
* used to implement swizzling of X-tiled surfaces. For Y-tiled
|
||||
* surfaces only one bit needs to be XOR-ed with bit 6 of the memory
|
||||
* address, so a swz value of 0xff (actually interpreted as 31 by the
|
||||
* hardware) will be provided to cause the relevant bit of tmp.y to
|
||||
* be zero and turn the first XOR into the identity. For linear
|
||||
* surfaces or platforms lacking address swizzling both shifts will
|
||||
* be 0xff causing the relevant bits of both tmp.x and .y to be zero,
|
||||
* what effectively disables swizzling.
|
||||
*/
|
||||
nir_def *swizzle = load_image_param(b, deref, SWIZZLING);
|
||||
nir_def *shift0 = nir_ushr(b, addr, nir_channel(b, swizzle, 0));
|
||||
nir_def *shift1 = nir_ushr(b, addr, nir_channel(b, swizzle, 1));
|
||||
|
||||
/* XOR tmp.x and tmp.y with bit 6 of the memory address. */
|
||||
nir_def *bit = nir_iand(b, nir_ixor(b, shift0, shift1),
|
||||
nir_imm_int(b, 1 << 6));
|
||||
addr = nir_ixor(b, addr, bit);
|
||||
}
|
||||
} else {
|
||||
/* Multiply by the Bpp/stride value. Note that the addr.y may be
|
||||
* non-zero even if the image is one-dimensional because a vertical
|
||||
* offset may have been applied above to select a non-zero slice or
|
||||
* level of a higher-dimensional texture.
|
||||
*/
|
||||
nir_def *idx;
|
||||
idx = nir_imul(b, nir_channel(b, xypos, 1), nir_channel(b, stride, 1));
|
||||
idx = nir_iadd(b, nir_channel(b, xypos, 0), idx);
|
||||
addr = nir_imul(b, idx, nir_channel(b, stride, 0));
|
||||
}
|
||||
|
||||
return addr;
|
||||
}
|
||||
|
||||
struct format_info {
|
||||
const struct isl_format_layout *fmtl;
|
||||
unsigned chans;
|
||||
unsigned bits[4];
|
||||
};
|
||||
|
||||
static struct format_info
|
||||
get_format_info(enum isl_format fmt)
|
||||
{
|
||||
const struct isl_format_layout *fmtl = isl_format_get_layout(fmt);
|
||||
|
||||
return (struct format_info) {
|
||||
.fmtl = fmtl,
|
||||
.chans = isl_format_get_num_channels(fmt),
|
||||
.bits = {
|
||||
fmtl->channels.r.bits,
|
||||
fmtl->channels.g.bits,
|
||||
fmtl->channels.b.bits,
|
||||
fmtl->channels.a.bits
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
/* Convert a value loaded through the lowered format (@lower_fmt) back into
 * the value the shader would have seen had the load used the image's real
 * format (@image_fmt), then widen it to @dest_components channels.
 *
 * The conversion runs in up to three steps: (1) unpack a packed R32 payload
 * or bitcast between homogeneous channel widths, (2) apply the per-channel
 * type conversion (unorm/snorm/half -> float), and (3) pad the vector with
 * zeros and a one in the alpha slot as required by the image-load contract.
 */
static nir_def *
convert_color_for_load(nir_builder *b, const struct intel_device_info *devinfo,
                       nir_def *color,
                       enum isl_format image_fmt, enum isl_format lower_fmt,
                       unsigned dest_components)
{
   /* Identical formats need no conversion, only vector expansion. */
   if (image_fmt == lower_fmt)
      goto expand_vec;

   if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
      assert(lower_fmt == ISL_FORMAT_R32_UINT);
      color = nir_format_unpack_11f11f10f(b, color);
      goto expand_vec;
   }

   struct format_info image = get_format_info(image_fmt);
   struct format_info lower = get_format_info(lower_fmt);

   const bool needs_sign_extension =
      isl_format_has_snorm_channel(image_fmt) ||
      isl_format_has_sint_channel(image_fmt);

   /* We only check the red channel to detect if we need to pack/unpack */
   assert(image.bits[0] != lower.bits[0] ||
          memcmp(image.bits, lower.bits, sizeof(image.bits)) == 0);

   if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
      /* The whole image texel was packed into a single R32 dword; unpack
       * it, sign-extending when the image format is signed.
       */
      if (needs_sign_extension)
         color = nir_format_unpack_sint(b, color, image.bits, image.chans);
      else
         color = nir_format_unpack_uint(b, color, image.bits, image.chans);
   } else {
      /* All these formats are homogeneous */
      for (unsigned i = 1; i < image.chans; i++)
         assert(image.bits[i] == image.bits[0]);

      /* On IVB, we rely on the undocumented behavior that typed reads from
       * surfaces of the unsupported R8 and R16 formats return useful data in
       * their least significant bits. However, the data in the high bits is
       * garbage so we have to discard it.
       */
      if (devinfo->verx10 == 70 &&
          (lower_fmt == ISL_FORMAT_R16_UINT ||
           lower_fmt == ISL_FORMAT_R8_UINT))
         color = nir_format_mask_uvec(b, color, lower.bits);

      if (image.bits[0] != lower.bits[0]) {
         color = nir_format_bitcast_uvec_unmasked(b, color, lower.bits[0],
                                                  image.bits[0]);
      }

      if (needs_sign_extension)
         color = nir_format_sign_extend_ivec(b, color, image.bits);
   }

   /* Apply the per-channel data-type conversion. */
   switch (image.fmtl->channels.r.type) {
   case ISL_UNORM:
      assert(isl_format_has_uint_channel(lower_fmt));
      color = nir_format_unorm_to_float(b, color, image.bits);
      break;

   case ISL_SNORM:
      assert(isl_format_has_uint_channel(lower_fmt));
      color = nir_format_snorm_to_float(b, color, image.bits);
      break;

   case ISL_SFLOAT:
      if (image.bits[0] == 16)
         color = nir_unpack_half_2x16_split_x(b, color);
      break;

   case ISL_UINT:
   case ISL_SINT:
      break;

   default:
      unreachable("Invalid image channel type");
   }

expand_vec:
   assert(dest_components == 1 || dest_components == 4);
   assert(color->num_components <= dest_components);
   if (color->num_components == dest_components)
      return color;

   /* Pad missing channels: zeros for g/b, one (int or float flavor) for a. */
   nir_def *comps[4];
   for (unsigned i = 0; i < color->num_components; i++)
      comps[i] = nir_channel(b, color, i);

   for (unsigned i = color->num_components; i < 3; i++)
      comps[i] = nir_imm_int(b, 0);

   if (color->num_components < 4) {
      if (isl_format_has_int_channel(image_fmt))
         comps[3] = nir_imm_int(b, 1);
      else
         comps[3] = nir_imm_float(b, 1);
   }

   return nir_vec(b, comps, dest_components);
}
|
||||
|
||||
/* Lower an image_deref_load (or sparse load, when @sparse is set) whose
 * image format needs software format conversion.
 *
 * Two paths: if the HW has a matching typed format, the load itself is kept
 * (retyped to the lowered format's channel count) and a conversion is
 * emitted after it.  Otherwise (64/128bpb formats, pre-Gfx9) the load is
 * replaced with a bounds-checked raw untyped load plus conversion.
 *
 * Returns true if the instruction was rewritten.
 */
static bool
lower_image_load_instr(nir_builder *b,
                       const struct intel_device_info *devinfo,
                       nir_intrinsic_instr *intrin,
                       bool sparse)
{
   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
   nir_variable *var = nir_deref_instr_get_variable(deref);

   /* Format-less (write-only style) images have nothing to convert. */
   if (var->data.image.format == PIPE_FORMAT_NONE)
      return false;

   const enum isl_format image_fmt =
      isl_format_for_pipe_format(var->data.image.format);

   if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
      const enum isl_format lower_fmt =
         isl_lower_storage_image_format(devinfo, image_fmt);
      /* For sparse loads the last component is the residency code, which
       * is not part of the color data being converted.
       */
      const unsigned dest_components =
         sparse ? (intrin->num_components - 1) : intrin->num_components;

      /* Use an undef to hold the uses of the load while we do the color
       * conversion.
       */
      nir_def *placeholder = nir_undef(b, 4, 32);
      nir_def_rewrite_uses(&intrin->def, placeholder);

      /* Retype the load to the lowered format's channel count. */
      intrin->num_components = isl_format_get_num_channels(lower_fmt);
      intrin->def.num_components = intrin->num_components;

      b->cursor = nir_after_instr(&intrin->instr);

      nir_def *color = convert_color_for_load(b, devinfo,
                                              &intrin->def,
                                              image_fmt, lower_fmt,
                                              dest_components);

      if (sparse) {
         /* Put the sparse component back on the original instruction */
         intrin->num_components++;
         intrin->def.num_components = intrin->num_components;

         /* Carry over the sparse component without modifying it with the
          * converted color.
          */
         nir_def *sparse_color[NIR_MAX_VEC_COMPONENTS];
         for (unsigned i = 0; i < dest_components; i++)
            sparse_color[i] = nir_channel(b, color, i);
         sparse_color[dest_components] =
            nir_channel(b, &intrin->def, intrin->num_components - 1);
         color = nir_vec(b, sparse_color, dest_components + 1);
      }

      nir_def_rewrite_uses(placeholder, color);
      nir_instr_remove(placeholder->parent_instr);
   } else {
      /* This code part is only useful prior to Gfx9, we do not have plans to
       * enable sparse there.
       */
      assert(!sparse);

      const struct isl_format_layout *image_fmtl =
         isl_format_get_layout(image_fmt);
      /* We have a matching typed format for everything 32b and below */
      assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
      enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
                                ISL_FORMAT_R32G32_UINT :
                                ISL_FORMAT_R32G32B32A32_UINT;
      const unsigned dest_components = intrin->num_components;

      b->cursor = nir_instr_remove(&intrin->instr);

      nir_def *coord = intrin->src[1].ssa;

      nir_def *do_load = image_coord_is_in_bounds(b, deref, coord);
      if (devinfo->verx10 == 70) {
         /* Check whether the first stride component (i.e. the Bpp value)
          * is greater than four, what on Gfx7 indicates that a surface of
          * type RAW has been bound for untyped access. Reading or writing
          * to a surface of type other than RAW using untyped surface
          * messages causes a hang on IVB and VLV.
          */
         nir_def *stride = load_image_param(b, deref, STRIDE);
         nir_def *is_raw =
            nir_igt_imm(b, nir_channel(b, stride, 0), 4);
         do_load = nir_iand(b, do_load, is_raw);
      }
      nir_push_if(b, do_load);

      nir_def *addr = image_address(b, devinfo, deref, coord);
      nir_def *load =
         nir_image_deref_load_raw_intel(b, image_fmtl->bpb / 32, 32,
                                        &deref->def, addr);

      nir_push_else(b, NULL);

      /* Out-of-bounds (or non-RAW surface) loads produce zero. */
      nir_def *zero = nir_imm_zero(b, load->num_components, 32);

      nir_pop_if(b, NULL);

      nir_def *value = nir_if_phi(b, load, zero);

      nir_def *color = convert_color_for_load(b, devinfo, value,
                                              image_fmt, raw_fmt,
                                              dest_components);

      nir_def_rewrite_uses(&intrin->def, color);
   }

   return true;
}
|
||||
|
||||
/* Convert @color, expressed in the image's real format (@image_fmt), into
 * the representation expected by a store through the lowered format
 * (@lower_fmt): trim to the real channel count, apply the per-channel type
 * conversion (float -> unorm/snorm/half, int clamping), then pack into a
 * single R32 dword or bitcast to the lowered channel width as needed.
 */
static nir_def *
convert_color_for_store(nir_builder *b, const struct intel_device_info *devinfo,
                        nir_def *color,
                        enum isl_format image_fmt, enum isl_format lower_fmt)
{
   struct format_info image = get_format_info(image_fmt);
   struct format_info lower = get_format_info(lower_fmt);

   /* Drop components beyond what the image format actually stores. */
   color = nir_trim_vector(b, color, image.chans);

   if (image_fmt == lower_fmt)
      return color;

   if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
      assert(lower_fmt == ISL_FORMAT_R32_UINT);
      return nir_format_pack_11f11f10f(b, color);
   }

   /* Per-channel data-type conversion. */
   switch (image.fmtl->channels.r.type) {
   case ISL_UNORM:
      assert(isl_format_has_uint_channel(lower_fmt));
      color = nir_format_float_to_unorm(b, color, image.bits);
      break;

   case ISL_SNORM:
      assert(isl_format_has_uint_channel(lower_fmt));
      color = nir_format_float_to_snorm(b, color, image.bits);
      break;

   case ISL_SFLOAT:
      if (image.bits[0] == 16)
         color = nir_format_float_to_half(b, color);
      break;

   case ISL_UINT:
      color = nir_format_clamp_uint(b, color, image.bits);
      break;

   case ISL_SINT:
      color = nir_format_clamp_sint(b, color, image.bits);
      break;

   default:
      unreachable("Invalid image channel type");
   }

   /* Mask off high bits of signed sub-dword channels so the packed value
    * doesn't carry sign-extended garbage into neighboring channels.
    */
   if (image.bits[0] < 32 &&
       (isl_format_has_snorm_channel(image_fmt) ||
        isl_format_has_sint_channel(image_fmt)))
      color = nir_format_mask_uvec(b, color, image.bits);

   if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
      /* Pack the whole texel into a single R32 dword. */
      color = nir_format_pack_uint(b, color, image.bits, image.chans);
   } else {
      /* All these formats are homogeneous */
      for (unsigned i = 1; i < image.chans; i++)
         assert(image.bits[i] == image.bits[0]);

      if (image.bits[0] != lower.bits[0]) {
         color = nir_format_bitcast_uvec_unmasked(b, color, image.bits[0],
                                                  lower.bits[0]);
      }
   }

   return color;
}
|
||||
|
||||
/* Lower an image_deref_store whose image format needs software format
 * conversion.  Mirrors lower_image_load_instr: either retype the store to a
 * matching typed format with a conversion emitted before it, or replace it
 * with a bounds-checked raw untyped store (64/128bpb formats, pre-Gfx9).
 *
 * Returns true if the instruction was rewritten.
 */
static bool
lower_image_store_instr(nir_builder *b,
                        const struct intel_device_info *devinfo,
                        nir_intrinsic_instr *intrin)
{
   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
   nir_variable *var = nir_deref_instr_get_variable(deref);

   /* For write-only surfaces, we trust that the hardware can just do the
    * conversion for us.
    */
   if (var->data.access & ACCESS_NON_READABLE)
      return false;

   if (var->data.image.format == PIPE_FORMAT_NONE)
      return false;

   const enum isl_format image_fmt =
      isl_format_for_pipe_format(var->data.image.format);

   if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
      const enum isl_format lower_fmt =
         isl_lower_storage_image_format(devinfo, image_fmt);

      /* Color conversion goes before the store */
      b->cursor = nir_before_instr(&intrin->instr);

      /* src[3] is the color operand of image_deref_store. */
      nir_def *color = convert_color_for_store(b, devinfo,
                                               intrin->src[3].ssa,
                                               image_fmt, lower_fmt);
      intrin->num_components = isl_format_get_num_channels(lower_fmt);
      nir_src_rewrite(&intrin->src[3], color);
   } else {
      const struct isl_format_layout *image_fmtl =
         isl_format_get_layout(image_fmt);
      /* We have a matching typed format for everything 32b and below */
      assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
      enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
                                ISL_FORMAT_R32G32_UINT :
                                ISL_FORMAT_R32G32B32A32_UINT;

      b->cursor = nir_instr_remove(&intrin->instr);

      nir_def *coord = intrin->src[1].ssa;

      nir_def *do_store = image_coord_is_in_bounds(b, deref, coord);
      if (devinfo->verx10 == 70) {
         /* Check whether the first stride component (i.e. the Bpp value)
          * is greater than four, what on Gfx7 indicates that a surface of
          * type RAW has been bound for untyped access. Reading or writing
          * to a surface of type other than RAW using untyped surface
          * messages causes a hang on IVB and VLV.
          */
         nir_def *stride = load_image_param(b, deref, STRIDE);
         nir_def *is_raw =
            nir_igt_imm(b, nir_channel(b, stride, 0), 4);
         do_store = nir_iand(b, do_store, is_raw);
      }
      nir_push_if(b, do_store);

      nir_def *addr = image_address(b, devinfo, deref, coord);
      nir_def *color = convert_color_for_store(b, devinfo,
                                               intrin->src[3].ssa,
                                               image_fmt, raw_fmt);

      /* Emit the raw store inside the bounds check; out-of-bounds stores
       * are simply dropped (no else branch needed).
       */
      nir_intrinsic_instr *store =
         nir_intrinsic_instr_create(b->shader,
                                    nir_intrinsic_image_deref_store_raw_intel);
      store->src[0] = nir_src_for_ssa(&deref->def);
      store->src[1] = nir_src_for_ssa(addr);
      store->src[2] = nir_src_for_ssa(color);
      store->num_components = image_fmtl->bpb / 32;
      nir_builder_instr_insert(b, &store->instr);

      nir_pop_if(b, NULL);
   }

   return true;
}
|
||||
|
||||
/* Wrap an image atomic in a "surface is bound" check on Gfx7 (verx10 < 75).
 * The atomic itself is kept; it is re-inserted inside the conditional and
 * its result is merged with zero via a phi for the unbound case.
 *
 * Returns true if the instruction was rewritten.
 */
static bool
lower_image_atomic_instr(nir_builder *b,
                         const struct intel_device_info *devinfo,
                         nir_intrinsic_instr *intrin)
{
   /* HSW (verx10 75) and later handle typed atomics correctly. */
   if (devinfo->verx10 >= 75)
      return false;

   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);

   b->cursor = nir_instr_remove(&intrin->instr);

   /* Use an undef to hold the uses of the load conversion. */
   nir_def *placeholder = nir_undef(b, 4, 32);
   nir_def_rewrite_uses(&intrin->def, placeholder);

   /* Check the first component of the size field to find out if the
    * image is bound. Necessary on IVB for typed atomics because
    * they don't seem to respect null surfaces and will happily
    * corrupt or read random memory when no image is bound.
    */
   nir_def *size = load_image_param(b, deref, SIZE);
   nir_def *zero = nir_imm_int(b, 0);
   nir_push_if(b, nir_ine(b, nir_channel(b, size, 0), zero));

   /* Re-insert the original atomic inside the bound check. */
   nir_builder_instr_insert(b, &intrin->instr);

   nir_pop_if(b, NULL);

   nir_def *result = nir_if_phi(b, &intrin->def, zero);
   nir_def_rewrite_uses(placeholder, result);

   return true;
}
|
||||
|
||||
/* Lower image_deref_size to a read of the SIZE image param for images that
 * have no matching typed surface format (and hence no real surface for the
 * back-end to run TXS against).
 *
 * Returns true if the instruction was rewritten.
 */
static bool
lower_image_size_instr(nir_builder *b,
                       const struct intel_device_info *devinfo,
                       nir_intrinsic_instr *intrin)
{
   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
   nir_variable *var = nir_deref_instr_get_variable(deref);

   /* For write-only images, we have an actual image surface so we fall back
    * and let the back-end emit a TXS for this.
    */
   if (var->data.access & ACCESS_NON_READABLE)
      return false;

   if (var->data.image.format == PIPE_FORMAT_NONE)
      return false;

   /* If we have a matching typed format, then we have an actual image surface
    * so we fall back and let the back-end emit a TXS for this.
    */
   const enum isl_format image_fmt =
      isl_format_for_pipe_format(var->data.image.format);
   if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt))
      return false;

   /* src[1] is the LOD; only LOD 0 is supported here. */
   assert(nir_src_as_uint(intrin->src[1]) == 0);

   b->cursor = nir_instr_remove(&intrin->instr);

   nir_def *size = load_image_param(b, deref, SIZE);

   nir_def *comps[4] = { NULL, NULL, NULL, NULL };

   assert(nir_intrinsic_image_dim(intrin) != GLSL_SAMPLER_DIM_CUBE);
   unsigned coord_comps = glsl_get_sampler_coordinate_components(deref->type);
   for (unsigned c = 0; c < coord_comps; c++)
      comps[c] = nir_channel(b, size, c);

   /* Pad the remaining requested components with 1. */
   for (unsigned c = coord_comps; c < intrin->def.num_components; ++c)
      comps[c] = nir_imm_int(b, 1);

   nir_def *vec = nir_vec(b, comps, intrin->def.num_components);
   nir_def_rewrite_uses(&intrin->def, vec);

   return true;
}
|
||||
|
||||
static bool
|
||||
brw_nir_lower_storage_image_instr(nir_builder *b,
|
||||
nir_instr *instr,
|
||||
void *cb_data)
|
||||
{
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
return false;
|
||||
const struct brw_nir_lower_storage_image_opts *opts = cb_data;
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_image_deref_load:
|
||||
if (opts->lower_loads)
|
||||
return lower_image_load_instr(b, opts->devinfo, intrin, false);
|
||||
return false;
|
||||
|
||||
case nir_intrinsic_image_deref_sparse_load:
|
||||
if (opts->lower_loads)
|
||||
return lower_image_load_instr(b, opts->devinfo, intrin, true);
|
||||
return false;
|
||||
|
||||
case nir_intrinsic_image_deref_store:
|
||||
if (opts->lower_stores)
|
||||
return lower_image_store_instr(b, opts->devinfo, intrin);
|
||||
return false;
|
||||
|
||||
case nir_intrinsic_image_deref_atomic:
|
||||
case nir_intrinsic_image_deref_atomic_swap:
|
||||
if (opts->lower_atomics)
|
||||
return lower_image_atomic_instr(b, opts->devinfo, intrin);
|
||||
return false;
|
||||
|
||||
case nir_intrinsic_image_deref_size:
|
||||
if (opts->lower_get_size)
|
||||
return lower_image_size_instr(b, opts->devinfo, intrin);
|
||||
return false;
|
||||
|
||||
default:
|
||||
/* Nothing to do */
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
brw_nir_lower_storage_image(nir_shader *shader,
|
||||
const struct brw_nir_lower_storage_image_opts *opts)
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
const nir_lower_image_options image_options = {
|
||||
.lower_cube_size = true,
|
||||
.lower_image_samples_to_one = true,
|
||||
};
|
||||
|
||||
progress |= nir_lower_image(shader, &image_options);
|
||||
|
||||
progress |= nir_shader_instructions_pass(shader,
|
||||
brw_nir_lower_storage_image_instr,
|
||||
nir_metadata_none,
|
||||
(void *)opts);
|
||||
|
||||
return progress;
|
||||
}
|
||||
536
src/intel/compiler/elk/brw_nir_rt.c
Normal file
536
src/intel/compiler/elk/brw_nir_rt.c
Normal file
|
|
@ -0,0 +1,536 @@
|
|||
/*
|
||||
* Copyright © 2020 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "intel_nir.h"
|
||||
#include "brw_nir_rt.h"
|
||||
#include "brw_nir_rt_builder.h"
|
||||
#include "intel_nir.h"
|
||||
|
||||
/* Retype @deref's SSA def to @num_components x @bit_size, fixing up array
 * indices to the new bit size as NIR requires.  Returns true if anything
 * changed.
 */
static bool
resize_deref(nir_builder *b, nir_deref_instr *deref,
             unsigned num_components, unsigned bit_size)
{
   if (deref->def.num_components == num_components &&
       deref->def.bit_size == bit_size)
      return false;

   /* NIR requires array indices have to match the deref bit size */
   if (deref->def.bit_size != bit_size &&
       (deref->deref_type == nir_deref_type_array ||
        deref->deref_type == nir_deref_type_ptr_as_array)) {
      b->cursor = nir_before_instr(&deref->instr);
      nir_def *idx;
      if (nir_src_is_const(deref->arr.index)) {
         /* Rebuild constant indices directly at the new bit size. */
         idx = nir_imm_intN_t(b, nir_src_as_int(deref->arr.index), bit_size);
      } else {
         idx = nir_i2iN(b, deref->arr.index.ssa, bit_size);
      }
      nir_src_rewrite(&deref->arr.index, idx);
   }

   deref->def.num_components = num_components;
   deref->def.bit_size = bit_size;

   return true;
}
|
||||
|
||||
/* Rewrite derefs of ray-tracing I/O variables (shader_call_data and
 * ray_hit_attrib) into function_temp derefs cast from the appropriate
 * runtime address, and resize all function_temp derefs to scalar 64-bit in
 * preparation for nir_lower_explicit_io with 64-bit global addressing.
 *
 * Returns true if the shader was modified.
 */
static bool
lower_rt_io_derefs(nir_shader *shader)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   bool progress = false;

   unsigned num_shader_call_vars = 0;
   nir_foreach_variable_with_modes(var, shader, nir_var_shader_call_data)
      num_shader_call_vars++;

   unsigned num_ray_hit_attrib_vars = 0;
   nir_foreach_variable_with_modes(var, shader, nir_var_ray_hit_attrib)
      num_ray_hit_attrib_vars++;

   /* At most one payload is allowed because it's an input. Technically, this
    * is also true for hit attribute variables. However, after we inline an
    * any-hit shader into an intersection shader, we can end up with multiple
    * hit attribute variables. They'll end up mapping to a cast from the same
    * base pointer so this is fine.
    */
   assert(num_shader_call_vars <= 1);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   /* Base address for the incoming call data, stored by the caller on our
    * scratch stack.
    */
   nir_def *call_data_addr = NULL;
   if (num_shader_call_vars > 0) {
      assert(shader->scratch_size >= BRW_BTD_STACK_CALLEE_DATA_SIZE);
      call_data_addr =
         brw_nir_rt_load_scratch(&b, BRW_BTD_STACK_CALL_DATA_PTR_OFFSET, 8,
                                 1, 64);
      progress = true;
   }

   gl_shader_stage stage = shader->info.stage;
   nir_def *hit_attrib_addr = NULL;
   if (num_ray_hit_attrib_vars > 0) {
      assert(stage == MESA_SHADER_ANY_HIT ||
             stage == MESA_SHADER_CLOSEST_HIT ||
             stage == MESA_SHADER_INTERSECTION);
      nir_def *hit_addr =
         brw_nir_rt_mem_hit_addr(&b, stage == MESA_SHADER_CLOSEST_HIT);
      /* The vec2 barycentrics are in 2nd and 3rd dwords of MemHit */
      nir_def *bary_addr = nir_iadd_imm(&b, hit_addr, 4);
      /* Procedural hits read the intersection shader's attribute data;
       * triangle hits read the barycentrics straight out of MemHit.
       */
      hit_attrib_addr = nir_bcsel(&b, nir_load_leaf_procedural_intel(&b),
                                  brw_nir_rt_hit_attrib_data_addr(&b),
                                  bary_addr);
      progress = true;
   }

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_deref)
            continue;

         nir_deref_instr *deref = nir_instr_as_deref(instr);
         if (nir_deref_mode_is(deref, nir_var_shader_call_data)) {
            deref->modes = nir_var_function_temp;
            if (deref->deref_type == nir_deref_type_var) {
               b.cursor = nir_before_instr(&deref->instr);
               nir_deref_instr *cast =
                  nir_build_deref_cast(&b, call_data_addr,
                                       nir_var_function_temp,
                                       deref->var->type, 0);
               nir_def_rewrite_uses(&deref->def,
                                    &cast->def);
               nir_instr_remove(&deref->instr);
               progress = true;
            }
         } else if (nir_deref_mode_is(deref, nir_var_ray_hit_attrib)) {
            deref->modes = nir_var_function_temp;
            if (deref->deref_type == nir_deref_type_var) {
               b.cursor = nir_before_instr(&deref->instr);
               nir_deref_instr *cast =
                  nir_build_deref_cast(&b, hit_attrib_addr,
                                       nir_var_function_temp,
                                       deref->type, 0);
               nir_def_rewrite_uses(&deref->def,
                                    &cast->def);
               nir_instr_remove(&deref->instr);
               progress = true;
            }
         }

         /* We're going to lower all function_temp memory to scratch using
          * 64-bit addresses. We need to resize all our derefs first or else
          * nir_lower_explicit_io will have a fit.
          */
         if (nir_deref_mode_is(deref, nir_var_function_temp) &&
             resize_deref(&b, deref, 1, 64))
            progress = true;
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}
|
||||
|
||||
/** Lowers ray-tracing shader I/O and scratch access
 *
 * SPV_KHR_ray_tracing adds three new types of I/O, each of which need their
 * own bit of special care:
 *
 *  - Shader payload data: This is represented by the IncomingCallableData
 *    and IncomingRayPayload storage classes which are both represented by
 *    nir_var_call_data in NIR. There is at most one of these per-shader and
 *    they contain payload data passed down the stack from the parent shader
 *    when it calls executeCallable() or traceRay(). In our implementation,
 *    the actual storage lives in the calling shader's scratch space and we're
 *    passed a pointer to it.
 *
 *  - Hit attribute data: This is represented by the HitAttribute storage
 *    class in SPIR-V and nir_var_ray_hit_attrib in NIR. For triangle
 *    geometry, it's supposed to contain two floats which are the barycentric
 *    coordinates. For AABS/procedural geometry, it contains the hit data
 *    written out by the intersection shader. In our implementation, it's a
 *    64-bit pointer which points either to the u/v area of the relevant
 *    MemHit data structure or the space right after the HW ray stack entry.
 *
 *  - Shader record buffer data: This allows read-only access to the data
 *    stored in the SBT right after the bindless shader handles. It's
 *    effectively a UBO with a magic address. Coming out of spirv_to_nir,
 *    we get a nir_intrinsic_load_shader_record_ptr which is cast to a
 *    nir_var_mem_global deref and all access happens through that. The
 *    shader_record_ptr system value is handled in brw_nir_lower_rt_intrinsics
 *    and we assume nir_lower_explicit_io is called elsewhere thanks to
 *    VK_KHR_buffer_device_address so there's really nothing to do here.
 *
 * We also handle lowering any remaining function_temp variables to scratch at
 * this point. This gets rid of any remaining arrays and also takes care of
 * the sending side of ray payloads where we pass pointers to a function_temp
 * variable down the call stack.
 */
static void
lower_rt_io_and_scratch(nir_shader *nir)
{
   /* First, we to ensure all the I/O variables have explicit types. Because
    * these are shader-internal and don't come in from outside, they don't
    * have an explicit memory layout and we have to assign them one.
    */
   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
              nir_var_function_temp |
              nir_var_shader_call_data |
              nir_var_ray_hit_attrib,
              glsl_get_natural_size_align_bytes);

   /* Now patch any derefs to I/O vars */
   NIR_PASS_V(nir, lower_rt_io_derefs);

   /* Finally, lower any remaining function_temp, mem_constant, or
    * ray_hit_attrib access to 64-bit global memory access.
    */
   NIR_PASS_V(nir, nir_lower_explicit_io,
              nir_var_function_temp |
              nir_var_mem_constant |
              nir_var_ray_hit_attrib,
              nir_address_format_64bit_global);
}
|
||||
|
||||
/* Emit the code sequence for terminating a ray from an any-hit shader:
 * commit the hit and jump to the closest-hit shader, or return straight to
 * the caller if the ray flags say to skip the closest-hit shader.  Both
 * branches end in a halt jump.
 */
static void
build_terminate_ray(nir_builder *b)
{
   nir_def *skip_closest_hit = nir_test_mask(b, nir_load_ray_flags(b),
      BRW_RT_RAY_FLAG_SKIP_CLOSEST_HIT_SHADER);
   nir_push_if(b, skip_closest_hit);
   {
      /* The shader that calls traceRay() is unable to access any ray hit
       * information except for that which is explicitly written into the ray
       * payload by shaders invoked during the trace. If there's no closest-
       * hit shader, then accepting the hit has no observable effect; it's
       * just extra memory traffic for no reason.
       */
      brw_nir_btd_return(b);
      nir_jump(b, nir_jump_halt);
   }
   nir_push_else(b, NULL);
   {
      /* The closest hit shader is in the same shader group as the any-hit
       * shader that we're currently in. We can get the address for its SBT
       * handle by looking at the shader record pointer and subtracting the
       * size of a SBT handle. The BINDLESS_SHADER_RECORD for a closest hit
       * shader is the first one in the SBT handle.
       */
      nir_def *closest_hit =
         nir_iadd_imm(b, nir_load_shader_record_ptr(b),
                         -BRW_RT_SBT_HANDLE_SIZE);

      brw_nir_rt_commit_hit(b);
      brw_nir_btd_spawn(b, closest_hit);
      nir_jump(b, nir_jump_halt);
   }
   nir_pop_if(b, NULL);
}
|
||||
|
||||
/** Lowers away ray walk intrinsics
 *
 * This lowers terminate_ray, ignore_ray_intersection, and the NIR-specific
 * accept_ray_intersection intrinsics to the appropriate Intel-specific
 * intrinsics.
 *
 * Returns true if the shader was modified.
 */
static bool
lower_ray_walk_intrinsics(nir_shader *shader,
                          const struct intel_device_info *devinfo)
{
   assert(shader->info.stage == MESA_SHADER_ANY_HIT ||
          shader->info.stage == MESA_SHADER_INTERSECTION);

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   nir_builder b = nir_builder_create(impl);

   bool progress = false;
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

         switch (intrin->intrinsic) {
         case nir_intrinsic_ignore_ray_intersection: {
            b.cursor = nir_instr_remove(&intrin->instr);

            /* We put the newly emitted code inside a dummy if because it's
             * going to contain a jump instruction and we don't want to deal
             * with that mess here. It'll get dealt with by our control-flow
             * optimization passes.
             */
            nir_push_if(&b, nir_imm_true(&b));
            nir_trace_ray_intel(&b,
                                nir_load_btd_global_arg_addr_intel(&b),
                                nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT),
                                nir_imm_int(&b, GEN_RT_TRACE_RAY_CONTINUE),
                                .synchronous = false);
            nir_jump(&b, nir_jump_halt);
            nir_pop_if(&b, NULL);
            progress = true;
            break;
         }

         case nir_intrinsic_accept_ray_intersection: {
            b.cursor = nir_instr_remove(&intrin->instr);

            /* Terminate-on-first-hit rays end the walk here; others commit
             * the hit and continue the traversal.
             */
            nir_def *terminate = nir_test_mask(&b, nir_load_ray_flags(&b),
               BRW_RT_RAY_FLAG_TERMINATE_ON_FIRST_HIT);
            nir_push_if(&b, terminate);
            {
               build_terminate_ray(&b);
            }
            nir_push_else(&b, NULL);
            {
               nir_trace_ray_intel(&b,
                                   nir_load_btd_global_arg_addr_intel(&b),
                                   nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT),
                                   nir_imm_int(&b, GEN_RT_TRACE_RAY_COMMIT),
                                   .synchronous = false);
               nir_jump(&b, nir_jump_halt);
            }
            nir_pop_if(&b, NULL);
            progress = true;
            break;
         }

         case nir_intrinsic_terminate_ray: {
            b.cursor = nir_instr_remove(&intrin->instr);
            build_terminate_ray(&b);
            progress = true;
            break;
         }

         default:
            break;
         }
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_none);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}
|
||||
|
||||
/* Lower a ray-generation shader: lower shader returns, then lower RT I/O
 * and function_temp storage to scratch.
 */
void
brw_nir_lower_raygen(nir_shader *nir)
{
   assert(nir->info.stage == MESA_SHADER_RAYGEN);
   NIR_PASS_V(nir, brw_nir_lower_shader_returns);
   lower_rt_io_and_scratch(nir);
}
|
||||
|
||||
/* Lower an any-hit shader: lower shader returns and the ray-walk
 * intrinsics, then lower RT I/O and scratch.
 */
void
brw_nir_lower_any_hit(nir_shader *nir, const struct intel_device_info *devinfo)
{
   assert(nir->info.stage == MESA_SHADER_ANY_HIT);
   NIR_PASS_V(nir, brw_nir_lower_shader_returns);
   NIR_PASS_V(nir, lower_ray_walk_intrinsics, devinfo);
   lower_rt_io_and_scratch(nir);
}
|
||||
|
||||
/* Lower a closest-hit shader: lower shader returns, then RT I/O and
 * scratch.
 */
void
brw_nir_lower_closest_hit(nir_shader *nir)
{
   assert(nir->info.stage == MESA_SHADER_CLOSEST_HIT);
   NIR_PASS_V(nir, brw_nir_lower_shader_returns);
   lower_rt_io_and_scratch(nir);
}
|
||||
|
||||
/* Lower a miss shader: lower shader returns, then RT I/O and scratch. */
void
brw_nir_lower_miss(nir_shader *nir)
{
   assert(nir->info.stage == MESA_SHADER_MISS);
   NIR_PASS_V(nir, brw_nir_lower_shader_returns);
   lower_rt_io_and_scratch(nir);
}
|
||||
|
||||
/* Lower a callable shader: lower shader returns, then RT I/O and scratch. */
void
brw_nir_lower_callable(nir_shader *nir)
{
   assert(nir->info.stage == MESA_SHADER_CALLABLE);
   NIR_PASS_V(nir, brw_nir_lower_shader_returns);
   lower_rt_io_and_scratch(nir);
}
|
||||
|
||||
/* Lower an intersection shader, inlining @any_hit (may be NULL) into it,
 * then lower ray-walk intrinsics and RT I/O/scratch on the combined result.
 */
void
brw_nir_lower_combined_intersection_any_hit(nir_shader *intersection,
                                            const nir_shader *any_hit,
                                            const struct intel_device_info *devinfo)
{
   assert(intersection->info.stage == MESA_SHADER_INTERSECTION);
   assert(any_hit == NULL || any_hit->info.stage == MESA_SHADER_ANY_HIT);
   NIR_PASS_V(intersection, brw_nir_lower_shader_returns);
   NIR_PASS_V(intersection, brw_nir_lower_intersection_shader,
              any_hit, devinfo);
   NIR_PASS_V(intersection, lower_ray_walk_intrinsics, devinfo);
   lower_rt_io_and_scratch(intersection);
}
|
||||
|
||||
static nir_def *
|
||||
build_load_uniform(nir_builder *b, unsigned offset,
|
||||
unsigned num_components, unsigned bit_size)
|
||||
{
|
||||
return nir_load_uniform(b, num_components, bit_size, nir_imm_int(b, 0),
|
||||
.base = offset,
|
||||
.range = num_components * bit_size / 8);
|
||||
}
|
||||
|
||||
#define load_trampoline_param(b, name, num_components, bit_size) \
|
||||
build_load_uniform((b), offsetof(struct brw_rt_raygen_trampoline_params, name), \
|
||||
(num_components), (bit_size))
|
||||
|
||||
nir_shader *
|
||||
brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
|
||||
void *mem_ctx)
|
||||
{
|
||||
const struct intel_device_info *devinfo = compiler->devinfo;
|
||||
const nir_shader_compiler_options *nir_options =
|
||||
compiler->nir_options[MESA_SHADER_COMPUTE];
|
||||
|
||||
STATIC_ASSERT(sizeof(struct brw_rt_raygen_trampoline_params) == 32);
|
||||
|
||||
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
|
||||
nir_options,
|
||||
"RT Ray-Gen Trampoline");
|
||||
ralloc_steal(mem_ctx, b.shader);
|
||||
|
||||
b.shader->info.workgroup_size_variable = true;
|
||||
|
||||
/* The RT global data and raygen BINDLESS_SHADER_RECORD addresses are
|
||||
* passed in as push constants in the first register. We deal with the
|
||||
* raygen BSR address here; the global data we'll deal with later.
|
||||
*/
|
||||
b.shader->num_uniforms = 32;
|
||||
nir_def *raygen_param_bsr_addr =
|
||||
load_trampoline_param(&b, raygen_bsr_addr, 1, 64);
|
||||
nir_def *is_indirect =
|
||||
nir_i2b(&b, load_trampoline_param(&b, is_indirect, 1, 8));
|
||||
nir_def *local_shift =
|
||||
nir_u2u32(&b, load_trampoline_param(&b, local_group_size_log2, 3, 8));
|
||||
|
||||
nir_def *raygen_indirect_bsr_addr;
|
||||
nir_push_if(&b, is_indirect);
|
||||
{
|
||||
raygen_indirect_bsr_addr =
|
||||
nir_load_global_constant(&b, raygen_param_bsr_addr,
|
||||
8 /* align */,
|
||||
1 /* components */,
|
||||
64 /* bit_size */);
|
||||
}
|
||||
nir_pop_if(&b, NULL);
|
||||
|
||||
nir_def *raygen_bsr_addr =
|
||||
nir_if_phi(&b, raygen_indirect_bsr_addr, raygen_param_bsr_addr);
|
||||
|
||||
nir_def *global_id = nir_load_workgroup_id_zero_base(&b);
|
||||
nir_def *simd_channel = nir_load_subgroup_invocation(&b);
|
||||
nir_def *local_x =
|
||||
nir_ubfe(&b, simd_channel, nir_imm_int(&b, 0),
|
||||
nir_channel(&b, local_shift, 0));
|
||||
nir_def *local_y =
|
||||
nir_ubfe(&b, simd_channel, nir_channel(&b, local_shift, 0),
|
||||
nir_channel(&b, local_shift, 1));
|
||||
nir_def *local_z =
|
||||
nir_ubfe(&b, simd_channel,
|
||||
nir_iadd(&b, nir_channel(&b, local_shift, 0),
|
||||
nir_channel(&b, local_shift, 1)),
|
||||
nir_channel(&b, local_shift, 2));
|
||||
nir_def *launch_id =
|
||||
nir_iadd(&b, nir_ishl(&b, global_id, local_shift),
|
||||
nir_vec3(&b, local_x, local_y, local_z));
|
||||
|
||||
nir_def *launch_size = nir_load_ray_launch_size(&b);
|
||||
nir_push_if(&b, nir_ball(&b, nir_ult(&b, launch_id, launch_size)));
|
||||
{
|
||||
nir_store_global(&b, brw_nir_rt_sw_hotzone_addr(&b, devinfo), 16,
|
||||
nir_vec4(&b, nir_imm_int(&b, 0), /* Stack ptr */
|
||||
nir_channel(&b, launch_id, 0),
|
||||
nir_channel(&b, launch_id, 1),
|
||||
nir_channel(&b, launch_id, 2)),
|
||||
0xf /* write mask */);
|
||||
|
||||
brw_nir_btd_spawn(&b, raygen_bsr_addr);
|
||||
}
|
||||
nir_push_else(&b, NULL);
|
||||
{
|
||||
/* Even though these invocations aren't being used for anything, the
|
||||
* hardware allocated stack IDs for them. They need to retire them.
|
||||
*/
|
||||
brw_nir_btd_retire(&b);
|
||||
}
|
||||
nir_pop_if(&b, NULL);
|
||||
|
||||
nir_shader *nir = b.shader;
|
||||
nir->info.name = ralloc_strdup(nir, "RT: TraceRay trampoline");
|
||||
nir_validate_shader(nir, "in brw_nir_create_raygen_trampoline");
|
||||
|
||||
struct brw_nir_compiler_opts opts = {};
|
||||
brw_preprocess_nir(compiler, nir, &opts);
|
||||
|
||||
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo);
|
||||
|
||||
b = nir_builder_create(nir_shader_get_entrypoint(b.shader));
|
||||
/* brw_nir_lower_rt_intrinsics will leave us with a btd_global_arg_addr
|
||||
* intrinsic which doesn't exist in compute shaders. We also created one
|
||||
* above when we generated the BTD spawn intrinsic. Now we go through and
|
||||
* replace them with a uniform load.
|
||||
*/
|
||||
nir_foreach_block(block, b.impl) {
|
||||
nir_foreach_instr_safe(instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
if (intrin->intrinsic != nir_intrinsic_load_btd_global_arg_addr_intel)
|
||||
continue;
|
||||
|
||||
b.cursor = nir_before_instr(&intrin->instr);
|
||||
nir_def *global_arg_addr =
|
||||
load_trampoline_param(&b, rt_disp_globals_addr, 1, 64);
|
||||
nir_def_rewrite_uses(&intrin->def,
|
||||
global_arg_addr);
|
||||
nir_instr_remove(instr);
|
||||
}
|
||||
}
|
||||
|
||||
NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, NULL);
|
||||
|
||||
const bool is_scalar = true;
|
||||
brw_nir_optimize(nir, is_scalar, devinfo);
|
||||
|
||||
return nir;
|
||||
}
|
||||
76
src/intel/compiler/elk/brw_nir_rt.h
Normal file
76
src/intel/compiler/elk/brw_nir_rt.h
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
/*
|
||||
* Copyright © 2020 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_NIR_RT_H
|
||||
#define BRW_NIR_RT_H
|
||||
|
||||
#include "brw_nir.h"
|
||||
#include "brw_rt.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void brw_nir_lower_raygen(nir_shader *nir);
|
||||
void brw_nir_lower_any_hit(nir_shader *nir,
|
||||
const struct intel_device_info *devinfo);
|
||||
void brw_nir_lower_closest_hit(nir_shader *nir);
|
||||
void brw_nir_lower_miss(nir_shader *nir);
|
||||
void brw_nir_lower_callable(nir_shader *nir);
|
||||
void brw_nir_lower_combined_intersection_any_hit(nir_shader *intersection,
|
||||
const nir_shader *any_hit,
|
||||
const struct intel_device_info *devinfo);
|
||||
|
||||
/* We reserve the first 16B of the stack for callee data pointers */
|
||||
#define BRW_BTD_STACK_RESUME_BSR_ADDR_OFFSET 0
|
||||
#define BRW_BTD_STACK_CALL_DATA_PTR_OFFSET 8
|
||||
#define BRW_BTD_STACK_CALLEE_DATA_SIZE 16
|
||||
|
||||
/* We require the stack to be 8B aligned at the start of a shader */
|
||||
#define BRW_BTD_STACK_ALIGN 8
|
||||
|
||||
bool brw_nir_lower_ray_queries(nir_shader *shader,
|
||||
const struct intel_device_info *devinfo);
|
||||
|
||||
void brw_nir_lower_shader_returns(nir_shader *shader);
|
||||
|
||||
bool brw_nir_lower_shader_calls(nir_shader *shader, struct brw_bs_prog_key *key);
|
||||
|
||||
void brw_nir_lower_rt_intrinsics(nir_shader *shader,
|
||||
const struct intel_device_info *devinfo);
|
||||
void brw_nir_lower_intersection_shader(nir_shader *intersection,
|
||||
const nir_shader *any_hit,
|
||||
const struct intel_device_info *devinfo);
|
||||
|
||||
nir_shader *
|
||||
brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
|
||||
void *mem_ctx);
|
||||
nir_shader *
|
||||
brw_nir_create_trivial_return_shader(const struct brw_compiler *compiler,
|
||||
void *mem_ctx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* BRW_NIR_RT_H */
|
||||
990
src/intel/compiler/elk/brw_nir_rt_builder.h
Normal file
990
src/intel/compiler/elk/brw_nir_rt_builder.h
Normal file
|
|
@ -0,0 +1,990 @@
|
|||
/*
|
||||
* Copyright © 2020 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_NIR_RT_BUILDER_H
|
||||
#define BRW_NIR_RT_BUILDER_H
|
||||
|
||||
/* This file provides helpers to access memory based data structures that the
|
||||
* RT hardware reads/writes and their locations.
|
||||
*
|
||||
* See also "Memory Based Data Structures for Ray Tracing" (BSpec 47547) and
|
||||
* "Ray Tracing Address Computation for Memory Resident Structures" (BSpec
|
||||
* 47550).
|
||||
*/
|
||||
|
||||
#include "brw_rt.h"
|
||||
#include "nir_builder.h"
|
||||
|
||||
#define is_access_for_builder(b) \
|
||||
((b)->shader->info.stage == MESA_SHADER_FRAGMENT ? \
|
||||
ACCESS_INCLUDE_HELPERS : 0)
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_rt_load(nir_builder *b, nir_def *addr, unsigned align,
|
||||
unsigned components, unsigned bit_size)
|
||||
{
|
||||
return nir_build_load_global(b, components, bit_size, addr,
|
||||
.align_mul = align,
|
||||
.access = is_access_for_builder(b));
|
||||
}
|
||||
|
||||
static inline void
|
||||
brw_nir_rt_store(nir_builder *b, nir_def *addr, unsigned align,
|
||||
nir_def *value, unsigned write_mask)
|
||||
{
|
||||
nir_build_store_global(b, value, addr,
|
||||
.align_mul = align,
|
||||
.write_mask = (write_mask) &
|
||||
BITFIELD_MASK(value->num_components),
|
||||
.access = is_access_for_builder(b));
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_rt_load_const(nir_builder *b, unsigned components,
|
||||
nir_def *addr, nir_def *pred)
|
||||
{
|
||||
return nir_load_global_const_block_intel(b, components, addr, pred);
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_load_btd_dss_id(nir_builder *b)
|
||||
{
|
||||
return nir_load_topology_id_intel(b, .base = BRW_TOPOLOGY_ID_DSS);
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_rt_load_num_simd_lanes_per_dss(nir_builder *b,
|
||||
const struct intel_device_info *devinfo)
|
||||
{
|
||||
return nir_imm_int(b, devinfo->num_thread_per_eu *
|
||||
devinfo->max_eus_per_subslice *
|
||||
16 /* The RT computation is based off SIMD16 */);
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_load_eu_thread_simd(nir_builder *b)
|
||||
{
|
||||
return nir_load_topology_id_intel(b, .base = BRW_TOPOLOGY_ID_EU_THREAD_SIMD);
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_rt_async_stack_id(nir_builder *b)
|
||||
{
|
||||
return nir_iadd(b, nir_umul_32x16(b, nir_load_ray_num_dss_rt_stacks_intel(b),
|
||||
brw_load_btd_dss_id(b)),
|
||||
nir_load_btd_stack_id_intel(b));
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_rt_sync_stack_id(nir_builder *b)
|
||||
{
|
||||
return brw_load_eu_thread_simd(b);
|
||||
}
|
||||
|
||||
/* We have our own load/store scratch helpers because they emit a global
|
||||
* memory read or write based on the scratch_base_ptr system value rather
|
||||
* than a load/store_scratch intrinsic.
|
||||
*/
|
||||
static inline nir_def *
|
||||
brw_nir_rt_load_scratch(nir_builder *b, uint32_t offset, unsigned align,
|
||||
unsigned num_components, unsigned bit_size)
|
||||
{
|
||||
nir_def *addr =
|
||||
nir_iadd_imm(b, nir_load_scratch_base_ptr(b, 1, 64, 1), offset);
|
||||
return brw_nir_rt_load(b, addr, MIN2(align, BRW_BTD_STACK_ALIGN),
|
||||
num_components, bit_size);
|
||||
}
|
||||
|
||||
static inline void
|
||||
brw_nir_rt_store_scratch(nir_builder *b, uint32_t offset, unsigned align,
|
||||
nir_def *value, nir_component_mask_t write_mask)
|
||||
{
|
||||
nir_def *addr =
|
||||
nir_iadd_imm(b, nir_load_scratch_base_ptr(b, 1, 64, 1), offset);
|
||||
brw_nir_rt_store(b, addr, MIN2(align, BRW_BTD_STACK_ALIGN),
|
||||
value, write_mask);
|
||||
}
|
||||
|
||||
static inline void
|
||||
brw_nir_btd_spawn(nir_builder *b, nir_def *record_addr)
|
||||
{
|
||||
nir_btd_spawn_intel(b, nir_load_btd_global_arg_addr_intel(b), record_addr);
|
||||
}
|
||||
|
||||
static inline void
|
||||
brw_nir_btd_retire(nir_builder *b)
|
||||
{
|
||||
nir_btd_retire_intel(b);
|
||||
}
|
||||
|
||||
/** This is a pseudo-op which does a bindless return
|
||||
*
|
||||
* It loads the return address from the stack and calls btd_spawn to spawn the
|
||||
* resume shader.
|
||||
*/
|
||||
static inline void
|
||||
brw_nir_btd_return(struct nir_builder *b)
|
||||
{
|
||||
nir_def *resume_addr =
|
||||
brw_nir_rt_load_scratch(b, BRW_BTD_STACK_RESUME_BSR_ADDR_OFFSET,
|
||||
8 /* align */, 1, 64);
|
||||
brw_nir_btd_spawn(b, resume_addr);
|
||||
}
|
||||
|
||||
static inline void
|
||||
assert_def_size(nir_def *def, unsigned num_components, unsigned bit_size)
|
||||
{
|
||||
assert(def->num_components == num_components);
|
||||
assert(def->bit_size == bit_size);
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_num_rt_stacks(nir_builder *b,
|
||||
const struct intel_device_info *devinfo)
|
||||
{
|
||||
return nir_imul_imm(b, nir_load_ray_num_dss_rt_stacks_intel(b),
|
||||
intel_device_info_dual_subslice_id_bound(devinfo));
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_rt_sw_hotzone_addr(nir_builder *b,
|
||||
const struct intel_device_info *devinfo)
|
||||
{
|
||||
nir_def *offset32 =
|
||||
nir_imul_imm(b, brw_nir_rt_async_stack_id(b),
|
||||
BRW_RT_SIZEOF_HOTZONE);
|
||||
|
||||
offset32 = nir_iadd(b, offset32, nir_ineg(b,
|
||||
nir_imul_imm(b, brw_nir_num_rt_stacks(b, devinfo),
|
||||
BRW_RT_SIZEOF_HOTZONE)));
|
||||
|
||||
return nir_iadd(b, nir_load_ray_base_mem_addr_intel(b),
|
||||
nir_i2i64(b, offset32));
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_rt_sync_stack_addr(nir_builder *b,
|
||||
nir_def *base_mem_addr,
|
||||
const struct intel_device_info *devinfo)
|
||||
{
|
||||
/* For Ray queries (Synchronous Ray Tracing), the formula is similar but
|
||||
* goes down from rtMemBasePtr :
|
||||
*
|
||||
* syncBase = RTDispatchGlobals.rtMemBasePtr
|
||||
* - (DSSID * NUM_SIMD_LANES_PER_DSS + SyncStackID + 1)
|
||||
* * syncStackSize
|
||||
*
|
||||
* We assume that we can calculate a 32-bit offset first and then add it
|
||||
* to the 64-bit base address at the end.
|
||||
*/
|
||||
nir_def *offset32 =
|
||||
nir_imul(b,
|
||||
nir_iadd(b,
|
||||
nir_imul(b, brw_load_btd_dss_id(b),
|
||||
brw_nir_rt_load_num_simd_lanes_per_dss(b, devinfo)),
|
||||
nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
|
||||
nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
|
||||
return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_rt_stack_addr(nir_builder *b)
|
||||
{
|
||||
/* From the BSpec "Address Computation for Memory Based Data Structures:
|
||||
* Ray and TraversalStack (Async Ray Tracing)":
|
||||
*
|
||||
* stackBase = RTDispatchGlobals.rtMemBasePtr
|
||||
* + (DSSID * RTDispatchGlobals.numDSSRTStacks + stackID)
|
||||
* * RTDispatchGlobals.stackSizePerRay // 64B aligned
|
||||
*
|
||||
* We assume that we can calculate a 32-bit offset first and then add it
|
||||
* to the 64-bit base address at the end.
|
||||
*/
|
||||
nir_def *offset32 =
|
||||
nir_imul(b, brw_nir_rt_async_stack_id(b),
|
||||
nir_load_ray_hw_stack_size_intel(b));
|
||||
return nir_iadd(b, nir_load_ray_base_mem_addr_intel(b),
|
||||
nir_u2u64(b, offset32));
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_rt_mem_hit_addr_from_addr(nir_builder *b,
|
||||
nir_def *stack_addr,
|
||||
bool committed)
|
||||
{
|
||||
return nir_iadd_imm(b, stack_addr, committed ? 0 : BRW_RT_SIZEOF_HIT_INFO);
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_rt_mem_hit_addr(nir_builder *b, bool committed)
|
||||
{
|
||||
return nir_iadd_imm(b, brw_nir_rt_stack_addr(b),
|
||||
committed ? 0 : BRW_RT_SIZEOF_HIT_INFO);
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_rt_hit_attrib_data_addr(nir_builder *b)
|
||||
{
|
||||
return nir_iadd_imm(b, brw_nir_rt_stack_addr(b),
|
||||
BRW_RT_OFFSETOF_HIT_ATTRIB_DATA);
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_rt_mem_ray_addr(nir_builder *b,
|
||||
nir_def *stack_addr,
|
||||
enum brw_rt_bvh_level bvh_level)
|
||||
{
|
||||
/* From the BSpec "Address Computation for Memory Based Data Structures:
|
||||
* Ray and TraversalStack (Async Ray Tracing)":
|
||||
*
|
||||
* rayBase = stackBase + sizeof(HitInfo) * 2 // 64B aligned
|
||||
* rayPtr = rayBase + bvhLevel * sizeof(Ray); // 64B aligned
|
||||
*
|
||||
* In Vulkan, we always have exactly two levels of BVH: World and Object.
|
||||
*/
|
||||
uint32_t offset = BRW_RT_SIZEOF_HIT_INFO * 2 +
|
||||
bvh_level * BRW_RT_SIZEOF_RAY;
|
||||
return nir_iadd_imm(b, stack_addr, offset);
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_rt_sw_stack_addr(nir_builder *b,
|
||||
const struct intel_device_info *devinfo)
|
||||
{
|
||||
nir_def *addr = nir_load_ray_base_mem_addr_intel(b);
|
||||
|
||||
nir_def *offset32 = nir_imul(b, brw_nir_num_rt_stacks(b, devinfo),
|
||||
nir_load_ray_hw_stack_size_intel(b));
|
||||
addr = nir_iadd(b, addr, nir_u2u64(b, offset32));
|
||||
|
||||
nir_def *offset_in_stack =
|
||||
nir_imul(b, nir_u2u64(b, brw_nir_rt_async_stack_id(b)),
|
||||
nir_u2u64(b, nir_load_ray_sw_stack_size_intel(b)));
|
||||
|
||||
return nir_iadd(b, addr, offset_in_stack);
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
nir_unpack_64_4x16_split_z(nir_builder *b, nir_def *val)
|
||||
{
|
||||
return nir_unpack_32_2x16_split_x(b, nir_unpack_64_2x32_split_y(b, val));
|
||||
}
|
||||
|
||||
struct brw_nir_rt_globals_defs {
|
||||
nir_def *base_mem_addr;
|
||||
nir_def *call_stack_handler_addr;
|
||||
nir_def *hw_stack_size;
|
||||
nir_def *num_dss_rt_stacks;
|
||||
nir_def *hit_sbt_addr;
|
||||
nir_def *hit_sbt_stride;
|
||||
nir_def *miss_sbt_addr;
|
||||
nir_def *miss_sbt_stride;
|
||||
nir_def *sw_stack_size;
|
||||
nir_def *launch_size;
|
||||
nir_def *call_sbt_addr;
|
||||
nir_def *call_sbt_stride;
|
||||
nir_def *resume_sbt_addr;
|
||||
};
|
||||
|
||||
static inline void
|
||||
brw_nir_rt_load_globals_addr(nir_builder *b,
|
||||
struct brw_nir_rt_globals_defs *defs,
|
||||
nir_def *addr)
|
||||
{
|
||||
nir_def *data;
|
||||
data = brw_nir_rt_load_const(b, 16, addr, nir_imm_true(b));
|
||||
defs->base_mem_addr = nir_pack_64_2x32(b, nir_trim_vector(b, data, 2));
|
||||
|
||||
defs->call_stack_handler_addr =
|
||||
nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
|
||||
|
||||
defs->hw_stack_size = nir_channel(b, data, 4);
|
||||
defs->num_dss_rt_stacks = nir_iand_imm(b, nir_channel(b, data, 5), 0xffff);
|
||||
defs->hit_sbt_addr =
|
||||
nir_pack_64_2x32_split(b, nir_channel(b, data, 8),
|
||||
nir_extract_i16(b, nir_channel(b, data, 9),
|
||||
nir_imm_int(b, 0)));
|
||||
defs->hit_sbt_stride =
|
||||
nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 9));
|
||||
defs->miss_sbt_addr =
|
||||
nir_pack_64_2x32_split(b, nir_channel(b, data, 10),
|
||||
nir_extract_i16(b, nir_channel(b, data, 11),
|
||||
nir_imm_int(b, 0)));
|
||||
defs->miss_sbt_stride =
|
||||
nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 11));
|
||||
defs->sw_stack_size = nir_channel(b, data, 12);
|
||||
defs->launch_size = nir_channels(b, data, 0x7u << 13);
|
||||
|
||||
data = brw_nir_rt_load_const(b, 8, nir_iadd_imm(b, addr, 64), nir_imm_true(b));
|
||||
defs->call_sbt_addr =
|
||||
nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
|
||||
nir_extract_i16(b, nir_channel(b, data, 1),
|
||||
nir_imm_int(b, 0)));
|
||||
defs->call_sbt_stride =
|
||||
nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 1));
|
||||
|
||||
defs->resume_sbt_addr =
|
||||
nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
|
||||
}
|
||||
|
||||
static inline void
|
||||
brw_nir_rt_load_globals(nir_builder *b,
|
||||
struct brw_nir_rt_globals_defs *defs)
|
||||
{
|
||||
brw_nir_rt_load_globals_addr(b, defs, nir_load_btd_global_arg_addr_intel(b));
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_rt_unpack_leaf_ptr(nir_builder *b, nir_def *vec2)
|
||||
{
|
||||
/* Hit record leaf pointers are 42-bit and assumed to be in 64B chunks.
|
||||
* This leaves 22 bits at the top for other stuff.
|
||||
*/
|
||||
nir_def *ptr64 = nir_imul_imm(b, nir_pack_64_2x32(b, vec2), 64);
|
||||
|
||||
/* The top 16 bits (remember, we shifted by 6 already) contain garbage
|
||||
* that we need to get rid of.
|
||||
*/
|
||||
nir_def *ptr_lo = nir_unpack_64_2x32_split_x(b, ptr64);
|
||||
nir_def *ptr_hi = nir_unpack_64_2x32_split_y(b, ptr64);
|
||||
ptr_hi = nir_extract_i16(b, ptr_hi, nir_imm_int(b, 0));
|
||||
return nir_pack_64_2x32_split(b, ptr_lo, ptr_hi);
|
||||
}
|
||||
|
||||
/**
|
||||
* MemHit memory layout (BSpec 47547) :
|
||||
*
|
||||
* name bits description
|
||||
* - t 32 hit distance of current hit (or initial traversal distance)
|
||||
* - u 32 barycentric hit coordinates
|
||||
* - v 32 barycentric hit coordinates
|
||||
* - primIndexDelta 16 prim index delta for compressed meshlets and quads
|
||||
* - valid 1 set if there is a hit
|
||||
* - leafType 3 type of node primLeafPtr is pointing to
|
||||
* - primLeafIndex 4 index of the hit primitive inside the leaf
|
||||
* - bvhLevel 3 the instancing level at which the hit occured
|
||||
* - frontFace 1 whether we hit the front-facing side of a triangle (also used to pass opaque flag when calling intersection shaders)
|
||||
* - pad0 4 unused bits
|
||||
* - primLeafPtr 42 pointer to BVH leaf node (multiple of 64 bytes)
|
||||
* - hitGroupRecPtr0 22 LSB of hit group record of the hit triangle (multiple of 16 bytes)
|
||||
* - instLeafPtr 42 pointer to BVH instance leaf node (in multiple of 64 bytes)
|
||||
* - hitGroupRecPtr1 22 MSB of hit group record of the hit triangle (multiple of 32 bytes)
|
||||
*/
|
||||
struct brw_nir_rt_mem_hit_defs {
|
||||
nir_def *t;
|
||||
nir_def *tri_bary; /**< Only valid for triangle geometry */
|
||||
nir_def *aabb_hit_kind; /**< Only valid for AABB geometry */
|
||||
nir_def *valid;
|
||||
nir_def *leaf_type;
|
||||
nir_def *prim_index_delta;
|
||||
nir_def *prim_leaf_index;
|
||||
nir_def *bvh_level;
|
||||
nir_def *front_face;
|
||||
nir_def *done; /**< Only for ray queries */
|
||||
nir_def *prim_leaf_ptr;
|
||||
nir_def *inst_leaf_ptr;
|
||||
};
|
||||
|
||||
static inline void
|
||||
brw_nir_rt_load_mem_hit_from_addr(nir_builder *b,
|
||||
struct brw_nir_rt_mem_hit_defs *defs,
|
||||
nir_def *stack_addr,
|
||||
bool committed)
|
||||
{
|
||||
nir_def *hit_addr =
|
||||
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, committed);
|
||||
|
||||
nir_def *data = brw_nir_rt_load(b, hit_addr, 16, 4, 32);
|
||||
defs->t = nir_channel(b, data, 0);
|
||||
defs->aabb_hit_kind = nir_channel(b, data, 1);
|
||||
defs->tri_bary = nir_channels(b, data, 0x6);
|
||||
nir_def *bitfield = nir_channel(b, data, 3);
|
||||
defs->prim_index_delta =
|
||||
nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 0), nir_imm_int(b, 16));
|
||||
defs->valid = nir_i2b(b, nir_iand_imm(b, bitfield, 1u << 16));
|
||||
defs->leaf_type =
|
||||
nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 17), nir_imm_int(b, 3));
|
||||
defs->prim_leaf_index =
|
||||
nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 20), nir_imm_int(b, 4));
|
||||
defs->bvh_level =
|
||||
nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 24), nir_imm_int(b, 3));
|
||||
defs->front_face = nir_i2b(b, nir_iand_imm(b, bitfield, 1 << 27));
|
||||
defs->done = nir_i2b(b, nir_iand_imm(b, bitfield, 1 << 28));
|
||||
|
||||
data = brw_nir_rt_load(b, nir_iadd_imm(b, hit_addr, 16), 16, 4, 32);
|
||||
defs->prim_leaf_ptr =
|
||||
brw_nir_rt_unpack_leaf_ptr(b, nir_channels(b, data, 0x3 << 0));
|
||||
defs->inst_leaf_ptr =
|
||||
brw_nir_rt_unpack_leaf_ptr(b, nir_channels(b, data, 0x3 << 2));
|
||||
}
|
||||
|
||||
static inline void
|
||||
brw_nir_rt_load_mem_hit(nir_builder *b,
|
||||
struct brw_nir_rt_mem_hit_defs *defs,
|
||||
bool committed)
|
||||
{
|
||||
brw_nir_rt_load_mem_hit_from_addr(b, defs, brw_nir_rt_stack_addr(b),
|
||||
committed);
|
||||
}
|
||||
|
||||
static inline void
|
||||
brw_nir_memcpy_global(nir_builder *b,
|
||||
nir_def *dst_addr, uint32_t dst_align,
|
||||
nir_def *src_addr, uint32_t src_align,
|
||||
uint32_t size)
|
||||
{
|
||||
/* We're going to copy in 16B chunks */
|
||||
assert(size % 16 == 0);
|
||||
dst_align = MIN2(dst_align, 16);
|
||||
src_align = MIN2(src_align, 16);
|
||||
|
||||
for (unsigned offset = 0; offset < size; offset += 16) {
|
||||
nir_def *data =
|
||||
brw_nir_rt_load(b, nir_iadd_imm(b, src_addr, offset), 16,
|
||||
4, 32);
|
||||
brw_nir_rt_store(b, nir_iadd_imm(b, dst_addr, offset), 16,
|
||||
data, 0xf /* write_mask */);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
brw_nir_memclear_global(nir_builder *b,
|
||||
nir_def *dst_addr, uint32_t dst_align,
|
||||
uint32_t size)
|
||||
{
|
||||
/* We're going to copy in 16B chunks */
|
||||
assert(size % 16 == 0);
|
||||
dst_align = MIN2(dst_align, 16);
|
||||
|
||||
nir_def *zero = nir_imm_ivec4(b, 0, 0, 0, 0);
|
||||
for (unsigned offset = 0; offset < size; offset += 16) {
|
||||
brw_nir_rt_store(b, nir_iadd_imm(b, dst_addr, offset), dst_align,
|
||||
zero, 0xf /* write_mask */);
|
||||
}
|
||||
}
|
||||
|
||||
static inline nir_def *
|
||||
brw_nir_rt_query_done(nir_builder *b, nir_def *stack_addr)
|
||||
{
|
||||
struct brw_nir_rt_mem_hit_defs hit_in = {};
|
||||
brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr,
|
||||
false /* committed */);
|
||||
|
||||
return hit_in.done;
|
||||
}
|
||||
|
||||
static inline void
|
||||
brw_nir_rt_set_dword_bit_at(nir_builder *b,
|
||||
nir_def *addr,
|
||||
uint32_t addr_offset,
|
||||
uint32_t bit)
|
||||
{
|
||||
nir_def *dword_addr = nir_iadd_imm(b, addr, addr_offset);
|
||||
nir_def *dword = brw_nir_rt_load(b, dword_addr, 4, 1, 32);
|
||||
brw_nir_rt_store(b, dword_addr, 4, nir_ior_imm(b, dword, 1u << bit), 0x1);
|
||||
}
|
||||
|
||||
static inline void
|
||||
brw_nir_rt_query_mark_done(nir_builder *b, nir_def *stack_addr)
|
||||
{
|
||||
brw_nir_rt_set_dword_bit_at(b,
|
||||
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr,
|
||||
false /* committed */),
|
||||
4 * 3 /* dword offset */, 28 /* bit */);
|
||||
}
|
||||
|
||||
/* This helper clears the 3rd dword of the MemHit structure where the valid
|
||||
* bit is located.
|
||||
*/
|
||||
static inline void
|
||||
brw_nir_rt_query_mark_init(nir_builder *b, nir_def *stack_addr)
|
||||
{
|
||||
nir_def *dword_addr;
|
||||
|
||||
for (uint32_t i = 0; i < 2; i++) {
|
||||
dword_addr =
|
||||
nir_iadd_imm(b,
|
||||
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr,
|
||||
i == 0 /* committed */),
|
||||
4 * 3 /* dword offset */);
|
||||
brw_nir_rt_store(b, dword_addr, 4, nir_imm_int(b, 0), 0x1);
|
||||
}
|
||||
}
|
||||
|
||||
/* This helper is pretty much a memcpy of uncommitted into committed hit
|
||||
* structure, just adding the valid bit.
|
||||
*/
|
||||
static inline void
|
||||
brw_nir_rt_commit_hit_addr(nir_builder *b, nir_def *stack_addr)
|
||||
{
|
||||
nir_def *dst_addr =
|
||||
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, true /* committed */);
|
||||
nir_def *src_addr =
|
||||
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, false /* committed */);
|
||||
|
||||
for (unsigned offset = 0; offset < BRW_RT_SIZEOF_HIT_INFO; offset += 16) {
|
||||
nir_def *data =
|
||||
brw_nir_rt_load(b, nir_iadd_imm(b, src_addr, offset), 16, 4, 32);
|
||||
|
||||
if (offset == 0) {
|
||||
data = nir_vec4(b,
|
||||
nir_channel(b, data, 0),
|
||||
nir_channel(b, data, 1),
|
||||
nir_channel(b, data, 2),
|
||||
nir_ior_imm(b,
|
||||
nir_channel(b, data, 3),
|
||||
0x1 << 16 /* valid */));
|
||||
|
||||
/* Also write the potential hit as we change it. */
|
||||
brw_nir_rt_store(b, nir_iadd_imm(b, src_addr, offset), 16,
|
||||
data, 0xf /* write_mask */);
|
||||
}
|
||||
|
||||
brw_nir_rt_store(b, nir_iadd_imm(b, dst_addr, offset), 16,
|
||||
data, 0xf /* write_mask */);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
brw_nir_rt_commit_hit(nir_builder *b)
|
||||
{
|
||||
nir_def *stack_addr = brw_nir_rt_stack_addr(b);
|
||||
brw_nir_rt_commit_hit_addr(b, stack_addr);
|
||||
}
|
||||
|
||||
static inline void
|
||||
brw_nir_rt_generate_hit_addr(nir_builder *b, nir_def *stack_addr, nir_def *t_val)
|
||||
{
|
||||
nir_def *committed_addr =
|
||||
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, true /* committed */);
|
||||
nir_def *potential_addr =
|
||||
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, false /* committed */);
|
||||
|
||||
/* Set:
|
||||
*
|
||||
* potential.t = t_val;
|
||||
* potential.valid = true;
|
||||
*/
|
||||
nir_def *potential_hit_dwords_0_3 =
|
||||
brw_nir_rt_load(b, potential_addr, 16, 4, 32);
|
||||
potential_hit_dwords_0_3 =
|
||||
nir_vec4(b,
|
||||
t_val,
|
||||
nir_channel(b, potential_hit_dwords_0_3, 1),
|
||||
nir_channel(b, potential_hit_dwords_0_3, 2),
|
||||
nir_ior_imm(b, nir_channel(b, potential_hit_dwords_0_3, 3),
|
||||
(0x1 << 16) /* valid */));
|
||||
brw_nir_rt_store(b, potential_addr, 16, potential_hit_dwords_0_3, 0xf /* write_mask */);
|
||||
|
||||
/* Set:
|
||||
*
|
||||
* committed.t = t_val;
|
||||
* committed.u = 0.0f;
|
||||
* committed.v = 0.0f;
|
||||
* committed.valid = true;
|
||||
* committed.leaf_type = potential.leaf_type;
|
||||
* committed.bvh_level = BRW_RT_BVH_LEVEL_OBJECT;
|
||||
* committed.front_face = false;
|
||||
* committed.prim_leaf_index = 0;
|
||||
* committed.done = false;
|
||||
*/
|
||||
nir_def *committed_hit_dwords_0_3 =
|
||||
brw_nir_rt_load(b, committed_addr, 16, 4, 32);
|
||||
committed_hit_dwords_0_3 =
|
||||
nir_vec4(b,
|
||||
t_val,
|
||||
nir_imm_float(b, 0.0f),
|
||||
nir_imm_float(b, 0.0f),
|
||||
nir_ior_imm(b,
|
||||
nir_ior_imm(b, nir_channel(b, potential_hit_dwords_0_3, 3), 0x000e0000),
|
||||
(0x1 << 16) /* valid */ |
|
||||
(BRW_RT_BVH_LEVEL_OBJECT << 24) /* leaf_type */));
|
||||
brw_nir_rt_store(b, committed_addr, 16, committed_hit_dwords_0_3, 0xf /* write_mask */);
|
||||
|
||||
/* Set:
|
||||
*
|
||||
* committed.prim_leaf_ptr = potential.prim_leaf_ptr;
|
||||
* committed.inst_leaf_ptr = potential.inst_leaf_ptr;
|
||||
*/
|
||||
brw_nir_memcpy_global(b,
|
||||
nir_iadd_imm(b, committed_addr, 16), 16,
|
||||
nir_iadd_imm(b, potential_addr, 16), 16,
|
||||
16);
|
||||
}
|
||||
|
||||
/* Software-side unpacked view of the RT MemRay structure.
 *
 * Component counts / bit sizes below are those enforced by the
 * assert_def_size() calls in the store helpers.
 */
struct brw_nir_rt_mem_ray_defs {
   nir_def *orig;                     /* ray origin, 3 x 32-bit */
   nir_def *dir;                      /* ray direction, 3 x 32-bit */
   nir_def *t_near;                   /* 1 x 32-bit */
   nir_def *t_far;                    /* 1 x 32-bit */
   nir_def *root_node_ptr;            /* 1 x 64-bit */
   nir_def *ray_flags;                /* 1 x 16-bit */
   nir_def *hit_group_sr_base_ptr;    /* 1 x 64-bit */
   nir_def *hit_group_sr_stride;      /* 1 x 16-bit */
   nir_def *miss_sr_ptr;              /* 1 x 64-bit */
   nir_def *shader_index_multiplier;  /* 1 x 32-bit */
   nir_def *inst_leaf_ptr;            /* 1 x 64-bit; optional (may be NULL) */
   nir_def *ray_mask;                 /* 1 x 32-bit; only low 16 bits stored */
};
|
||||
|
||||
/* Write a MemRay structure for a ray query at @ray_addr.
 *
 * Fields are packed into 16-byte vec4 stores where possible.  Note that,
 * unlike brw_nir_rt_store_mem_ray(), the hit-group/miss shader record
 * fields are not written here — presumably ray queries don't use them
 * (TODO confirm against the HW MemRay layout).
 */
static inline void
brw_nir_rt_store_mem_ray_query_at_addr(nir_builder *b,
                                       nir_def *ray_addr,
                                       const struct brw_nir_rt_mem_ray_defs *defs)
{
   assert_def_size(defs->orig, 3, 32);
   assert_def_size(defs->dir, 3, 32);
   /* DWords 0-3: orig.xyz, dir.x */
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 0), 16,
      nir_vec4(b, nir_channel(b, defs->orig, 0),
                  nir_channel(b, defs->orig, 1),
                  nir_channel(b, defs->orig, 2),
                  nir_channel(b, defs->dir, 0)),
      ~0 /* write mask */);

   assert_def_size(defs->t_near, 1, 32);
   assert_def_size(defs->t_far, 1, 32);
   /* DWords 4-7: dir.yz, t_near, t_far */
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 16), 16,
      nir_vec4(b, nir_channel(b, defs->dir, 1),
                  nir_channel(b, defs->dir, 2),
                  defs->t_near,
                  defs->t_far),
      ~0 /* write mask */);

   assert_def_size(defs->root_node_ptr, 1, 64);
   assert_def_size(defs->ray_flags, 1, 16);
   /* DWords 8-9: 48-bit root node pointer with ray_flags packed into the
    * upper 16 bits.  Write mask 0x3 leaves the other half of this 16-byte
    * chunk untouched.
    */
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 32), 16,
      nir_vec2(b, nir_unpack_64_2x32_split_x(b, defs->root_node_ptr),
                  nir_pack_32_2x16_split(b,
                     nir_unpack_64_4x16_split_z(b, defs->root_node_ptr),
                     defs->ray_flags)),
      0x3 /* write mask */);

   /* leaf_ptr is optional */
   nir_def *inst_leaf_ptr;
   if (defs->inst_leaf_ptr) {
      inst_leaf_ptr = defs->inst_leaf_ptr;
   } else {
      inst_leaf_ptr = nir_imm_int64(b, 0);
   }

   assert_def_size(inst_leaf_ptr, 1, 64);
   assert_def_size(defs->ray_mask, 1, 32);
   /* Byte offset 56: 48-bit instance leaf pointer + low 16 bits of the ray
    * mask, as an 8-byte store.
    */
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 56), 8,
      nir_vec2(b, nir_unpack_64_2x32_split_x(b, inst_leaf_ptr),
                  nir_pack_32_2x16_split(b,
                     nir_unpack_64_4x16_split_z(b, inst_leaf_ptr),
                     nir_unpack_32_2x16_split_x(b, defs->ray_mask))),
      ~0 /* write mask */);
}
|
||||
|
||||
/* Write a full MemRay structure (including hit-group and miss shader record
 * fields) to the RT stack slot for @bvh_level.
 */
static inline void
brw_nir_rt_store_mem_ray(nir_builder *b,
                         const struct brw_nir_rt_mem_ray_defs *defs,
                         enum brw_rt_bvh_level bvh_level)
{
   nir_def *ray_addr =
      brw_nir_rt_mem_ray_addr(b, brw_nir_rt_stack_addr(b), bvh_level);

   assert_def_size(defs->orig, 3, 32);
   assert_def_size(defs->dir, 3, 32);
   /* DWords 0-3: orig.xyz, dir.x */
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 0), 16,
      nir_vec4(b, nir_channel(b, defs->orig, 0),
                  nir_channel(b, defs->orig, 1),
                  nir_channel(b, defs->orig, 2),
                  nir_channel(b, defs->dir, 0)),
      ~0 /* write mask */);

   assert_def_size(defs->t_near, 1, 32);
   assert_def_size(defs->t_far, 1, 32);
   /* DWords 4-7: dir.yz, t_near, t_far */
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 16), 16,
      nir_vec4(b, nir_channel(b, defs->dir, 1),
                  nir_channel(b, defs->dir, 2),
                  defs->t_near,
                  defs->t_far),
      ~0 /* write mask */);

   assert_def_size(defs->root_node_ptr, 1, 64);
   assert_def_size(defs->ray_flags, 1, 16);
   assert_def_size(defs->hit_group_sr_base_ptr, 1, 64);
   assert_def_size(defs->hit_group_sr_stride, 1, 16);
   /* DWords 8-11: 48-bit root node pointer + 16-bit ray_flags, then 48-bit
    * hit-group shader record base + 16-bit stride.
    */
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 32), 16,
      nir_vec4(b, nir_unpack_64_2x32_split_x(b, defs->root_node_ptr),
                  nir_pack_32_2x16_split(b,
                     nir_unpack_64_4x16_split_z(b, defs->root_node_ptr),
                     defs->ray_flags),
                  nir_unpack_64_2x32_split_x(b, defs->hit_group_sr_base_ptr),
                  nir_pack_32_2x16_split(b,
                     nir_unpack_64_4x16_split_z(b, defs->hit_group_sr_base_ptr),
                     defs->hit_group_sr_stride)),
      ~0 /* write mask */);

   /* leaf_ptr is optional */
   nir_def *inst_leaf_ptr;
   if (defs->inst_leaf_ptr) {
      inst_leaf_ptr = defs->inst_leaf_ptr;
   } else {
      inst_leaf_ptr = nir_imm_int64(b, 0);
   }

   assert_def_size(defs->miss_sr_ptr, 1, 64);
   assert_def_size(defs->shader_index_multiplier, 1, 32);
   assert_def_size(inst_leaf_ptr, 1, 64);
   assert_def_size(defs->ray_mask, 1, 32);
   /* DWords 12-15: 48-bit miss shader record pointer with the shader index
    * multiplier packed (shifted left by 8) into the upper 16 bits, then the
    * 48-bit instance leaf pointer + low 16 bits of the ray mask.
    */
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 48), 16,
      nir_vec4(b, nir_unpack_64_2x32_split_x(b, defs->miss_sr_ptr),
                  nir_pack_32_2x16_split(b,
                     nir_unpack_64_4x16_split_z(b, defs->miss_sr_ptr),
                     nir_unpack_32_2x16_split_x(b,
                        nir_ishl(b, defs->shader_index_multiplier,
                                    nir_imm_int(b, 8)))),
                  nir_unpack_64_2x32_split_x(b, inst_leaf_ptr),
                  nir_pack_32_2x16_split(b,
                     nir_unpack_64_4x16_split_z(b, inst_leaf_ptr),
                     nir_unpack_32_2x16_split_x(b, defs->ray_mask))),
      ~0 /* write mask */);
}
|
||||
|
||||
/* Read back a MemRay structure from @ray_base_addr/@bvh_level and unpack its
 * fields into @defs.  Inverse of brw_nir_rt_store_mem_ray().
 *
 * 64-bit pointers are reassembled from 48 stored bits via nir_extract_i16
 * of the containing dword — i.e. sign-extended; presumably canonical-form
 * VA pointers (TODO confirm).
 */
static inline void
brw_nir_rt_load_mem_ray_from_addr(nir_builder *b,
                                  struct brw_nir_rt_mem_ray_defs *defs,
                                  nir_def *ray_base_addr,
                                  enum brw_rt_bvh_level bvh_level)
{
   nir_def *ray_addr = brw_nir_rt_mem_ray_addr(b,
                                               ray_base_addr,
                                               bvh_level);

   /* Load the 64-byte MemRay as four 16-byte vec4s. */
   nir_def *data[4] = {
      brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr, 0), 16, 4, 32),
      brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr, 16), 16, 4, 32),
      brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr, 32), 16, 4, 32),
      brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr, 48), 16, 4, 32),
   };

   defs->orig = nir_trim_vector(b, data[0], 3);
   defs->dir = nir_vec3(b, nir_channel(b, data[0], 3),
                           nir_channel(b, data[1], 0),
                           nir_channel(b, data[1], 1));
   defs->t_near = nir_channel(b, data[1], 2);
   defs->t_far = nir_channel(b, data[1], 3);
   defs->root_node_ptr =
      nir_pack_64_2x32_split(b, nir_channel(b, data[2], 0),
                                nir_extract_i16(b, nir_channel(b, data[2], 1),
                                                   nir_imm_int(b, 0)));
   defs->ray_flags =
      nir_unpack_32_2x16_split_y(b, nir_channel(b, data[2], 1));
   defs->hit_group_sr_base_ptr =
      nir_pack_64_2x32_split(b, nir_channel(b, data[2], 2),
                                nir_extract_i16(b, nir_channel(b, data[2], 3),
                                                   nir_imm_int(b, 0)));
   defs->hit_group_sr_stride =
      nir_unpack_32_2x16_split_y(b, nir_channel(b, data[2], 3));
   defs->miss_sr_ptr =
      nir_pack_64_2x32_split(b, nir_channel(b, data[3], 0),
                                nir_extract_i16(b, nir_channel(b, data[3], 1),
                                                   nir_imm_int(b, 0)));
   /* Stored shifted left by 8 (see brw_nir_rt_store_mem_ray); undo that. */
   defs->shader_index_multiplier =
      nir_ushr(b, nir_unpack_32_2x16_split_y(b, nir_channel(b, data[3], 1)),
                  nir_imm_int(b, 8));
   defs->inst_leaf_ptr =
      nir_pack_64_2x32_split(b, nir_channel(b, data[3], 2),
                                nir_extract_i16(b, nir_channel(b, data[3], 3),
                                                   nir_imm_int(b, 0)));
   defs->ray_mask =
      nir_unpack_32_2x16_split_y(b, nir_channel(b, data[3], 3));
}
|
||||
|
||||
/* Convenience wrapper: load the MemRay for @bvh_level from the current
 * thread's RT stack.
 */
static inline void
brw_nir_rt_load_mem_ray(nir_builder *b,
                        struct brw_nir_rt_mem_ray_defs *defs,
                        enum brw_rt_bvh_level bvh_level)
{
   brw_nir_rt_load_mem_ray_from_addr(b, defs, brw_nir_rt_stack_addr(b),
                                     bvh_level);
}
|
||||
|
||||
/* Unpacked fields of a BVH instance leaf node. */
struct brw_nir_rt_bvh_instance_leaf_defs {
   nir_def *shader_index;                     /* low 24 bits of dword 0 */
   nir_def *contribution_to_hit_group_index;  /* low 24 bits of dword 1 */
   nir_def *world_to_object[4];               /* 4 columns, each 3 x 32-bit */
   nir_def *instance_id;
   nir_def *instance_index;
   nir_def *object_to_world[4];               /* 4 columns, each 3 x 32-bit */
};
|
||||
|
||||
/* Load and unpack a BVH instance leaf located at @leaf_addr. */
static inline void
brw_nir_rt_load_bvh_instance_leaf(nir_builder *b,
                                  struct brw_nir_rt_bvh_instance_leaf_defs *defs,
                                  nir_def *leaf_addr)
{
   nir_def *leaf_desc = brw_nir_rt_load(b, leaf_addr, 4, 2, 32);

   /* Both indices occupy the low 24 bits of their dwords. */
   defs->shader_index =
      nir_iand_imm(b, nir_channel(b, leaf_desc, 0), (1 << 24) - 1);
   defs->contribution_to_hit_group_index =
      nir_iand_imm(b, nir_channel(b, leaf_desc, 1), (1 << 24) - 1);

   /* Matrix columns are stored as packed vec3s, 12 bytes apart. */
   defs->world_to_object[0] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 16), 4, 3, 32);
   defs->world_to_object[1] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 28), 4, 3, 32);
   defs->world_to_object[2] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 40), 4, 3, 32);
   /* The last column of the matrices is swapped between the two probably
    * because it makes it easier/faster for hardware somehow.
    */
   defs->object_to_world[3] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 52), 4, 3, 32);

   nir_def *data =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 64), 4, 4, 32);
   defs->instance_id = nir_channel(b, data, 2);
   defs->instance_index = nir_channel(b, data, 3);

   defs->object_to_world[0] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 80), 4, 3, 32);
   defs->object_to_world[1] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 92), 4, 3, 32);
   defs->object_to_world[2] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 104), 4, 3, 32);
   defs->world_to_object[3] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 116), 4, 3, 32);
}
|
||||
|
||||
/* Unpacked header fields of a BVH primitive leaf node. */
struct brw_nir_rt_bvh_primitive_leaf_defs {
   nir_def *shader_index;
   nir_def *geom_mask;
   nir_def *geom_index;
   nir_def *type;
   nir_def *geom_flags;
};
|
||||
|
||||
/* Load the two-dword header of a BVH primitive leaf and split out its
 * bitfields.  NOTE(review): the nir_ubitfield_extract arguments here look
 * like (high bit, low bit) rather than the usual (offset, bits) — confirm
 * against the nir_ubitfield_extract definition used by this tree.
 */
static inline void
brw_nir_rt_load_bvh_primitive_leaf(nir_builder *b,
                                   struct brw_nir_rt_bvh_primitive_leaf_defs *defs,
                                   nir_def *leaf_addr)
{
   nir_def *desc = brw_nir_rt_load(b, leaf_addr, 4, 2, 32);

   defs->shader_index =
      nir_ubitfield_extract(b, nir_channel(b, desc, 0),
                            nir_imm_int(b, 23), nir_imm_int(b, 0));
   defs->geom_mask =
      nir_ubitfield_extract(b, nir_channel(b, desc, 0),
                            nir_imm_int(b, 31), nir_imm_int(b, 24));

   defs->geom_index =
      nir_ubitfield_extract(b, nir_channel(b, desc, 1),
                            nir_imm_int(b, 28), nir_imm_int(b, 0));
   defs->type =
      nir_ubitfield_extract(b, nir_channel(b, desc, 1),
                            nir_imm_int(b, 29), nir_imm_int(b, 29));
   defs->geom_flags =
      nir_ubitfield_extract(b, nir_channel(b, desc, 1),
                            nir_imm_int(b, 31), nir_imm_int(b, 30));
}
|
||||
|
||||
/* Vertex positions stored in a BVH primitive (quad) leaf. */
struct brw_nir_rt_bvh_primitive_leaf_positions_defs {
   nir_def *positions[3];  /* three vertices, each 3 x 32-bit */
};
|
||||
|
||||
/* Load the three vertex positions of a primitive leaf; vertices start at
 * byte offset 16 and are packed vec3s (12 bytes each).
 */
static inline void
brw_nir_rt_load_bvh_primitive_leaf_positions(nir_builder *b,
                                             struct brw_nir_rt_bvh_primitive_leaf_positions_defs *defs,
                                             nir_def *leaf_addr)
{
   for (unsigned i = 0; i < ARRAY_SIZE(defs->positions); i++) {
      defs->positions[i] =
         brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 16 + i * 4 * 3), 4, 3, 32);
   }
}
|
||||
|
||||
/* Compute the primitive ID for a hit from its leaf data.
 *
 * @is_procedural may be NULL, in which case it is derived from the hit's
 * leaf_type.  Procedural and quad leaves store the ID at different dword
 * offsets, so this emits an if/else and joins the results with a phi.
 */
static inline nir_def *
brw_nir_rt_load_primitive_id_from_hit(nir_builder *b,
                                      nir_def *is_procedural,
                                      const struct brw_nir_rt_mem_hit_defs *defs)
{
   if (!is_procedural) {
      is_procedural =
         nir_ieq_imm(b, defs->leaf_type,
                        BRW_RT_BVH_NODE_TYPE_PROCEDURAL);
   }

   nir_def *prim_id_proc, *prim_id_quad;
   nir_push_if(b, is_procedural);
   {
      /* For procedural leafs, the index is in dw[3]. */
      nir_def *offset =
         nir_iadd_imm(b, nir_ishl_imm(b, defs->prim_leaf_index, 2), 12);
      prim_id_proc = nir_load_global(b, nir_iadd(b, defs->prim_leaf_ptr,
                                                    nir_u2u64(b, offset)),
                                     4, /* align */ 1, 32);
   }
   nir_push_else(b, NULL);
   {
      /* For quad leafs, the index is dw[2] and there is a 16bit additional
       * offset in dw[3].
       */
      prim_id_quad = nir_load_global(b, nir_iadd_imm(b, defs->prim_leaf_ptr, 8),
                                     4, /* align */ 1, 32);
      prim_id_quad = nir_iadd(b,
                              prim_id_quad,
                              defs->prim_index_delta);
   }
   nir_pop_if(b, NULL);

   return nir_if_phi(b, prim_id_proc, prim_id_quad);
}
|
||||
|
||||
/* Translate an acceleration-structure address into its root-node address,
 * returning a NULL (0) pointer for a NULL acceleration structure.
 */
static inline nir_def *
brw_nir_rt_acceleration_structure_to_root_node(nir_builder *b,
                                               nir_def *as_addr)
{
   /* The HW memory structure in which we specify what acceleration structure
    * to traverse, takes the address to the root node in the acceleration
    * structure, not the acceleration structure itself. To find that, we have
    * to read the root node offset from the acceleration structure which is
    * the first QWord.
    *
    * But if the acceleration structure pointer is NULL, then we should return
    * NULL as root node pointer.
    *
    * TODO: we could optimize this by assuming that for a given version of the
    * BVH, we can find the root node at a given offset.
    */
   nir_def *root_node_ptr, *null_node_ptr;
   nir_push_if(b, nir_ieq_imm(b, as_addr, 0));
   {
      null_node_ptr = nir_imm_int64(b, 0);
   }
   nir_push_else(b, NULL);
   {
      /* Root node address = AS base + offset stored in the AS's first QWord.
       * The load is marked 256-byte aligned (BVH alignment).
       */
      root_node_ptr =
         nir_iadd(b, as_addr, brw_nir_rt_load(b, as_addr, 256, 1, 64));
   }
   nir_pop_if(b, NULL);

   return nir_if_phi(b, null_node_ptr, root_node_ptr);
}
|
||||
|
||||
#endif /* BRW_NIR_RT_BUILDER_H */
|
||||
67
src/intel/compiler/elk/brw_nir_trig_workarounds.py
Normal file
67
src/intel/compiler/elk/brw_nir_trig_workarounds.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
#
|
||||
# Copyright (C) 2016 Intel Corporation
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice (including the next
|
||||
# paragraph) shall be included in all copies or substantial portions of the
|
||||
# Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
# IN THE SOFTWARE.
|
||||
|
||||
# Prior to Kaby Lake, The SIN and COS instructions on Intel hardware can
|
||||
# produce values slightly outside of the [-1.0, 1.0] range for a small set of
|
||||
# values. Obviously, this can break everyone's expectations about trig
|
||||
# functions. This appears to be fixed in Kaby Lake.
|
||||
#
|
||||
# According to an internal presentation, the COS instruction can produce
|
||||
# a value up to 1.000027 for inputs in the range (0.08296, 0.09888). One
|
||||
# suggested workaround is to multiply by 0.99997, scaling down the
|
||||
# amplitude slightly. Apparently this also minimizes the error function,
|
||||
# reducing the maximum error from 0.00006 to about 0.00003.
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from math import pi
|
||||
|
||||
# Scale sin/cos results by 0.99997 to pull slightly-out-of-range HW results
# back into [-1.0, 1.0].  Constant inputs are excluded: they get folded
# exactly at compile time and need no workaround.
TRIG_WORKAROUNDS = [
    (('fsin', 'x(is_not_const)'), ('fmul', ('fsin', 'x'), 0.99997)),
    (('fcos', 'x(is_not_const)'), ('fmul', ('fcos', 'x'), 0.99997)),
]

# Reduce the argument modulo 2*pi before sin/cos, for hardware whose trig
# units lose accuracy on large inputs.
LIMIT_TRIG_INPUT_RANGE_WORKAROUND = [
    (('fsin', 'x(is_not_const)'), ('fsin', ('fmod', 'x', 2.0 * pi))),
    (('fcos', 'x(is_not_const)'), ('fcos', ('fmod', 'x', 2.0 * pi))),
]
|
||||
|
||||
def main():
    """Parse the nir_algebraic import path from argv, then emit the passes."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-p', '--import-path', required=True)
    options = arg_parser.parse_args()
    sys.path.insert(0, options.import_path)
    run()
|
||||
|
||||
|
||||
def run():
    """Render the C source for both trig-workaround algebraic passes to stdout."""
    import nir_algebraic  # pylint: disable=import-error

    print('#include "brw_nir.h"')
    passes = (
        ("brw_nir_apply_trig_workarounds", TRIG_WORKAROUNDS),
        ("brw_nir_limit_trig_input_range_workaround",
         LIMIT_TRIG_INPUT_RANGE_WORKAROUND),
    )
    for pass_name, transforms in passes:
        print(nir_algebraic.AlgebraicPass(pass_name, transforms).render())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
75
src/intel/compiler/elk/brw_packed_float.c
Normal file
75
src/intel/compiler/elk/brw_packed_float.c
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
/*
|
||||
* Copyright © 2014 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
#include "brw_reg.h"
|
||||
|
||||
/* A 32-bit float together with views of its IEEE-754 fields. */
union fu {
   float f;
   unsigned u;
   struct {
      unsigned mantissa:23;
      unsigned exponent:8;
      unsigned sign:1;
   } s;
};

/**
 * Pack a float into the 8-bit restricted "VF" immediate encoding:
 * 1 sign bit, 3 exponent bits (bias 3), 4 mantissa bits.
 *
 * Returns the packed byte, or -1 if the value is not exactly representable.
 */
int
brw_float_to_vf(float f)
{
   const union fu bits = { .f = f };

   /* ±0.0f is special cased. */
   if (f == 0.0f)
      return bits.s.sign << 7;

   const unsigned vf_mantissa = bits.s.mantissa >> (23 - 4);
   /* Unsigned wraparound on underflow makes too-small exponents fail the
    * range check below.
    */
   const unsigned vf_exponent = bits.s.exponent - (127 - 3);
   const unsigned vf = (bits.s.sign << 7) | (vf_exponent << 4) | vf_mantissa;

   /* 0.125 would have had the same representation as 0.0, so reject it. */
   if ((vf & 0x7f) == 0)
      return -1;

   /* Make sure the mantissa fits in 4-bits and the exponent in 3-bits. */
   if (bits.u & 0x7ffff || vf_exponent > 7)
      return -1;

   return vf;
}

/**
 * Expand an 8-bit "VF" immediate back into a 32-bit float.
 */
float
brw_vf_to_float(unsigned char vf)
{
   union fu bits;

   /* ±0.0f is special cased. */
   if (vf == 0x00 || vf == 0x80) {
      bits.u = (unsigned)vf << 24;
      return bits.f;
   }

   bits.s.sign = vf >> 7;
   bits.s.exponent = ((vf & 0x70) >> 4) + (127 - 3);
   bits.s.mantissa = (vf & 0xf) << (23 - 4);

   return bits.f;
}
|
||||
243
src/intel/compiler/elk/brw_predicated_break.cpp
Normal file
243
src/intel/compiler/elk/brw_predicated_break.cpp
Normal file
|
|
@ -0,0 +1,243 @@
|
|||
/*
|
||||
* Copyright © 2013 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_shader.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
/** @file brw_predicated_break.cpp
|
||||
*
|
||||
* Loops are often structured as
|
||||
*
|
||||
* loop:
|
||||
* CMP.f0
|
||||
* (+f0) IF
|
||||
* BREAK
|
||||
* ENDIF
|
||||
* ...
|
||||
* WHILE loop
|
||||
*
|
||||
* This peephole pass removes the IF and ENDIF instructions and predicates the
|
||||
* BREAK, dropping two instructions from the loop body.
|
||||
*
|
||||
* If the loop was a DO { ... } WHILE loop, it looks like
|
||||
*
|
||||
* loop:
|
||||
* ...
|
||||
* CMP.f0
|
||||
* (+f0) IF
|
||||
* BREAK
|
||||
* ENDIF
|
||||
* WHILE loop
|
||||
*
|
||||
* and we can remove the BREAK instruction and predicate the WHILE.
|
||||
*/
|
||||
|
||||
#define MAX_NESTING 128
|
||||
|
||||
/* Tracks, for each loop nesting depth, whether a CONTINUE instruction has
 * been seen in the loop at that depth.  Depths >= MAX_NESTING all share the
 * last bit (see enter_loop / set_continue).
 */
struct loop_continue_tracking {
   BITSET_WORD has_continue[BITSET_WORDS(MAX_NESTING)];  /* one bit per depth */
   unsigned depth;  /* current nesting depth; 0 means outside any loop */
};
|
||||
|
||||
static void
|
||||
enter_loop(struct loop_continue_tracking *s)
|
||||
{
|
||||
s->depth++;
|
||||
|
||||
/* Any loops deeper than that maximum nesting will just re-use the last
|
||||
* flag. This simplifies most of the code. MAX_NESTING is chosen to be
|
||||
* large enough that it is unlikely to occur. Even if it does, the
|
||||
* optimization that uses this tracking is unlikely to make much
|
||||
* difference.
|
||||
*/
|
||||
if (s->depth < MAX_NESTING)
|
||||
BITSET_CLEAR(s->has_continue, s->depth);
|
||||
}
|
||||
|
||||
static void
|
||||
exit_loop(struct loop_continue_tracking *s)
|
||||
{
|
||||
assert(s->depth > 0);
|
||||
s->depth--;
|
||||
}
|
||||
|
||||
static void
|
||||
set_continue(struct loop_continue_tracking *s)
|
||||
{
|
||||
const unsigned i = MIN2(s->depth, MAX_NESTING - 1);
|
||||
|
||||
BITSET_SET(s->has_continue, i);
|
||||
}
|
||||
|
||||
static bool
|
||||
has_continue(const struct loop_continue_tracking *s)
|
||||
{
|
||||
const unsigned i = MIN2(s->depth, MAX_NESTING - 1);
|
||||
|
||||
return BITSET_TEST(s->has_continue, i);
|
||||
}
|
||||
|
||||
/* Peephole: replace IF/BREAK/ENDIF (or IF/CONTINUE/ENDIF) with a predicated
 * BREAK/CONTINUE, and where possible fold a trailing BREAK into a predicated
 * WHILE.  See the file comment above for the patterns handled.
 *
 * Returns true if any change was made (analyses are invalidated then).
 */
bool
opt_predicated_break(backend_shader *s)
{
   bool progress = false;
   struct loop_continue_tracking state = { {0, }, 0 };

   foreach_block (block, s->cfg) {
      /* DO instructions, by definition, can only be found at the beginning of
       * basic blocks.
       */
      backend_instruction *const do_inst = block->start();

      /* BREAK, CONTINUE, and WHILE instructions, by definition, can only be
       * found at the ends of basic blocks.
       */
      backend_instruction *jump_inst = block->end();

      if (do_inst->opcode == BRW_OPCODE_DO)
         enter_loop(&state);

      if (jump_inst->opcode == BRW_OPCODE_CONTINUE)
         set_continue(&state);
      else if (jump_inst->opcode == BRW_OPCODE_WHILE)
         exit_loop(&state);

      /* The candidate block must contain exactly one instruction... */
      if (block->start_ip != block->end_ip)
         continue;

      /* ...and it must be a BREAK or CONTINUE... */
      if (jump_inst->opcode != BRW_OPCODE_BREAK &&
          jump_inst->opcode != BRW_OPCODE_CONTINUE)
         continue;

      /* ...sandwiched between an IF and an ENDIF. */
      backend_instruction *if_inst = block->prev()->end();
      if (if_inst->opcode != BRW_OPCODE_IF)
         continue;

      backend_instruction *endif_inst = block->next()->start();
      if (endif_inst->opcode != BRW_OPCODE_ENDIF)
         continue;

      bblock_t *jump_block = block;
      bblock_t *if_block = jump_block->prev();
      bblock_t *endif_block = jump_block->next();

      /* Move the IF's predicate onto the jump, then delete IF and ENDIF. */
      jump_inst->predicate = if_inst->predicate;
      jump_inst->predicate_inverse = if_inst->predicate_inverse;

      /* If removing the IF empties its block, splice around it. */
      bblock_t *earlier_block = if_block;
      if (if_block->start_ip == if_block->end_ip) {
         earlier_block = if_block->prev();
      }

      if_inst->remove(if_block);

      /* Likewise for the ENDIF's block. */
      bblock_t *later_block = endif_block;
      if (endif_block->start_ip == endif_block->end_ip) {
         later_block = endif_block->next();
      }
      endif_inst->remove(endif_block);

      if (!earlier_block->ends_with_control_flow()) {
         /* FIXME: There is a potential problem here. If earlier_block starts
          * with a DO instruction, this will delete the physical link to the
          * WHILE block. It is unclear whether ENDIF has the same potential
          * problem.
          */
         assert(earlier_block->start() == NULL ||
                earlier_block->start()->opcode != BRW_OPCODE_DO);

         earlier_block->unlink_children();
         earlier_block->add_successor(s->cfg->mem_ctx, jump_block,
                                      bblock_link_logical);
      }

      if (!later_block->starts_with_control_flow()) {
         later_block->unlink_parents();
      }

      /* If jump_block already has a link to later_block, don't create another
       * one. Instead, promote the link to logical.
       */
      bool need_to_link = true;
      foreach_list_typed(bblock_link, link, link, &jump_block->children) {
         if (link->block == later_block) {
            assert(later_block->starts_with_control_flow());

            /* Update the link from later_block back to jump_block. */
            foreach_list_typed(bblock_link, parent_link, link, &later_block->parents) {
               if (parent_link->block == jump_block) {
                  parent_link->kind = bblock_link_logical;
               }
            }

            /* Update the link from jump_block to later_block. */
            link->kind = bblock_link_logical;
            need_to_link = false;
         }
      }

      if (need_to_link) {
         jump_block->add_successor(s->cfg->mem_ctx, later_block,
                                   bblock_link_logical);
      }

      if (earlier_block->can_combine_with(jump_block)) {
         earlier_block->combine_with(jump_block);

         /* Continue iteration from the merged block. */
         block = earlier_block;
      }

      /* Now look at the first instruction of the block following the BREAK. If
       * it's a WHILE, we can delete the break, predicate the WHILE, and join
       * the two basic blocks.
       *
       * This optimization can only be applied if the only instruction that
       * can transfer control to the WHILE is the BREAK. If other paths can
       * lead to the while, the flags may be in an unknown state, and the loop
       * could terminate prematurely. This can occur if the loop contains a
       * CONT instruction.
       */
      bblock_t *while_block = earlier_block->next();
      backend_instruction *while_inst = while_block->start();

      if (jump_inst->opcode == BRW_OPCODE_BREAK &&
          while_inst->opcode == BRW_OPCODE_WHILE &&
          while_inst->predicate == BRW_PREDICATE_NONE &&
          !has_continue(&state)) {
         jump_inst->remove(earlier_block);
         while_inst->predicate = jump_inst->predicate;
         /* BREAK exits when the predicate passes; WHILE loops when its
          * predicate passes, hence the inversion.
          */
         while_inst->predicate_inverse = !jump_inst->predicate_inverse;

         assert(earlier_block->can_combine_with(while_block));
         earlier_block->combine_with(while_block);
      }

      progress = true;
   }

   if (progress)
      s->invalidate_analysis(DEPENDENCY_BLOCKS | DEPENDENCY_INSTRUCTIONS);

   return progress;
}
|
||||
50
src/intel/compiler/elk/brw_prim.h
Normal file
50
src/intel/compiler/elk/brw_prim.h
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
/*
|
||||
* Copyright © 2022 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_PRIM_H
#define BRW_PRIM_H

/* 3DPRIMITIVE topology type encodings used by the hardware.  Note 0x15 is
 * intentionally unassigned.
 */
#define _3DPRIM_POINTLIST         0x01
#define _3DPRIM_LINELIST          0x02
#define _3DPRIM_LINESTRIP         0x03
#define _3DPRIM_TRILIST           0x04
#define _3DPRIM_TRISTRIP          0x05
#define _3DPRIM_TRIFAN            0x06
#define _3DPRIM_QUADLIST          0x07
#define _3DPRIM_QUADSTRIP         0x08
#define _3DPRIM_LINELIST_ADJ      0x09 /* G45+ */
#define _3DPRIM_LINESTRIP_ADJ     0x0A /* G45+ */
#define _3DPRIM_TRILIST_ADJ       0x0B /* G45+ */
#define _3DPRIM_TRISTRIP_ADJ      0x0C /* G45+ */
#define _3DPRIM_TRISTRIP_REVERSE  0x0D
#define _3DPRIM_POLYGON           0x0E
#define _3DPRIM_RECTLIST          0x0F
#define _3DPRIM_LINELOOP          0x10
#define _3DPRIM_POINTLIST_BF      0x11
#define _3DPRIM_LINESTRIP_CONT    0x12
#define _3DPRIM_LINESTRIP_BF     0x13
#define _3DPRIM_LINESTRIP_CONT_BF 0x14
#define _3DPRIM_TRIFAN_NOSTIPPLE  0x16
/* Patch lists with n = 1..32 control points occupy 0x20..0x3F. */
#define _3DPRIM_PATCHLIST(n) ({ assert(n > 0 && n <= 32); 0x20 + (n - 1); })

#endif /* BRW_PRIM_H */
|
||||
76
src/intel/compiler/elk/brw_private.h
Normal file
76
src/intel/compiler/elk/brw_private.h
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
/* -*- c++ -*- */
|
||||
/*
|
||||
* Copyright © 2021 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_PRIVATE_H
|
||||
#define BRW_PRIVATE_H
|
||||
|
||||
#include "brw_compiler.h"
|
||||
|
||||
#include <variant>
|
||||
|
||||
unsigned brw_required_dispatch_width(const struct shader_info *info);
|
||||
|
||||
static constexpr int SIMD_COUNT = 3;
|
||||
|
||||
/* Bookkeeping for selecting which of the SIMD_COUNT dispatch-width variants
 * of a compute or bindless shader to compile and use.  Indices 0..SIMD_COUNT-1
 * presumably correspond to SIMD8/16/32 — confirm against callers.
 */
struct brw_simd_selection_state {
   const struct intel_device_info *devinfo;

   /* The prog_data being filled in: either a compute or a bindless shader. */
   std::variant<struct brw_cs_prog_data *,
                struct brw_bs_prog_data *> prog_data;

   /* Non-zero forces a single dispatch width (see brw_required_dispatch_width). */
   unsigned required_width;

   /* Per-variant failure reason, for diagnostics; NULL if no error. */
   const char *error[SIMD_COUNT];

   bool compiled[SIMD_COUNT];  /* variant compiled successfully */
   bool spilled[SIMD_COUNT];   /* variant compiled but spilled registers */
};
|
||||
|
||||
inline int brw_simd_first_compiled(const brw_simd_selection_state &state)
|
||||
{
|
||||
for (int i = 0; i < SIMD_COUNT; i++) {
|
||||
if (state.compiled[i])
|
||||
return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
inline bool brw_simd_any_compiled(const brw_simd_selection_state &state)
|
||||
{
|
||||
return brw_simd_first_compiled(state) >= 0;
|
||||
}
|
||||
|
||||
bool brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd);
|
||||
|
||||
void brw_simd_mark_compiled(brw_simd_selection_state &state, unsigned simd, bool spilled);
|
||||
|
||||
int brw_simd_select(const brw_simd_selection_state &state);
|
||||
|
||||
int brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
|
||||
const struct brw_cs_prog_data *prog_data,
|
||||
const unsigned *sizes);
|
||||
|
||||
bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag);
|
||||
|
||||
#endif // BRW_PRIVATE_H
|
||||
1375
src/intel/compiler/elk/brw_reg.h
Normal file
1375
src/intel/compiler/elk/brw_reg.h
Normal file
File diff suppressed because it is too large
Load diff
563
src/intel/compiler/elk/brw_reg_type.c
Normal file
563
src/intel/compiler/elk/brw_reg_type.c
Normal file
|
|
@ -0,0 +1,563 @@
|
|||
/*
|
||||
* Copyright © 2017 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_reg.h"
|
||||
#include "brw_eu_defines.h"
|
||||
#include "dev/intel_device_info.h"
|
||||
|
||||
#define INVALID (-1)
|
||||
|
||||
enum hw_reg_type {
|
||||
BRW_HW_REG_TYPE_UD = 0,
|
||||
BRW_HW_REG_TYPE_D = 1,
|
||||
BRW_HW_REG_TYPE_UW = 2,
|
||||
BRW_HW_REG_TYPE_W = 3,
|
||||
BRW_HW_REG_TYPE_F = 7,
|
||||
GFX8_HW_REG_TYPE_UQ = 8,
|
||||
GFX8_HW_REG_TYPE_Q = 9,
|
||||
|
||||
BRW_HW_REG_TYPE_UB = 4,
|
||||
BRW_HW_REG_TYPE_B = 5,
|
||||
GFX7_HW_REG_TYPE_DF = 6,
|
||||
GFX8_HW_REG_TYPE_HF = 10,
|
||||
|
||||
GFX11_HW_REG_TYPE_UD = 0,
|
||||
GFX11_HW_REG_TYPE_D = 1,
|
||||
GFX11_HW_REG_TYPE_UW = 2,
|
||||
GFX11_HW_REG_TYPE_W = 3,
|
||||
GFX11_HW_REG_TYPE_UB = 4,
|
||||
GFX11_HW_REG_TYPE_B = 5,
|
||||
GFX11_HW_REG_TYPE_UQ = 6,
|
||||
GFX11_HW_REG_TYPE_Q = 7,
|
||||
GFX11_HW_REG_TYPE_HF = 8,
|
||||
GFX11_HW_REG_TYPE_F = 9,
|
||||
GFX11_HW_REG_TYPE_DF = 10,
|
||||
GFX11_HW_REG_TYPE_NF = 11,
|
||||
};
|
||||
|
||||
enum hw_imm_type {
|
||||
BRW_HW_IMM_TYPE_UD = 0,
|
||||
BRW_HW_IMM_TYPE_D = 1,
|
||||
BRW_HW_IMM_TYPE_UW = 2,
|
||||
BRW_HW_IMM_TYPE_W = 3,
|
||||
BRW_HW_IMM_TYPE_F = 7,
|
||||
GFX8_HW_IMM_TYPE_UQ = 8,
|
||||
GFX8_HW_IMM_TYPE_Q = 9,
|
||||
|
||||
BRW_HW_IMM_TYPE_UV = 4,
|
||||
BRW_HW_IMM_TYPE_VF = 5,
|
||||
BRW_HW_IMM_TYPE_V = 6,
|
||||
GFX8_HW_IMM_TYPE_DF = 10,
|
||||
GFX8_HW_IMM_TYPE_HF = 11,
|
||||
|
||||
GFX11_HW_IMM_TYPE_UD = 0,
|
||||
GFX11_HW_IMM_TYPE_D = 1,
|
||||
GFX11_HW_IMM_TYPE_UW = 2,
|
||||
GFX11_HW_IMM_TYPE_W = 3,
|
||||
GFX11_HW_IMM_TYPE_UV = 4,
|
||||
GFX11_HW_IMM_TYPE_V = 5,
|
||||
GFX11_HW_IMM_TYPE_UQ = 6,
|
||||
GFX11_HW_IMM_TYPE_Q = 7,
|
||||
GFX11_HW_IMM_TYPE_HF = 8,
|
||||
GFX11_HW_IMM_TYPE_F = 9,
|
||||
GFX11_HW_IMM_TYPE_DF = 10,
|
||||
GFX11_HW_IMM_TYPE_VF = 11,
|
||||
};
|
||||
|
||||
#define GFX12_HW_REG_TYPE_UINT(n) (n)
|
||||
#define GFX12_HW_REG_TYPE_SINT(n) (0x4 | (n))
|
||||
#define GFX12_HW_REG_TYPE_FLOAT(n) (0x8 | (n))
|
||||
|
||||
static const struct hw_type {
|
||||
enum hw_reg_type reg_type;
|
||||
enum hw_imm_type imm_type;
|
||||
} gfx4_hw_type[] = {
|
||||
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
|
||||
|
||||
[BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F },
|
||||
[BRW_REGISTER_TYPE_VF] = { INVALID, BRW_HW_IMM_TYPE_VF },
|
||||
|
||||
[BRW_REGISTER_TYPE_D] = { BRW_HW_REG_TYPE_D, BRW_HW_IMM_TYPE_D },
|
||||
[BRW_REGISTER_TYPE_UD] = { BRW_HW_REG_TYPE_UD, BRW_HW_IMM_TYPE_UD },
|
||||
[BRW_REGISTER_TYPE_W] = { BRW_HW_REG_TYPE_W, BRW_HW_IMM_TYPE_W },
|
||||
[BRW_REGISTER_TYPE_UW] = { BRW_HW_REG_TYPE_UW, BRW_HW_IMM_TYPE_UW },
|
||||
[BRW_REGISTER_TYPE_B] = { BRW_HW_REG_TYPE_B, INVALID },
|
||||
[BRW_REGISTER_TYPE_UB] = { BRW_HW_REG_TYPE_UB, INVALID },
|
||||
[BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V },
|
||||
}, gfx6_hw_type[] = {
|
||||
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
|
||||
|
||||
[BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F },
|
||||
[BRW_REGISTER_TYPE_VF] = { INVALID, BRW_HW_IMM_TYPE_VF },
|
||||
|
||||
[BRW_REGISTER_TYPE_D] = { BRW_HW_REG_TYPE_D, BRW_HW_IMM_TYPE_D },
|
||||
[BRW_REGISTER_TYPE_UD] = { BRW_HW_REG_TYPE_UD, BRW_HW_IMM_TYPE_UD },
|
||||
[BRW_REGISTER_TYPE_W] = { BRW_HW_REG_TYPE_W, BRW_HW_IMM_TYPE_W },
|
||||
[BRW_REGISTER_TYPE_UW] = { BRW_HW_REG_TYPE_UW, BRW_HW_IMM_TYPE_UW },
|
||||
[BRW_REGISTER_TYPE_B] = { BRW_HW_REG_TYPE_B, INVALID },
|
||||
[BRW_REGISTER_TYPE_UB] = { BRW_HW_REG_TYPE_UB, INVALID },
|
||||
[BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V },
|
||||
[BRW_REGISTER_TYPE_UV] = { INVALID, BRW_HW_IMM_TYPE_UV },
|
||||
}, gfx7_hw_type[] = {
|
||||
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
|
||||
|
||||
[BRW_REGISTER_TYPE_DF] = { GFX7_HW_REG_TYPE_DF, INVALID },
|
||||
[BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F },
|
||||
[BRW_REGISTER_TYPE_VF] = { INVALID, BRW_HW_IMM_TYPE_VF },
|
||||
|
||||
[BRW_REGISTER_TYPE_D] = { BRW_HW_REG_TYPE_D, BRW_HW_IMM_TYPE_D },
|
||||
[BRW_REGISTER_TYPE_UD] = { BRW_HW_REG_TYPE_UD, BRW_HW_IMM_TYPE_UD },
|
||||
[BRW_REGISTER_TYPE_W] = { BRW_HW_REG_TYPE_W, BRW_HW_IMM_TYPE_W },
|
||||
[BRW_REGISTER_TYPE_UW] = { BRW_HW_REG_TYPE_UW, BRW_HW_IMM_TYPE_UW },
|
||||
[BRW_REGISTER_TYPE_B] = { BRW_HW_REG_TYPE_B, INVALID },
|
||||
[BRW_REGISTER_TYPE_UB] = { BRW_HW_REG_TYPE_UB, INVALID },
|
||||
[BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V },
|
||||
[BRW_REGISTER_TYPE_UV] = { INVALID, BRW_HW_IMM_TYPE_UV },
|
||||
}, gfx8_hw_type[] = {
|
||||
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
|
||||
|
||||
[BRW_REGISTER_TYPE_DF] = { GFX7_HW_REG_TYPE_DF, GFX8_HW_IMM_TYPE_DF },
|
||||
[BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F },
|
||||
[BRW_REGISTER_TYPE_HF] = { GFX8_HW_REG_TYPE_HF, GFX8_HW_IMM_TYPE_HF },
|
||||
[BRW_REGISTER_TYPE_VF] = { INVALID, BRW_HW_IMM_TYPE_VF },
|
||||
|
||||
[BRW_REGISTER_TYPE_Q] = { GFX8_HW_REG_TYPE_Q, GFX8_HW_IMM_TYPE_Q },
|
||||
[BRW_REGISTER_TYPE_UQ] = { GFX8_HW_REG_TYPE_UQ, GFX8_HW_IMM_TYPE_UQ },
|
||||
[BRW_REGISTER_TYPE_D] = { BRW_HW_REG_TYPE_D, BRW_HW_IMM_TYPE_D },
|
||||
[BRW_REGISTER_TYPE_UD] = { BRW_HW_REG_TYPE_UD, BRW_HW_IMM_TYPE_UD },
|
||||
[BRW_REGISTER_TYPE_W] = { BRW_HW_REG_TYPE_W, BRW_HW_IMM_TYPE_W },
|
||||
[BRW_REGISTER_TYPE_UW] = { BRW_HW_REG_TYPE_UW, BRW_HW_IMM_TYPE_UW },
|
||||
[BRW_REGISTER_TYPE_B] = { BRW_HW_REG_TYPE_B, INVALID },
|
||||
[BRW_REGISTER_TYPE_UB] = { BRW_HW_REG_TYPE_UB, INVALID },
|
||||
[BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V },
|
||||
[BRW_REGISTER_TYPE_UV] = { INVALID, BRW_HW_IMM_TYPE_UV },
|
||||
}, gfx11_hw_type[] = {
|
||||
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
|
||||
|
||||
[BRW_REGISTER_TYPE_NF] = { GFX11_HW_REG_TYPE_NF, INVALID },
|
||||
[BRW_REGISTER_TYPE_F] = { GFX11_HW_REG_TYPE_F, GFX11_HW_IMM_TYPE_F },
|
||||
[BRW_REGISTER_TYPE_HF] = { GFX11_HW_REG_TYPE_HF, GFX11_HW_IMM_TYPE_HF },
|
||||
[BRW_REGISTER_TYPE_VF] = { INVALID, GFX11_HW_IMM_TYPE_VF },
|
||||
|
||||
[BRW_REGISTER_TYPE_D] = { GFX11_HW_REG_TYPE_D, GFX11_HW_IMM_TYPE_D },
|
||||
[BRW_REGISTER_TYPE_UD] = { GFX11_HW_REG_TYPE_UD, GFX11_HW_IMM_TYPE_UD },
|
||||
[BRW_REGISTER_TYPE_W] = { GFX11_HW_REG_TYPE_W, GFX11_HW_IMM_TYPE_W },
|
||||
[BRW_REGISTER_TYPE_UW] = { GFX11_HW_REG_TYPE_UW, GFX11_HW_IMM_TYPE_UW },
|
||||
[BRW_REGISTER_TYPE_B] = { GFX11_HW_REG_TYPE_B, INVALID },
|
||||
[BRW_REGISTER_TYPE_UB] = { GFX11_HW_REG_TYPE_UB, INVALID },
|
||||
[BRW_REGISTER_TYPE_V] = { INVALID, GFX11_HW_IMM_TYPE_V },
|
||||
[BRW_REGISTER_TYPE_UV] = { INVALID, GFX11_HW_IMM_TYPE_UV },
|
||||
}, gfx12_hw_type[] = {
|
||||
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
|
||||
|
||||
[BRW_REGISTER_TYPE_F] = { GFX12_HW_REG_TYPE_FLOAT(2), GFX12_HW_REG_TYPE_FLOAT(2) },
|
||||
[BRW_REGISTER_TYPE_HF] = { GFX12_HW_REG_TYPE_FLOAT(1), GFX12_HW_REG_TYPE_FLOAT(1) },
|
||||
[BRW_REGISTER_TYPE_VF] = { INVALID, GFX12_HW_REG_TYPE_FLOAT(0) },
|
||||
|
||||
[BRW_REGISTER_TYPE_D] = { GFX12_HW_REG_TYPE_SINT(2), GFX12_HW_REG_TYPE_SINT(2) },
|
||||
[BRW_REGISTER_TYPE_UD] = { GFX12_HW_REG_TYPE_UINT(2), GFX12_HW_REG_TYPE_UINT(2) },
|
||||
[BRW_REGISTER_TYPE_W] = { GFX12_HW_REG_TYPE_SINT(1), GFX12_HW_REG_TYPE_SINT(1) },
|
||||
[BRW_REGISTER_TYPE_UW] = { GFX12_HW_REG_TYPE_UINT(1), GFX12_HW_REG_TYPE_UINT(1) },
|
||||
[BRW_REGISTER_TYPE_B] = { GFX12_HW_REG_TYPE_SINT(0), INVALID },
|
||||
[BRW_REGISTER_TYPE_UB] = { GFX12_HW_REG_TYPE_UINT(0), INVALID },
|
||||
[BRW_REGISTER_TYPE_V] = { INVALID, GFX12_HW_REG_TYPE_SINT(0) },
|
||||
[BRW_REGISTER_TYPE_UV] = { INVALID, GFX12_HW_REG_TYPE_UINT(0) },
|
||||
}, gfx125_hw_type[] = {
|
||||
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
|
||||
|
||||
[BRW_REGISTER_TYPE_DF] = { GFX12_HW_REG_TYPE_FLOAT(3), GFX12_HW_REG_TYPE_FLOAT(3) },
|
||||
[BRW_REGISTER_TYPE_F] = { GFX12_HW_REG_TYPE_FLOAT(2), GFX12_HW_REG_TYPE_FLOAT(2) },
|
||||
[BRW_REGISTER_TYPE_HF] = { GFX12_HW_REG_TYPE_FLOAT(1), GFX12_HW_REG_TYPE_FLOAT(1) },
|
||||
[BRW_REGISTER_TYPE_VF] = { INVALID, GFX12_HW_REG_TYPE_FLOAT(0) },
|
||||
|
||||
[BRW_REGISTER_TYPE_Q] = { GFX12_HW_REG_TYPE_SINT(3), GFX12_HW_REG_TYPE_SINT(3) },
|
||||
[BRW_REGISTER_TYPE_UQ] = { GFX12_HW_REG_TYPE_UINT(3), GFX12_HW_REG_TYPE_UINT(3) },
|
||||
[BRW_REGISTER_TYPE_D] = { GFX12_HW_REG_TYPE_SINT(2), GFX12_HW_REG_TYPE_SINT(2) },
|
||||
[BRW_REGISTER_TYPE_UD] = { GFX12_HW_REG_TYPE_UINT(2), GFX12_HW_REG_TYPE_UINT(2) },
|
||||
[BRW_REGISTER_TYPE_W] = { GFX12_HW_REG_TYPE_SINT(1), GFX12_HW_REG_TYPE_SINT(1) },
|
||||
[BRW_REGISTER_TYPE_UW] = { GFX12_HW_REG_TYPE_UINT(1), GFX12_HW_REG_TYPE_UINT(1) },
|
||||
[BRW_REGISTER_TYPE_B] = { GFX12_HW_REG_TYPE_SINT(0), INVALID },
|
||||
[BRW_REGISTER_TYPE_UB] = { GFX12_HW_REG_TYPE_UINT(0), INVALID },
|
||||
[BRW_REGISTER_TYPE_V] = { INVALID, GFX12_HW_REG_TYPE_SINT(0) },
|
||||
[BRW_REGISTER_TYPE_UV] = { INVALID, GFX12_HW_REG_TYPE_UINT(0) },
|
||||
};
|
||||
|
||||
/* SNB adds 3-src instructions (MAD and LRP) that only operate on floats, so
|
||||
* the types were implied. IVB adds BFE and BFI2 that operate on doublewords
|
||||
* and unsigned doublewords, so a new field is also available in the da3src
|
||||
* struct (part of struct brw_instruction.bits1 in brw_structs.h) to select
|
||||
* dst and shared-src types.
|
||||
*
|
||||
* CNL adds support for 3-src instructions in align1 mode, and with it support
|
||||
* for most register types.
|
||||
*/
|
||||
enum hw_3src_reg_type {
|
||||
GFX7_3SRC_TYPE_F = 0,
|
||||
GFX7_3SRC_TYPE_D = 1,
|
||||
GFX7_3SRC_TYPE_UD = 2,
|
||||
GFX7_3SRC_TYPE_DF = 3,
|
||||
GFX8_3SRC_TYPE_HF = 4,
|
||||
|
||||
/** When ExecutionDatatype is 1: @{ */
|
||||
GFX10_ALIGN1_3SRC_REG_TYPE_HF = 0b000,
|
||||
GFX10_ALIGN1_3SRC_REG_TYPE_F = 0b001,
|
||||
GFX10_ALIGN1_3SRC_REG_TYPE_DF = 0b010,
|
||||
GFX11_ALIGN1_3SRC_REG_TYPE_NF = 0b011,
|
||||
/** @} */
|
||||
|
||||
/** When ExecutionDatatype is 0: @{ */
|
||||
GFX10_ALIGN1_3SRC_REG_TYPE_UD = 0b000,
|
||||
GFX10_ALIGN1_3SRC_REG_TYPE_D = 0b001,
|
||||
GFX10_ALIGN1_3SRC_REG_TYPE_UW = 0b010,
|
||||
GFX10_ALIGN1_3SRC_REG_TYPE_W = 0b011,
|
||||
GFX10_ALIGN1_3SRC_REG_TYPE_UB = 0b100,
|
||||
GFX10_ALIGN1_3SRC_REG_TYPE_B = 0b101,
|
||||
/** @} */
|
||||
};
|
||||
|
||||
static const struct hw_3src_type {
|
||||
enum hw_3src_reg_type reg_type;
|
||||
enum gfx10_align1_3src_exec_type exec_type;
|
||||
} gfx6_hw_3src_type[] = {
|
||||
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
|
||||
|
||||
[BRW_REGISTER_TYPE_F] = { GFX7_3SRC_TYPE_F },
|
||||
}, gfx7_hw_3src_type[] = {
|
||||
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
|
||||
|
||||
[BRW_REGISTER_TYPE_F] = { GFX7_3SRC_TYPE_F },
|
||||
[BRW_REGISTER_TYPE_D] = { GFX7_3SRC_TYPE_D },
|
||||
[BRW_REGISTER_TYPE_UD] = { GFX7_3SRC_TYPE_UD },
|
||||
[BRW_REGISTER_TYPE_DF] = { GFX7_3SRC_TYPE_DF },
|
||||
}, gfx8_hw_3src_type[] = {
|
||||
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
|
||||
|
||||
[BRW_REGISTER_TYPE_F] = { GFX7_3SRC_TYPE_F },
|
||||
[BRW_REGISTER_TYPE_D] = { GFX7_3SRC_TYPE_D },
|
||||
[BRW_REGISTER_TYPE_UD] = { GFX7_3SRC_TYPE_UD },
|
||||
[BRW_REGISTER_TYPE_DF] = { GFX7_3SRC_TYPE_DF },
|
||||
[BRW_REGISTER_TYPE_HF] = { GFX8_3SRC_TYPE_HF },
|
||||
}, gfx10_hw_3src_align1_type[] = {
|
||||
#define E(x) BRW_ALIGN1_3SRC_EXEC_TYPE_##x
|
||||
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
|
||||
|
||||
[BRW_REGISTER_TYPE_DF] = { GFX10_ALIGN1_3SRC_REG_TYPE_DF, E(FLOAT) },
|
||||
[BRW_REGISTER_TYPE_F] = { GFX10_ALIGN1_3SRC_REG_TYPE_F, E(FLOAT) },
|
||||
[BRW_REGISTER_TYPE_HF] = { GFX10_ALIGN1_3SRC_REG_TYPE_HF, E(FLOAT) },
|
||||
|
||||
[BRW_REGISTER_TYPE_D] = { GFX10_ALIGN1_3SRC_REG_TYPE_D, E(INT) },
|
||||
[BRW_REGISTER_TYPE_UD] = { GFX10_ALIGN1_3SRC_REG_TYPE_UD, E(INT) },
|
||||
[BRW_REGISTER_TYPE_W] = { GFX10_ALIGN1_3SRC_REG_TYPE_W, E(INT) },
|
||||
[BRW_REGISTER_TYPE_UW] = { GFX10_ALIGN1_3SRC_REG_TYPE_UW, E(INT) },
|
||||
[BRW_REGISTER_TYPE_B] = { GFX10_ALIGN1_3SRC_REG_TYPE_B, E(INT) },
|
||||
[BRW_REGISTER_TYPE_UB] = { GFX10_ALIGN1_3SRC_REG_TYPE_UB, E(INT) },
|
||||
}, gfx11_hw_3src_type[] = {
|
||||
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
|
||||
|
||||
[BRW_REGISTER_TYPE_NF] = { GFX11_ALIGN1_3SRC_REG_TYPE_NF, E(FLOAT) },
|
||||
[BRW_REGISTER_TYPE_F] = { GFX10_ALIGN1_3SRC_REG_TYPE_F, E(FLOAT) },
|
||||
[BRW_REGISTER_TYPE_HF] = { GFX10_ALIGN1_3SRC_REG_TYPE_HF, E(FLOAT) },
|
||||
|
||||
[BRW_REGISTER_TYPE_D] = { GFX10_ALIGN1_3SRC_REG_TYPE_D, E(INT) },
|
||||
[BRW_REGISTER_TYPE_UD] = { GFX10_ALIGN1_3SRC_REG_TYPE_UD, E(INT) },
|
||||
[BRW_REGISTER_TYPE_W] = { GFX10_ALIGN1_3SRC_REG_TYPE_W, E(INT) },
|
||||
[BRW_REGISTER_TYPE_UW] = { GFX10_ALIGN1_3SRC_REG_TYPE_UW, E(INT) },
|
||||
[BRW_REGISTER_TYPE_B] = { GFX10_ALIGN1_3SRC_REG_TYPE_B, E(INT) },
|
||||
[BRW_REGISTER_TYPE_UB] = { GFX10_ALIGN1_3SRC_REG_TYPE_UB, E(INT) },
|
||||
}, gfx12_hw_3src_type[] = {
|
||||
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
|
||||
|
||||
[BRW_REGISTER_TYPE_F] = { GFX12_HW_REG_TYPE_UINT(2), E(FLOAT), },
|
||||
[BRW_REGISTER_TYPE_HF] = { GFX12_HW_REG_TYPE_UINT(1), E(FLOAT), },
|
||||
|
||||
[BRW_REGISTER_TYPE_D] = { GFX12_HW_REG_TYPE_SINT(2), E(INT), },
|
||||
[BRW_REGISTER_TYPE_UD] = { GFX12_HW_REG_TYPE_UINT(2), E(INT), },
|
||||
[BRW_REGISTER_TYPE_W] = { GFX12_HW_REG_TYPE_SINT(1), E(INT), },
|
||||
[BRW_REGISTER_TYPE_UW] = { GFX12_HW_REG_TYPE_UINT(1), E(INT), },
|
||||
[BRW_REGISTER_TYPE_B] = { GFX12_HW_REG_TYPE_SINT(0), E(INT), },
|
||||
[BRW_REGISTER_TYPE_UB] = { GFX12_HW_REG_TYPE_UINT(0), E(INT), },
|
||||
}, gfx125_hw_3src_type[] = {
|
||||
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
|
||||
|
||||
[BRW_REGISTER_TYPE_DF] = { GFX12_HW_REG_TYPE_UINT(3), E(FLOAT), },
|
||||
[BRW_REGISTER_TYPE_F] = { GFX12_HW_REG_TYPE_UINT(2), E(FLOAT), },
|
||||
[BRW_REGISTER_TYPE_HF] = { GFX12_HW_REG_TYPE_UINT(1), E(FLOAT), },
|
||||
|
||||
[BRW_REGISTER_TYPE_Q] = { GFX12_HW_REG_TYPE_SINT(3), E(INT), },
|
||||
[BRW_REGISTER_TYPE_UQ] = { GFX12_HW_REG_TYPE_UINT(3), E(INT), },
|
||||
[BRW_REGISTER_TYPE_D] = { GFX12_HW_REG_TYPE_SINT(2), E(INT), },
|
||||
[BRW_REGISTER_TYPE_UD] = { GFX12_HW_REG_TYPE_UINT(2), E(INT), },
|
||||
[BRW_REGISTER_TYPE_W] = { GFX12_HW_REG_TYPE_SINT(1), E(INT), },
|
||||
[BRW_REGISTER_TYPE_UW] = { GFX12_HW_REG_TYPE_UINT(1), E(INT), },
|
||||
[BRW_REGISTER_TYPE_B] = { GFX12_HW_REG_TYPE_SINT(0), E(INT), },
|
||||
[BRW_REGISTER_TYPE_UB] = { GFX12_HW_REG_TYPE_UINT(0), E(INT), },
|
||||
#undef E
|
||||
};
|
||||
|
||||
/**
|
||||
* Convert a brw_reg_type enumeration value into the hardware representation.
|
||||
*
|
||||
* The hardware encoding may depend on whether the value is an immediate.
|
||||
*/
|
||||
unsigned
|
||||
brw_reg_type_to_hw_type(const struct intel_device_info *devinfo,
|
||||
enum brw_reg_file file,
|
||||
enum brw_reg_type type)
|
||||
{
|
||||
const struct hw_type *table;
|
||||
|
||||
if (devinfo->verx10 >= 125) {
|
||||
assert(type < ARRAY_SIZE(gfx125_hw_type));
|
||||
table = gfx125_hw_type;
|
||||
} else if (devinfo->ver >= 12) {
|
||||
assert(type < ARRAY_SIZE(gfx12_hw_type));
|
||||
table = gfx12_hw_type;
|
||||
} else if (devinfo->ver >= 11) {
|
||||
assert(type < ARRAY_SIZE(gfx11_hw_type));
|
||||
table = gfx11_hw_type;
|
||||
} else if (devinfo->ver >= 8) {
|
||||
assert(type < ARRAY_SIZE(gfx8_hw_type));
|
||||
table = gfx8_hw_type;
|
||||
} else if (devinfo->ver >= 7) {
|
||||
assert(type < ARRAY_SIZE(gfx7_hw_type));
|
||||
table = gfx7_hw_type;
|
||||
} else if (devinfo->ver >= 6) {
|
||||
assert(type < ARRAY_SIZE(gfx6_hw_type));
|
||||
table = gfx6_hw_type;
|
||||
} else {
|
||||
assert(type < ARRAY_SIZE(gfx4_hw_type));
|
||||
table = gfx4_hw_type;
|
||||
}
|
||||
|
||||
if (file == BRW_IMMEDIATE_VALUE) {
|
||||
assert(table[type].imm_type != (enum hw_imm_type)INVALID);
|
||||
return table[type].imm_type;
|
||||
} else {
|
||||
assert(table[type].reg_type != (enum hw_reg_type)INVALID);
|
||||
return table[type].reg_type;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert the hardware representation into a brw_reg_type enumeration value.
|
||||
*
|
||||
* The hardware encoding may depend on whether the value is an immediate.
|
||||
*/
|
||||
enum brw_reg_type
|
||||
brw_hw_type_to_reg_type(const struct intel_device_info *devinfo,
|
||||
enum brw_reg_file file, unsigned hw_type)
|
||||
{
|
||||
const struct hw_type *table;
|
||||
|
||||
if (devinfo->verx10 >= 125) {
|
||||
table = gfx125_hw_type;
|
||||
} else if (devinfo->ver >= 12) {
|
||||
table = gfx12_hw_type;
|
||||
} else if (devinfo->ver >= 11) {
|
||||
table = gfx11_hw_type;
|
||||
} else if (devinfo->ver >= 8) {
|
||||
table = gfx8_hw_type;
|
||||
} else if (devinfo->ver >= 7) {
|
||||
table = gfx7_hw_type;
|
||||
} else if (devinfo->ver >= 6) {
|
||||
table = gfx6_hw_type;
|
||||
} else {
|
||||
table = gfx4_hw_type;
|
||||
}
|
||||
|
||||
if (file == BRW_IMMEDIATE_VALUE) {
|
||||
for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
|
||||
if (table[i].imm_type == (enum hw_imm_type)hw_type) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
|
||||
if (table[i].reg_type == (enum hw_reg_type)hw_type) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
}
|
||||
return INVALID_REG_TYPE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a brw_reg_type enumeration value into the hardware representation
|
||||
* for a 3-src align16 instruction
|
||||
*/
|
||||
unsigned
|
||||
brw_reg_type_to_a16_hw_3src_type(const struct intel_device_info *devinfo,
|
||||
enum brw_reg_type type)
|
||||
{
|
||||
const struct hw_3src_type *table;
|
||||
|
||||
if (devinfo->ver >= 8) {
|
||||
assert(type < ARRAY_SIZE(gfx8_hw_3src_type));
|
||||
table = gfx8_hw_3src_type;
|
||||
} else if (devinfo->ver >= 7) {
|
||||
assert(type < ARRAY_SIZE(gfx7_hw_3src_type));
|
||||
table = gfx7_hw_3src_type;
|
||||
} else {
|
||||
assert(type < ARRAY_SIZE(gfx6_hw_3src_type));
|
||||
table = gfx6_hw_3src_type;
|
||||
}
|
||||
|
||||
assert(table[type].reg_type != (enum hw_3src_reg_type)INVALID);
|
||||
return table[type].reg_type;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a brw_reg_type enumeration value into the hardware representation
|
||||
* for a 3-src align1 instruction
|
||||
*/
|
||||
unsigned
|
||||
brw_reg_type_to_a1_hw_3src_type(const struct intel_device_info *devinfo,
|
||||
enum brw_reg_type type)
|
||||
{
|
||||
if (devinfo->verx10 >= 125) {
|
||||
assert(type < ARRAY_SIZE(gfx125_hw_3src_type));
|
||||
return gfx125_hw_3src_type[type].reg_type;
|
||||
} else if (devinfo->ver >= 12) {
|
||||
assert(type < ARRAY_SIZE(gfx12_hw_3src_type));
|
||||
return gfx12_hw_3src_type[type].reg_type;
|
||||
} else if (devinfo->ver >= 11) {
|
||||
assert(type < ARRAY_SIZE(gfx11_hw_3src_type));
|
||||
return gfx11_hw_3src_type[type].reg_type;
|
||||
} else {
|
||||
assert(type < ARRAY_SIZE(gfx10_hw_3src_align1_type));
|
||||
return gfx10_hw_3src_align1_type[type].reg_type;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert the hardware representation for a 3-src align16 instruction into a
|
||||
* brw_reg_type enumeration value.
|
||||
*/
|
||||
enum brw_reg_type
|
||||
brw_a16_hw_3src_type_to_reg_type(const struct intel_device_info *devinfo,
|
||||
unsigned hw_type)
|
||||
{
|
||||
const struct hw_3src_type *table = NULL;
|
||||
|
||||
if (devinfo->ver >= 8) {
|
||||
table = gfx8_hw_3src_type;
|
||||
} else if (devinfo->ver >= 7) {
|
||||
table = gfx7_hw_3src_type;
|
||||
} else if (devinfo->ver >= 6) {
|
||||
table = gfx6_hw_3src_type;
|
||||
}
|
||||
|
||||
for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
|
||||
if (table[i].reg_type == hw_type) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return INVALID_REG_TYPE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert the hardware representation for a 3-src align1 instruction into a
|
||||
* brw_reg_type enumeration value.
|
||||
*/
|
||||
enum brw_reg_type
|
||||
brw_a1_hw_3src_type_to_reg_type(const struct intel_device_info *devinfo,
|
||||
unsigned hw_type, unsigned exec_type)
|
||||
{
|
||||
const struct hw_3src_type *table =
|
||||
(devinfo->verx10 >= 125 ? gfx125_hw_3src_type :
|
||||
devinfo->ver >= 12 ? gfx12_hw_3src_type :
|
||||
devinfo->ver >= 11 ? gfx11_hw_3src_type :
|
||||
gfx10_hw_3src_align1_type);
|
||||
|
||||
for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
|
||||
if (table[i].reg_type == hw_type &&
|
||||
table[i].exec_type == exec_type) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return INVALID_REG_TYPE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the element size given a register type.
|
||||
*/
|
||||
unsigned
|
||||
brw_reg_type_to_size(enum brw_reg_type type)
|
||||
{
|
||||
static const unsigned type_size[] = {
|
||||
[BRW_REGISTER_TYPE_NF] = 8,
|
||||
[BRW_REGISTER_TYPE_DF] = 8,
|
||||
[BRW_REGISTER_TYPE_F] = 4,
|
||||
[BRW_REGISTER_TYPE_HF] = 2,
|
||||
[BRW_REGISTER_TYPE_VF] = 4,
|
||||
|
||||
[BRW_REGISTER_TYPE_Q] = 8,
|
||||
[BRW_REGISTER_TYPE_UQ] = 8,
|
||||
[BRW_REGISTER_TYPE_D] = 4,
|
||||
[BRW_REGISTER_TYPE_UD] = 4,
|
||||
[BRW_REGISTER_TYPE_W] = 2,
|
||||
[BRW_REGISTER_TYPE_UW] = 2,
|
||||
[BRW_REGISTER_TYPE_B] = 1,
|
||||
[BRW_REGISTER_TYPE_UB] = 1,
|
||||
[BRW_REGISTER_TYPE_V] = 2,
|
||||
[BRW_REGISTER_TYPE_UV] = 2,
|
||||
};
|
||||
if (type >= ARRAY_SIZE(type_size))
|
||||
return -1;
|
||||
|
||||
return type_size[type];
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a BRW_REGISTER_TYPE_* enum to a short string (F, UD, and so on).
|
||||
*
|
||||
* This is different than reg_encoding from brw_disasm.c in that it operates
|
||||
* on the abstract enum values, rather than the generation-specific encoding.
|
||||
*/
|
||||
const char *
|
||||
brw_reg_type_to_letters(enum brw_reg_type type)
|
||||
{
|
||||
static const char letters[][3] = {
|
||||
[BRW_REGISTER_TYPE_NF] = "NF",
|
||||
[BRW_REGISTER_TYPE_DF] = "DF",
|
||||
[BRW_REGISTER_TYPE_F] = "F",
|
||||
[BRW_REGISTER_TYPE_HF] = "HF",
|
||||
[BRW_REGISTER_TYPE_VF] = "VF",
|
||||
|
||||
[BRW_REGISTER_TYPE_Q] = "Q",
|
||||
[BRW_REGISTER_TYPE_UQ] = "UQ",
|
||||
[BRW_REGISTER_TYPE_D] = "D",
|
||||
[BRW_REGISTER_TYPE_UD] = "UD",
|
||||
[BRW_REGISTER_TYPE_W] = "W",
|
||||
[BRW_REGISTER_TYPE_UW] = "UW",
|
||||
[BRW_REGISTER_TYPE_B] = "B",
|
||||
[BRW_REGISTER_TYPE_UB] = "UB",
|
||||
[BRW_REGISTER_TYPE_V] = "V",
|
||||
[BRW_REGISTER_TYPE_UV] = "UV",
|
||||
};
|
||||
if (type >= ARRAY_SIZE(letters))
|
||||
return "INVALID";
|
||||
|
||||
assert(type < ARRAY_SIZE(letters));
|
||||
return letters[type];
|
||||
}
|
||||
209
src/intel/compiler/elk/brw_reg_type.h
Normal file
209
src/intel/compiler/elk/brw_reg_type.h
Normal file
|
|
@ -0,0 +1,209 @@
|
|||
/*
|
||||
* Copyright © 2017 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_REG_TYPE_H
|
||||
#define BRW_REG_TYPE_H
|
||||
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_FUNC_ATTRIBUTE_PURE
|
||||
#define ATTRIBUTE_PURE __attribute__((__pure__))
|
||||
#else
|
||||
#define ATTRIBUTE_PURE
|
||||
#endif
|
||||
|
||||
enum brw_reg_file;
|
||||
struct intel_device_info;
|
||||
|
||||
/*
|
||||
* The ordering has been chosen so that no enum value is the same as a
|
||||
* compatible hardware encoding.
|
||||
*/
|
||||
enum PACKED brw_reg_type {
|
||||
/** Floating-point types: @{ */
|
||||
BRW_REGISTER_TYPE_NF, /* >64-bit (accumulator-only) native float (gfx11+) */
|
||||
BRW_REGISTER_TYPE_DF, /* 64-bit float (double float) */
|
||||
BRW_REGISTER_TYPE_F, /* 32-bit float */
|
||||
BRW_REGISTER_TYPE_HF, /* 16-bit float (half float) */
|
||||
BRW_REGISTER_TYPE_VF, /* 32-bit vector of 4 8-bit floats */
|
||||
/** @} */
|
||||
|
||||
/** Integer types: @{ */
|
||||
BRW_REGISTER_TYPE_Q, /* 64-bit signed integer (quad word) */
|
||||
BRW_REGISTER_TYPE_UQ, /* 64-bit unsigned integer (quad word) */
|
||||
BRW_REGISTER_TYPE_D, /* 32-bit signed integer (double word) */
|
||||
BRW_REGISTER_TYPE_UD, /* 32-bit unsigned integer (double word) */
|
||||
BRW_REGISTER_TYPE_W, /* 16-bit signed integer (word) */
|
||||
BRW_REGISTER_TYPE_UW, /* 16-bit unsigned integer (word) */
|
||||
BRW_REGISTER_TYPE_B, /* 8-bit signed integer (byte) */
|
||||
BRW_REGISTER_TYPE_UB, /* 8-bit unsigned integer (byte) */
|
||||
BRW_REGISTER_TYPE_V, /* vector of 8 signed 4-bit integers (treated as W) */
|
||||
BRW_REGISTER_TYPE_UV, /* vector of 8 unsigned 4-bit integers (treated as UW) */
|
||||
/** @} */
|
||||
|
||||
BRW_REGISTER_TYPE_LAST = BRW_REGISTER_TYPE_UV
|
||||
};
|
||||
|
||||
static inline bool
|
||||
brw_reg_type_is_floating_point(enum brw_reg_type type)
|
||||
{
|
||||
switch (type) {
|
||||
case BRW_REGISTER_TYPE_NF:
|
||||
case BRW_REGISTER_TYPE_DF:
|
||||
case BRW_REGISTER_TYPE_F:
|
||||
case BRW_REGISTER_TYPE_HF:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool
|
||||
brw_reg_type_is_integer(enum brw_reg_type type)
|
||||
{
|
||||
switch (type) {
|
||||
case BRW_REGISTER_TYPE_Q:
|
||||
case BRW_REGISTER_TYPE_UQ:
|
||||
case BRW_REGISTER_TYPE_D:
|
||||
case BRW_REGISTER_TYPE_UD:
|
||||
case BRW_REGISTER_TYPE_W:
|
||||
case BRW_REGISTER_TYPE_UW:
|
||||
case BRW_REGISTER_TYPE_B:
|
||||
case BRW_REGISTER_TYPE_UB:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool
|
||||
brw_reg_type_is_unsigned_integer(enum brw_reg_type tp)
|
||||
{
|
||||
return tp == BRW_REGISTER_TYPE_UB ||
|
||||
tp == BRW_REGISTER_TYPE_UW ||
|
||||
tp == BRW_REGISTER_TYPE_UD ||
|
||||
tp == BRW_REGISTER_TYPE_UQ;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns a type based on a reference_type (word, float, half-float) and a
|
||||
* given bit_size.
|
||||
*/
|
||||
static inline enum brw_reg_type
|
||||
brw_reg_type_from_bit_size(unsigned bit_size,
|
||||
enum brw_reg_type reference_type)
|
||||
{
|
||||
switch(reference_type) {
|
||||
case BRW_REGISTER_TYPE_HF:
|
||||
case BRW_REGISTER_TYPE_F:
|
||||
case BRW_REGISTER_TYPE_DF:
|
||||
switch(bit_size) {
|
||||
case 16:
|
||||
return BRW_REGISTER_TYPE_HF;
|
||||
case 32:
|
||||
return BRW_REGISTER_TYPE_F;
|
||||
case 64:
|
||||
return BRW_REGISTER_TYPE_DF;
|
||||
default:
|
||||
unreachable("Invalid bit size");
|
||||
}
|
||||
case BRW_REGISTER_TYPE_B:
|
||||
case BRW_REGISTER_TYPE_W:
|
||||
case BRW_REGISTER_TYPE_D:
|
||||
case BRW_REGISTER_TYPE_Q:
|
||||
switch(bit_size) {
|
||||
case 8:
|
||||
return BRW_REGISTER_TYPE_B;
|
||||
case 16:
|
||||
return BRW_REGISTER_TYPE_W;
|
||||
case 32:
|
||||
return BRW_REGISTER_TYPE_D;
|
||||
case 64:
|
||||
return BRW_REGISTER_TYPE_Q;
|
||||
default:
|
||||
unreachable("Invalid bit size");
|
||||
}
|
||||
case BRW_REGISTER_TYPE_UB:
|
||||
case BRW_REGISTER_TYPE_UW:
|
||||
case BRW_REGISTER_TYPE_UD:
|
||||
case BRW_REGISTER_TYPE_UQ:
|
||||
switch(bit_size) {
|
||||
case 8:
|
||||
return BRW_REGISTER_TYPE_UB;
|
||||
case 16:
|
||||
return BRW_REGISTER_TYPE_UW;
|
||||
case 32:
|
||||
return BRW_REGISTER_TYPE_UD;
|
||||
case 64:
|
||||
return BRW_REGISTER_TYPE_UQ;
|
||||
default:
|
||||
unreachable("Invalid bit size");
|
||||
}
|
||||
default:
|
||||
unreachable("Unknown type");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#define INVALID_REG_TYPE ((enum brw_reg_type)-1)
|
||||
#define INVALID_HW_REG_TYPE ((unsigned)-1)
|
||||
|
||||
unsigned
|
||||
brw_reg_type_to_hw_type(const struct intel_device_info *devinfo,
|
||||
enum brw_reg_file file, enum brw_reg_type type);
|
||||
|
||||
enum brw_reg_type ATTRIBUTE_PURE
|
||||
brw_hw_type_to_reg_type(const struct intel_device_info *devinfo,
|
||||
enum brw_reg_file file, unsigned hw_type);
|
||||
|
||||
unsigned
|
||||
brw_reg_type_to_a16_hw_3src_type(const struct intel_device_info *devinfo,
|
||||
enum brw_reg_type type);
|
||||
|
||||
unsigned
|
||||
brw_reg_type_to_a1_hw_3src_type(const struct intel_device_info *devinfo,
|
||||
enum brw_reg_type type);
|
||||
|
||||
enum brw_reg_type
|
||||
brw_a16_hw_3src_type_to_reg_type(const struct intel_device_info *devinfo,
|
||||
unsigned hw_type);
|
||||
|
||||
enum brw_reg_type
|
||||
brw_a1_hw_3src_type_to_reg_type(const struct intel_device_info *devinfo,
|
||||
unsigned hw_type, unsigned exec_type);
|
||||
|
||||
unsigned
|
||||
brw_reg_type_to_size(enum brw_reg_type type);
|
||||
|
||||
const char *
|
||||
brw_reg_type_to_letters(enum brw_reg_type type);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
292
src/intel/compiler/elk/brw_rt.h
Normal file
292
src/intel/compiler/elk/brw_rt.h
Normal file
|
|
@ -0,0 +1,292 @@
|
|||
/*
|
||||
* Copyright © 2020 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_RT_H
|
||||
#define BRW_RT_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "compiler/shader_enums.h"
|
||||
#include "util/macros.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/** Vulkan defines shaderGroupHandleSize = 32 */
|
||||
#define BRW_RT_SBT_HANDLE_SIZE 32
|
||||
|
||||
/** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
|
||||
#define BRW_RT_DISPATCH_GLOBALS_SIZE 80
|
||||
|
||||
/** Offset after the RT dispatch globals at which "push" constants live */
|
||||
#define BRW_RT_PUSH_CONST_OFFSET 128
|
||||
|
||||
/** Stride of the resume SBT */
|
||||
#define BRW_BTD_RESUME_SBT_STRIDE 8
|
||||
|
||||
/* Vulkan always uses exactly two levels of BVH: world and object. At the API
|
||||
* level, these are referred to as top and bottom.
|
||||
*/
|
||||
enum brw_rt_bvh_level {
|
||||
BRW_RT_BVH_LEVEL_WORLD = 0,
|
||||
BRW_RT_BVH_LEVEL_OBJECT = 1,
|
||||
};
|
||||
#define BRW_RT_MAX_BVH_LEVELS 2
|
||||
|
||||
enum brw_rt_bvh_node_type {
|
||||
BRW_RT_BVH_NODE_TYPE_INTERNAL = 0,
|
||||
BRW_RT_BVH_NODE_TYPE_INSTANCE = 1,
|
||||
BRW_RT_BVH_NODE_TYPE_PROCEDURAL = 3,
|
||||
BRW_RT_BVH_NODE_TYPE_QUAD = 4,
|
||||
};
|
||||
|
||||
/** HitKind values returned for triangle geometry
|
||||
*
|
||||
* This enum must match the SPIR-V enum.
|
||||
*/
|
||||
enum brw_rt_hit_kind {
|
||||
BRW_RT_HIT_KIND_FRONT_FACE = 0xfe,
|
||||
BRW_RT_HIT_KIND_BACK_FACE = 0xff,
|
||||
};
|
||||
|
||||
/** Ray flags
|
||||
*
|
||||
* This enum must match the SPIR-V RayFlags enum.
|
||||
*/
|
||||
enum brw_rt_ray_flags {
|
||||
BRW_RT_RAY_FLAG_FORCE_OPAQUE = 0x01,
|
||||
BRW_RT_RAY_FLAG_FORCE_NON_OPAQUE = 0x02,
|
||||
BRW_RT_RAY_FLAG_TERMINATE_ON_FIRST_HIT = 0x04,
|
||||
BRW_RT_RAY_FLAG_SKIP_CLOSEST_HIT_SHADER = 0x08,
|
||||
BRW_RT_RAY_FLAG_CULL_BACK_FACING_TRIANGLES = 0x10,
|
||||
BRW_RT_RAY_FLAG_CULL_FRONT_FACING_TRIANGLES = 0x20,
|
||||
BRW_RT_RAY_FLAG_CULL_OPAQUE = 0x40,
|
||||
BRW_RT_RAY_FLAG_CULL_NON_OPAQUE = 0x80,
|
||||
BRW_RT_RAY_FLAG_SKIP_TRIANGLES = 0x100,
|
||||
BRW_RT_RAY_FLAG_SKIP_AABBS = 0x200,
|
||||
};
|
||||
|
||||
struct brw_rt_scratch_layout {
|
||||
/** Number of stack IDs per DSS */
|
||||
uint32_t stack_ids_per_dss;
|
||||
|
||||
/** Start offset (in bytes) of the hardware MemRay stack */
|
||||
uint32_t ray_stack_start;
|
||||
|
||||
/** Stride (in bytes) of the hardware MemRay stack */
|
||||
uint32_t ray_stack_stride;
|
||||
|
||||
/** Start offset (in bytes) of the SW stacks */
|
||||
uint64_t sw_stack_start;
|
||||
|
||||
/** Size (in bytes) of the SW stack for a single shader invocation */
|
||||
uint32_t sw_stack_size;
|
||||
|
||||
/** Total size (in bytes) of the RT scratch memory area */
|
||||
uint64_t total_size;
|
||||
};
|
||||
|
||||
/** Parameters passed to the raygen trampoline shader
|
||||
*
|
||||
* This struct is carefully construected to be 32B and must be passed to the
|
||||
* raygen trampoline shader as as inline constant data.
|
||||
*/
|
||||
struct brw_rt_raygen_trampoline_params {
|
||||
/** The GPU address of the RT_DISPATCH_GLOBALS */
|
||||
uint64_t rt_disp_globals_addr;
|
||||
|
||||
/** The GPU address of the BINDLESS_SHADER_RECORD for the raygen shader */
|
||||
uint64_t raygen_bsr_addr;
|
||||
|
||||
/** 1 if this is an indirect dispatch, 0 otherwise */
|
||||
uint8_t is_indirect;
|
||||
|
||||
/** The integer log2 of the local group size
|
||||
*
|
||||
* Ray-tracing shaders don't have a concept of local vs. global workgroup
|
||||
* size. They only have a single 3D launch size. The raygen trampoline
|
||||
* shader is always dispatched with a local workgroup size equal to the
|
||||
* SIMD width but the shape of the local workgroup is determined at
|
||||
* dispatch time based on the shape of the launch and passed to the
|
||||
* trampoline via this field. (There's no sense having a Z dimension on
|
||||
* the local workgroup if the launch is 2D.)
|
||||
*
|
||||
* We use the integer log2 of the size because there's no point in
|
||||
* non-power-of-two sizes and shifts are cheaper than division.
|
||||
*/
|
||||
uint8_t local_group_size_log2[3];
|
||||
|
||||
uint32_t pad[3];
|
||||
};
|
||||
|
||||
/** Size of the "hot zone" in bytes
|
||||
*
|
||||
* The hot zone is a SW-defined data structure which is a single uvec4
|
||||
* containing two bits of information:
|
||||
*
|
||||
* - hotzone.x: Stack offset (in bytes)
|
||||
*
|
||||
* This is the offset (in bytes) into the per-thread scratch space at which
|
||||
* the current shader's stack starts. This is incremented by the calling
|
||||
* shader prior to any shader call type instructions and gets decremented
|
||||
* by the resume shader as part of completing the return operation.
|
||||
*
|
||||
*
|
||||
* - hotzone.yzw: The launch ID associated with the current thread
|
||||
*
|
||||
* Inside a bindless shader, the only information we have is the DSS ID
|
||||
* from the hardware EU and a per-DSS stack ID. In particular, the three-
|
||||
* dimensional launch ID is lost the moment we leave the raygen trampoline.
|
||||
*/
|
||||
#define BRW_RT_SIZEOF_HOTZONE 16
|
||||
|
||||
/* From the BSpec "Address Computation for Memory Based Data Structures:
|
||||
* Ray and TraversalStack (Async Ray Tracing)":
|
||||
*
|
||||
* sizeof(Ray) = 64B, sizeof(HitInfo) = 32B, sizeof(TravStack) = 32B.
|
||||
*/
|
||||
#define BRW_RT_SIZEOF_RAY 64
|
||||
#define BRW_RT_SIZEOF_HIT_INFO 32
|
||||
#define BRW_RT_SIZEOF_TRAV_STACK 32
|
||||
|
||||
/* From the BSpec:
|
||||
*
|
||||
* syncStackSize = (maxBVHLevels % 2 == 1) ?
|
||||
* (sizeof(HitInfo) * 2 +
|
||||
* (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels + 32B) :
|
||||
* (sizeof(HitInfo) * 2 +
|
||||
* (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels);
|
||||
*
|
||||
* The select is just to align to 64B.
|
||||
*/
|
||||
#define BRW_RT_SIZEOF_RAY_QUERY \
|
||||
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
|
||||
(BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
|
||||
(BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))
|
||||
|
||||
#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
|
||||
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
|
||||
(BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)
|
||||
|
||||
#define BRW_RT_SIZEOF_HW_STACK \
|
||||
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
|
||||
BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
|
||||
BRW_RT_SIZEOF_TRAV_STACK * BRW_RT_MAX_BVH_LEVELS)
|
||||
|
||||
/* This is a mesa-defined region for hit attribute data */
|
||||
#define BRW_RT_SIZEOF_HIT_ATTRIB_DATA 64
|
||||
#define BRW_RT_OFFSETOF_HIT_ATTRIB_DATA BRW_RT_SIZEOF_HW_STACK
|
||||
|
||||
#define BRW_RT_ASYNC_STACK_STRIDE \
|
||||
ALIGN_POT(BRW_RT_OFFSETOF_HIT_ATTRIB_DATA + \
|
||||
BRW_RT_SIZEOF_HIT_ATTRIB_DATA, 64)
|
||||
|
||||
static inline void
|
||||
brw_rt_compute_scratch_layout(struct brw_rt_scratch_layout *layout,
|
||||
const struct intel_device_info *devinfo,
|
||||
uint32_t stack_ids_per_dss,
|
||||
uint32_t sw_stack_size)
|
||||
{
|
||||
layout->stack_ids_per_dss = stack_ids_per_dss;
|
||||
|
||||
const uint32_t dss_count = intel_device_info_dual_subslice_id_bound(devinfo);
|
||||
const uint32_t num_stack_ids = dss_count * stack_ids_per_dss;
|
||||
|
||||
uint64_t size = 0;
|
||||
|
||||
/* The first thing in our scratch area is an array of "hot zones" which
|
||||
* store the stack offset as well as the launch IDs for each active
|
||||
* invocation.
|
||||
*/
|
||||
size += BRW_RT_SIZEOF_HOTZONE * num_stack_ids;
|
||||
|
||||
/* Next, we place the HW ray stacks */
|
||||
assert(size % 64 == 0); /* Cache-line aligned */
|
||||
assert(size < UINT32_MAX);
|
||||
layout->ray_stack_start = size;
|
||||
layout->ray_stack_stride = BRW_RT_ASYNC_STACK_STRIDE;
|
||||
size += num_stack_ids * layout->ray_stack_stride;
|
||||
|
||||
/* Finally, we place the SW stacks for the individual ray-tracing shader
|
||||
* invocations. We align these to 64B to ensure that we don't have any
|
||||
* shared cache lines which could hurt performance.
|
||||
*/
|
||||
assert(size % 64 == 0);
|
||||
layout->sw_stack_start = size;
|
||||
layout->sw_stack_size = ALIGN(sw_stack_size, 64);
|
||||
|
||||
/* Currently it's always the case that sw_stack_size is a power of
|
||||
* two, but power-of-two SW stack sizes are prone to causing
|
||||
* collisions in the hashing function used by the L3 to map memory
|
||||
* addresses to banks, which can cause stack accesses from most
|
||||
* DSSes to bottleneck on a single L3 bank. Fix it by padding the
|
||||
* SW stack by a single cacheline if it was a power of two.
|
||||
*/
|
||||
if (layout->sw_stack_size > 64 &&
|
||||
util_is_power_of_two_nonzero(layout->sw_stack_size))
|
||||
layout->sw_stack_size += 64;
|
||||
|
||||
size += num_stack_ids * layout->sw_stack_size;
|
||||
|
||||
layout->total_size = size;
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
brw_rt_ray_queries_hw_stacks_size(const struct intel_device_info *devinfo)
|
||||
{
|
||||
/* Maximum slice/subslice/EU ID can be computed from the max_scratch_ids
|
||||
* which includes all the threads.
|
||||
*/
|
||||
uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
|
||||
uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */
|
||||
return max_eu_id * max_simd_size * BRW_RT_SIZEOF_RAY_QUERY;
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
|
||||
{
|
||||
/* Maximum slice/subslice/EU ID can be computed from the max_scratch_ids
|
||||
* which includes all the threads.
|
||||
*/
|
||||
uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
|
||||
uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */
|
||||
return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
|
||||
uint32_t ray_queries)
|
||||
{
|
||||
/* Don't bother a shadow stack if we only have a single query. We can
|
||||
* directly write in the HW buffer.
|
||||
*/
|
||||
return (ray_queries > 1 ? ray_queries : 0) * brw_rt_ray_queries_shadow_stack_size(devinfo) +
|
||||
ray_queries * 4; /* Ctrl + Level data */
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* BRW_RT_H */
|
||||
2096
src/intel/compiler/elk/brw_schedule_instructions.cpp
Normal file
2096
src/intel/compiler/elk/brw_schedule_instructions.cpp
Normal file
File diff suppressed because it is too large
Load diff
1427
src/intel/compiler/elk/brw_shader.cpp
Normal file
1427
src/intel/compiler/elk/brw_shader.cpp
Normal file
File diff suppressed because it is too large
Load diff
196
src/intel/compiler/elk/brw_shader.h
Normal file
196
src/intel/compiler/elk/brw_shader.h
Normal file
|
|
@ -0,0 +1,196 @@
|
|||
/*
|
||||
* Copyright © 2010 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_SHADER_H
|
||||
#define BRW_SHADER_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "brw_cfg.h"
|
||||
#include "brw_compiler.h"
|
||||
#include "compiler/nir/nir.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include "brw_ir_analysis.h"
|
||||
#include "brw_ir_allocator.h"
|
||||
|
||||
enum instruction_scheduler_mode {
|
||||
SCHEDULE_PRE,
|
||||
SCHEDULE_PRE_NON_LIFO,
|
||||
SCHEDULE_PRE_LIFO,
|
||||
SCHEDULE_POST,
|
||||
SCHEDULE_NONE,
|
||||
};
|
||||
|
||||
#define UBO_START ((1 << 16) - 4)
|
||||
|
||||
struct backend_shader {
|
||||
protected:
|
||||
|
||||
backend_shader(const struct brw_compiler *compiler,
|
||||
const struct brw_compile_params *params,
|
||||
const nir_shader *shader,
|
||||
struct brw_stage_prog_data *stage_prog_data,
|
||||
bool debug_enabled);
|
||||
|
||||
public:
|
||||
virtual ~backend_shader();
|
||||
|
||||
const struct brw_compiler *compiler;
|
||||
void *log_data; /* Passed to compiler->*_log functions */
|
||||
|
||||
const struct intel_device_info * const devinfo;
|
||||
const nir_shader *nir;
|
||||
struct brw_stage_prog_data * const stage_prog_data;
|
||||
|
||||
/** ralloc context for temporary data used during compile */
|
||||
void *mem_ctx;
|
||||
|
||||
/**
|
||||
* List of either fs_inst or vec4_instruction (inheriting from
|
||||
* backend_instruction)
|
||||
*/
|
||||
exec_list instructions;
|
||||
|
||||
cfg_t *cfg;
|
||||
brw_analysis<brw::idom_tree, backend_shader> idom_analysis;
|
||||
|
||||
gl_shader_stage stage;
|
||||
bool debug_enabled;
|
||||
|
||||
brw::simple_allocator alloc;
|
||||
|
||||
virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const = 0;
|
||||
virtual void dump_instructions_to_file(FILE *file) const;
|
||||
|
||||
/* Convenience functions based on the above. */
|
||||
void dump_instruction(const backend_instruction *inst, FILE *file = stderr) const {
|
||||
dump_instruction_to_file(inst, file);
|
||||
}
|
||||
void dump_instructions(const char *name = nullptr) const;
|
||||
|
||||
void calculate_cfg();
|
||||
|
||||
virtual void invalidate_analysis(brw::analysis_dependency_class c);
|
||||
};
|
||||
|
||||
#else
|
||||
struct backend_shader;
|
||||
#endif /* __cplusplus */
|
||||
|
||||
enum brw_reg_type brw_type_for_base_type(const struct glsl_type *type);
|
||||
uint32_t brw_math_function(enum opcode op);
|
||||
const char *brw_instruction_name(const struct brw_isa_info *isa,
|
||||
enum opcode op);
|
||||
bool brw_saturate_immediate(enum brw_reg_type type, struct brw_reg *reg);
|
||||
bool brw_negate_immediate(enum brw_reg_type type, struct brw_reg *reg);
|
||||
bool brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg);
|
||||
|
||||
bool opt_predicated_break(struct backend_shader *s);
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* brw_fs_reg_allocate.cpp */
|
||||
void brw_fs_alloc_reg_sets(struct brw_compiler *compiler);
|
||||
|
||||
/* brw_vec4_reg_allocate.cpp */
|
||||
void brw_vec4_alloc_reg_set(struct brw_compiler *compiler);
|
||||
|
||||
/* brw_disasm.c */
|
||||
extern const char *const conditional_modifier[16];
|
||||
extern const char *const pred_ctrl_align16[16];
|
||||
|
||||
/* Per-thread scratch space is a power-of-two multiple of 1KB. */
|
||||
static inline unsigned
|
||||
brw_get_scratch_size(int size)
|
||||
{
|
||||
return MAX2(1024, util_next_power_of_two(size));
|
||||
}
|
||||
|
||||
|
||||
static inline nir_variable_mode
|
||||
brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
|
||||
gl_shader_stage stage)
|
||||
{
|
||||
const struct intel_device_info *devinfo = compiler->devinfo;
|
||||
const bool is_scalar = compiler->scalar_stage[stage];
|
||||
nir_variable_mode indirect_mask = (nir_variable_mode) 0;
|
||||
|
||||
switch (stage) {
|
||||
case MESA_SHADER_VERTEX:
|
||||
case MESA_SHADER_FRAGMENT:
|
||||
indirect_mask |= nir_var_shader_in;
|
||||
break;
|
||||
|
||||
case MESA_SHADER_GEOMETRY:
|
||||
if (!is_scalar)
|
||||
indirect_mask |= nir_var_shader_in;
|
||||
break;
|
||||
|
||||
default:
|
||||
/* Everything else can handle indirect inputs */
|
||||
break;
|
||||
}
|
||||
|
||||
if (is_scalar && stage != MESA_SHADER_TESS_CTRL &&
|
||||
stage != MESA_SHADER_TASK &&
|
||||
stage != MESA_SHADER_MESH)
|
||||
indirect_mask |= nir_var_shader_out;
|
||||
|
||||
/* On HSW+, we allow indirects in scalar shaders. They get implemented
|
||||
* using nir_lower_vars_to_explicit_types and nir_lower_explicit_io in
|
||||
* brw_postprocess_nir.
|
||||
*
|
||||
* We haven't plumbed through the indirect scratch messages on gfx6 or
|
||||
* earlier so doing indirects via scratch doesn't work there. On gfx7 and
|
||||
* earlier the scratch space size is limited to 12kB. If we allowed
|
||||
* indirects as scratch all the time, we may easily exceed this limit
|
||||
* without having any fallback.
|
||||
*/
|
||||
if (is_scalar && devinfo->verx10 <= 70)
|
||||
indirect_mask |= nir_var_function_temp;
|
||||
|
||||
return indirect_mask;
|
||||
}
|
||||
|
||||
bool brw_texture_offset(const nir_tex_instr *tex, unsigned src,
|
||||
uint32_t *offset_bits);
|
||||
|
||||
/**
|
||||
* Scratch data used when compiling a GLSL geometry shader.
|
||||
*/
|
||||
struct brw_gs_compile
|
||||
{
|
||||
struct brw_gs_prog_key key;
|
||||
struct intel_vue_map input_vue_map;
|
||||
|
||||
unsigned control_data_bits_per_vertex;
|
||||
unsigned control_data_header_size_bits;
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* BRW_SHADER_H */
|
||||
268
src/intel/compiler/elk/brw_simd_selection.cpp
Normal file
268
src/intel/compiler/elk/brw_simd_selection.cpp
Normal file
|
|
@ -0,0 +1,268 @@
|
|||
/*
|
||||
* Copyright © 2021 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_private.h"
|
||||
#include "compiler/shader_info.h"
|
||||
#include "intel/dev/intel_debug.h"
|
||||
#include "intel/dev/intel_device_info.h"
|
||||
#include "util/ralloc.h"
|
||||
|
||||
unsigned
|
||||
brw_required_dispatch_width(const struct shader_info *info)
|
||||
{
|
||||
if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
|
||||
assert(gl_shader_stage_uses_workgroup(info->stage));
|
||||
/* These enum values are expressly chosen to be equal to the subgroup
|
||||
* size that they require.
|
||||
*/
|
||||
return (unsigned)info->subgroup_size;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool
|
||||
test_bit(unsigned mask, unsigned bit) {
|
||||
return mask & (1u << bit);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
struct brw_cs_prog_data *
|
||||
get_cs_prog_data(brw_simd_selection_state &state)
|
||||
{
|
||||
if (std::holds_alternative<struct brw_cs_prog_data *>(state.prog_data))
|
||||
return std::get<struct brw_cs_prog_data *>(state.prog_data);
|
||||
else
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
struct brw_stage_prog_data *
|
||||
get_prog_data(brw_simd_selection_state &state)
|
||||
{
|
||||
if (std::holds_alternative<struct brw_cs_prog_data *>(state.prog_data))
|
||||
return &std::get<struct brw_cs_prog_data *>(state.prog_data)->base;
|
||||
else if (std::holds_alternative<struct brw_bs_prog_data *>(state.prog_data))
|
||||
return &std::get<struct brw_bs_prog_data *>(state.prog_data)->base;
|
||||
else
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
bool
|
||||
brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd)
|
||||
{
|
||||
assert(simd < SIMD_COUNT);
|
||||
assert(!state.compiled[simd]);
|
||||
|
||||
const auto cs_prog_data = get_cs_prog_data(state);
|
||||
const auto prog_data = get_prog_data(state);
|
||||
const unsigned width = 8u << simd;
|
||||
|
||||
/* For shaders with variable size workgroup, in most cases we can compile
|
||||
* all the variants (exceptions are bindless dispatch & ray queries), since
|
||||
* the choice will happen only at dispatch time.
|
||||
*/
|
||||
const bool workgroup_size_variable = cs_prog_data && cs_prog_data->local_size[0] == 0;
|
||||
|
||||
if (!workgroup_size_variable) {
|
||||
if (state.spilled[simd]) {
|
||||
state.error[simd] = "Would spill";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (state.required_width && state.required_width != width) {
|
||||
state.error[simd] = "Different than required dispatch width";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (cs_prog_data) {
|
||||
const unsigned workgroup_size = cs_prog_data->local_size[0] *
|
||||
cs_prog_data->local_size[1] *
|
||||
cs_prog_data->local_size[2];
|
||||
|
||||
unsigned max_threads = state.devinfo->max_cs_workgroup_threads;
|
||||
|
||||
const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0;
|
||||
if (simd > min_simd && state.compiled[simd - 1] &&
|
||||
workgroup_size <= (width / 2)) {
|
||||
state.error[simd] = "Workgroup size already fits in smaller SIMD";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (DIV_ROUND_UP(workgroup_size, width) > max_threads) {
|
||||
state.error[simd] = "Would need more than max_threads to fit all invocations";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/* The SIMD32 is only enabled for cases it is needed unless forced.
|
||||
*
|
||||
* TODO: Use performance_analysis and drop this rule.
|
||||
*/
|
||||
if (width == 32 && state.devinfo->ver < 20) {
|
||||
if (!INTEL_DEBUG(DEBUG_DO32) && (state.compiled[0] || state.compiled[1])) {
|
||||
state.error[simd] = "SIMD32 not required (use INTEL_DEBUG=do32 to force)";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (width == 8 && state.devinfo->ver >= 20) {
|
||||
state.error[simd] = "SIMD8 not supported on Xe2+";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (width == 32 && cs_prog_data && cs_prog_data->base.ray_queries > 0) {
|
||||
state.error[simd] = "Ray queries not supported";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (width == 32 && cs_prog_data && cs_prog_data->uses_btd_stack_ids) {
|
||||
state.error[simd] = "Bindless shader calls not supported";
|
||||
return false;
|
||||
}
|
||||
|
||||
uint64_t start;
|
||||
switch (prog_data->stage) {
|
||||
case MESA_SHADER_COMPUTE:
|
||||
start = DEBUG_CS_SIMD8;
|
||||
break;
|
||||
case MESA_SHADER_TASK:
|
||||
start = DEBUG_TS_SIMD8;
|
||||
break;
|
||||
case MESA_SHADER_MESH:
|
||||
start = DEBUG_MS_SIMD8;
|
||||
break;
|
||||
case MESA_SHADER_RAYGEN:
|
||||
case MESA_SHADER_ANY_HIT:
|
||||
case MESA_SHADER_CLOSEST_HIT:
|
||||
case MESA_SHADER_MISS:
|
||||
case MESA_SHADER_INTERSECTION:
|
||||
case MESA_SHADER_CALLABLE:
|
||||
start = DEBUG_RT_SIMD8;
|
||||
break;
|
||||
default:
|
||||
unreachable("unknown shader stage in brw_simd_should_compile");
|
||||
}
|
||||
|
||||
const bool env_skip[] = {
|
||||
(intel_simd & (start << 0)) == 0,
|
||||
(intel_simd & (start << 1)) == 0,
|
||||
(intel_simd & (start << 2)) == 0,
|
||||
};
|
||||
|
||||
static_assert(ARRAY_SIZE(env_skip) == SIMD_COUNT);
|
||||
|
||||
if (unlikely(env_skip[simd])) {
|
||||
state.error[simd] = "Disabled by INTEL_DEBUG environment variable";
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
brw_simd_mark_compiled(brw_simd_selection_state &state, unsigned simd, bool spilled)
|
||||
{
|
||||
assert(simd < SIMD_COUNT);
|
||||
assert(!state.compiled[simd]);
|
||||
|
||||
auto cs_prog_data = get_cs_prog_data(state);
|
||||
|
||||
state.compiled[simd] = true;
|
||||
if (cs_prog_data)
|
||||
cs_prog_data->prog_mask |= 1u << simd;
|
||||
|
||||
/* If a SIMD spilled, all the larger ones would spill too. */
|
||||
if (spilled) {
|
||||
for (unsigned i = simd; i < SIMD_COUNT; i++) {
|
||||
state.spilled[i] = true;
|
||||
if (cs_prog_data)
|
||||
cs_prog_data->prog_spilled |= 1u << i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
brw_simd_select(const struct brw_simd_selection_state &state)
|
||||
{
|
||||
for (int i = SIMD_COUNT - 1; i >= 0; i--) {
|
||||
if (state.compiled[i] && !state.spilled[i])
|
||||
return i;
|
||||
}
|
||||
for (int i = SIMD_COUNT - 1; i >= 0; i--) {
|
||||
if (state.compiled[i])
|
||||
return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
int
|
||||
brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
|
||||
const struct brw_cs_prog_data *prog_data,
|
||||
const unsigned *sizes)
|
||||
{
|
||||
if (!sizes || (prog_data->local_size[0] == sizes[0] &&
|
||||
prog_data->local_size[1] == sizes[1] &&
|
||||
prog_data->local_size[2] == sizes[2])) {
|
||||
brw_simd_selection_state simd_state{
|
||||
.prog_data = const_cast<struct brw_cs_prog_data *>(prog_data),
|
||||
};
|
||||
|
||||
/* Propagate the prog_data information back to the simd_state,
|
||||
* so we can use select() directly.
|
||||
*/
|
||||
for (int i = 0; i < SIMD_COUNT; i++) {
|
||||
simd_state.compiled[i] = test_bit(prog_data->prog_mask, i);
|
||||
simd_state.spilled[i] = test_bit(prog_data->prog_spilled, i);
|
||||
}
|
||||
|
||||
return brw_simd_select(simd_state);
|
||||
}
|
||||
|
||||
struct brw_cs_prog_data cloned = *prog_data;
|
||||
for (unsigned i = 0; i < 3; i++)
|
||||
cloned.local_size[i] = sizes[i];
|
||||
|
||||
cloned.prog_mask = 0;
|
||||
cloned.prog_spilled = 0;
|
||||
|
||||
brw_simd_selection_state simd_state{
|
||||
.devinfo = devinfo,
|
||||
.prog_data = &cloned,
|
||||
};
|
||||
|
||||
for (unsigned simd = 0; simd < SIMD_COUNT; simd++) {
|
||||
/* We are not recompiling, so use original results of prog_mask and
|
||||
* prog_spilled as they will already contain all possible compilations.
|
||||
*/
|
||||
if (brw_simd_should_compile(simd_state, simd) &&
|
||||
test_bit(prog_data->prog_mask, simd)) {
|
||||
brw_simd_mark_compiled(simd_state, simd, test_bit(prog_data->prog_spilled, simd));
|
||||
}
|
||||
}
|
||||
|
||||
return brw_simd_select(simd_state);
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue