diff --git a/src/gallium/drivers/r300/compiler/nir_to_rc.c b/src/gallium/drivers/r300/compiler/nir_to_rc.c
new file mode 100644
index 00000000000..b220711c3bf
--- /dev/null
+++ b/src/gallium/drivers/r300/compiler/nir_to_rc.c
@@ -0,0 +1,4079 @@
+/*
+ * Copyright © 2014-2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_deref.h"
+#include "compiler/nir/nir_legacy.h"
+#include "compiler/nir/nir_worklist.h"
+#include "nir_to_rc.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_from_mesa.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_util.h"
+#include "util/u_debug.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_dynarray.h"
+
+struct ntr_insn {
+ enum tgsi_opcode opcode;
+ struct ureg_dst dst[2];
+ struct ureg_src src[4];
+ enum tgsi_texture_type tex_target;
+ enum tgsi_return_type tex_return_type;
+ struct tgsi_texture_offset tex_offset[4];
+
+ unsigned mem_qualifier;
+ enum pipe_format mem_format;
+
+ bool is_tex : 1;
+ bool is_mem : 1;
+ bool precise : 1;
+};
+
+struct ntr_block {
+ /* Array of struct ntr_insn */
+ struct util_dynarray insns;
+ int start_ip;
+ int end_ip;
+};
+
+struct ntr_reg_interval {
+ uint32_t start, end;
+};
+
+struct ntr_compile {
+ nir_shader *s;
+ nir_function_impl *impl;
+ const struct nir_to_rc_options *options;
+ struct pipe_screen *screen;
+ struct ureg_program *ureg;
+
+ bool needs_texcoord_semantic;
+ bool native_integers;
+ bool has_txf_lz;
+
+ bool addr_declared[3];
+ struct ureg_dst addr_reg[3];
+
+ /* if condition set up at the end of a block, for ntr_emit_if(). */
+ struct ureg_src if_cond;
+
+ /* TGSI temps for our NIR SSA and register values. */
+ struct ureg_dst *reg_temp;
+ struct ureg_src *ssa_temp;
+
+ struct ntr_reg_interval *liveness;
+
+ /* Map from nir_block to ntr_block */
+ struct hash_table *blocks;
+ struct ntr_block *cur_block;
+ unsigned current_if_else;
+ unsigned cf_label;
+
+ /* Whether we're currently emitting instructions for a precise NIR instruction. */
+ bool precise;
+
+ unsigned num_temps;
+ unsigned first_non_array_temp;
+
+ /* Mappings from driver_location to TGSI input/output number.
+ *
+ * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
+ * their numbers assigned incrementally, unlike inputs or constants.
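+ *
+ * For example, ntr_setup_inputs() below reuses a single TGSI declaration
+ * for a whole input array by bumping the register index per slot, roughly:
+ *
+ *    for (unsigned i = 0; i < array_len; i++) {
+ *       c->input_index_map[var->data.driver_location + i] = decl;
+ *       c->input_index_map[var->data.driver_location + i].Index += i;
+ *    }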
+ */ + struct ureg_src *input_index_map; + uint64_t centroid_inputs; + + uint32_t first_ubo; + uint32_t first_ssbo; + + struct ureg_src images[PIPE_MAX_SHADER_IMAGES]; +}; + +static struct ureg_dst +ntr_temp(struct ntr_compile *c) +{ + return ureg_dst_register(TGSI_FILE_TEMPORARY, c->num_temps++); +} + +static struct ntr_block * +ntr_block_from_nir(struct ntr_compile *c, struct nir_block *block) +{ + struct hash_entry *entry = _mesa_hash_table_search(c->blocks, block); + return entry->data; +} + +static void ntr_emit_cf_list(struct ntr_compile *c, struct exec_list *list); +static void ntr_emit_cf_list_ureg(struct ntr_compile *c, struct exec_list *list); + +static struct ntr_insn * +ntr_insn(struct ntr_compile *c, enum tgsi_opcode opcode, + struct ureg_dst dst, + struct ureg_src src0, struct ureg_src src1, + struct ureg_src src2, struct ureg_src src3) +{ + struct ntr_insn insn = { + .opcode = opcode, + .dst = { dst, ureg_dst_undef() }, + .src = { src0, src1, src2, src3 }, + .precise = c->precise, + }; + util_dynarray_append(&c->cur_block->insns, struct ntr_insn, insn); + return util_dynarray_top_ptr(&c->cur_block->insns, struct ntr_insn); +} + +#define OP00( op ) \ +static inline void ntr_##op(struct ntr_compile *c) \ +{ \ + ntr_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \ +} + +#define OP01( op ) \ +static inline void ntr_##op(struct ntr_compile *c, \ + struct ureg_src src0) \ +{ \ + ntr_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \ +} + + +#define OP10( op ) \ +static inline void ntr_##op(struct ntr_compile *c, \ + struct ureg_dst dst) \ +{ \ + ntr_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \ +} + +#define OP11( op ) \ +static inline void ntr_##op(struct ntr_compile *c, \ + struct ureg_dst dst, \ + struct ureg_src src0) \ +{ \ + ntr_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \ +} + +#define OP12( op ) \ +static inline void ntr_##op(struct ntr_compile *c, \ + struct ureg_dst dst, \ + struct ureg_src src0, \ + struct ureg_src src1) \ +{ \ + ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef()); \ +} + +#define OP13( op ) \ +static inline void ntr_##op(struct ntr_compile *c, \ + struct ureg_dst dst, \ + struct ureg_src src0, \ + struct ureg_src src1, \ + struct ureg_src src2) \ +{ \ + ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef()); \ +} + +#define OP14( op ) \ +static inline void ntr_##op(struct ntr_compile *c, \ + struct ureg_dst dst, \ + struct ureg_src src0, \ + struct ureg_src src1, \ + struct ureg_src src2, \ + struct ureg_src src3) \ +{ \ + ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3); \ +} + +/* We hand-craft our tex instructions */ +#define OP12_TEX(op) +#define OP14_TEX(op) + +/* Use a template include to generate a correctly-typed ntr_OP() + * function for each TGSI opcode: + */ +#include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h" + +/** + * Interprets a nir_load_const used as a NIR src as a uint. + * + * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU + * instruction (or in a phi-web used by an integer ALU instruction) were + * converted to floats and the ALU instruction swapped to the float equivalent. 
+ * However, this means that integer load_consts used by intrinsics (which don't + * normally get that conversion) may have been reformatted to be floats. Given + * that all of our intrinsic nir_src_as_uint() calls are expected to be small, + * we can just look and see if they look like floats and convert them back to + * ints. + */ +static uint32_t +ntr_src_as_uint(struct ntr_compile *c, nir_src src) +{ + uint32_t val = nir_src_as_uint(src); + if (!c->native_integers && val >= fui(1.0)) + val = (uint32_t)uif(val); + return val; +} + +static unsigned +ntr_64bit_write_mask(unsigned write_mask) +{ + return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0); +} + +static struct ureg_src +ntr_64bit_1f(struct ntr_compile *c) +{ + return ureg_imm4u(c->ureg, + 0x00000000, 0x3ff00000, + 0x00000000, 0x3ff00000); +} + +/* Per-channel masks of def/use within the block, and the per-channel + * livein/liveout for the block as a whole. + */ +struct ntr_live_reg_block_state { + uint8_t *def, *use, *livein, *liveout, *defin, *defout; +}; + +struct ntr_live_reg_state { + unsigned bitset_words; + + struct ntr_reg_interval *regs; + + /* Used in propagate_across_edge() */ + BITSET_WORD *tmp_live; + + struct ntr_live_reg_block_state *blocks; + + nir_block_worklist worklist; +}; + +static void +ntr_live_reg_mark_use(struct ntr_compile *c, struct ntr_live_reg_block_state *bs, + int ip, unsigned index, unsigned used_mask) +{ + bs->use[index] |= used_mask & ~bs->def[index]; + + c->liveness[index].start = MIN2(c->liveness[index].start, ip); + c->liveness[index].end = MAX2(c->liveness[index].end, ip); + +} +static void +ntr_live_reg_setup_def_use(struct ntr_compile *c, nir_function_impl *impl, struct ntr_live_reg_state *state) +{ + for (int i = 0; i < impl->num_blocks; i++) { + state->blocks[i].def = rzalloc_array(state->blocks, uint8_t, c->num_temps); + state->blocks[i].defin = rzalloc_array(state->blocks, uint8_t, c->num_temps); + state->blocks[i].defout = rzalloc_array(state->blocks, uint8_t, c->num_temps); + state->blocks[i].use = rzalloc_array(state->blocks, uint8_t, c->num_temps); + state->blocks[i].livein = rzalloc_array(state->blocks, uint8_t, c->num_temps); + state->blocks[i].liveout = rzalloc_array(state->blocks, uint8_t, c->num_temps); + } + + int ip = 0; + nir_foreach_block(block, impl) { + struct ntr_live_reg_block_state *bs = &state->blocks[block->index]; + struct ntr_block *ntr_block = ntr_block_from_nir(c, block); + + ntr_block->start_ip = ip; + + util_dynarray_foreach(&ntr_block->insns, struct ntr_insn, insn) { + const struct tgsi_opcode_info *opcode_info = + tgsi_get_opcode_info(insn->opcode); + + /* Set up use[] for the srcs. + * + * Uses are the channels of the reg read in the block that don't have a + * preceding def to screen them off. Note that we don't do per-element + * tracking of array regs, so they're never screened off. 
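+ *
+ * For example, given a block like (illustrative TGSI):
+ *
+ *    MOV TEMP[0].xy, IN[0]
+ *    ADD OUT[0], TEMP[0], IN[1]
+ *
+ * only .zw ends up in use[] for TEMP[0]: the .xy channels of the ADD's read
+ * are screened off by the MOV's def earlier in the block.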
+ */
+ for (int i = 0; i < opcode_info->num_src; i++) {
+ if (insn->src[i].File != TGSI_FILE_TEMPORARY)
+ continue;
+ int index = insn->src[i].Index;
+
+ uint32_t used_mask = tgsi_util_get_src_usage_mask(insn->opcode, i,
+ insn->dst->WriteMask,
+ insn->src[i].SwizzleX,
+ insn->src[i].SwizzleY,
+ insn->src[i].SwizzleZ,
+ insn->src[i].SwizzleW,
+ insn->tex_target,
+ insn->tex_target);
+
+ assert(!insn->src[i].Indirect || index < c->first_non_array_temp);
+ ntr_live_reg_mark_use(c, bs, ip, index, used_mask);
+ }
+
+ if (insn->is_tex) {
+ for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
+ if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY)
+ ntr_live_reg_mark_use(c, bs, ip, insn->tex_offset[i].Index, 0xf);
+ }
+ }
+
+ /* Set up def[] for the dsts.
+ *
+ * Defs are the unconditionally-written (not R/M/W) channels of the reg in
+ * the block that don't have a preceding use.
+ */
+ for (int i = 0; i < opcode_info->num_dst; i++) {
+ if (insn->dst[i].File != TGSI_FILE_TEMPORARY)
+ continue;
+ int index = insn->dst[i].Index;
+ uint32_t writemask = insn->dst[i].WriteMask;
+
+ bs->def[index] |= writemask & ~bs->use[index];
+ bs->defout[index] |= writemask;
+
+ assert(!insn->dst[i].Indirect || index < c->first_non_array_temp);
+ c->liveness[index].start = MIN2(c->liveness[index].start, ip);
+ c->liveness[index].end = MAX2(c->liveness[index].end, ip);
+ }
+ ip++;
+ }
+
+ ntr_block->end_ip = ip;
+ }
+}
+
+static void
+ntr_live_regs(struct ntr_compile *c, nir_function_impl *impl)
+{
+ nir_metadata_require(impl, nir_metadata_block_index);
+
+ c->liveness = rzalloc_array(c, struct ntr_reg_interval, c->num_temps);
+
+ struct ntr_live_reg_state state = {
+ .blocks = rzalloc_array(impl, struct ntr_live_reg_block_state, impl->num_blocks),
+ };
+
+ /* The intervals start out with start > end (indicating unused) */
+ for (int i = 0; i < c->num_temps; i++)
+ c->liveness[i].start = ~0;
+
+ ntr_live_reg_setup_def_use(c, impl, &state);
+
+ /* Make a forward-order worklist of all the blocks. */
+ nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL);
+ nir_foreach_block(block, impl) {
+ nir_block_worklist_push_tail(&state.worklist, block);
+ }
+
+ /* Propagate defin/defout down the CFG to calculate the live variables
+ * potentially defined along any possible control flow path. We'll use this
+ * to keep things like conditional defs of the reg (or array regs where we
+ * don't track defs!) from making the reg's live range extend back to the
+ * start of the program.
+ */
+ while (!nir_block_worklist_is_empty(&state.worklist)) {
+ nir_block *block = nir_block_worklist_pop_head(&state.worklist);
+ for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
+ nir_block *succ = block->successors[j];
+ if (!succ || succ->index == impl->num_blocks)
+ continue;
+
+ for (int i = 0; i < c->num_temps; i++) {
+ uint8_t new_def = state.blocks[block->index].defout[i] & ~state.blocks[succ->index].defin[i];
+
+ if (new_def) {
+ state.blocks[succ->index].defin[i] |= new_def;
+ state.blocks[succ->index].defout[i] |= new_def;
+ nir_block_worklist_push_tail(&state.worklist, succ);
+ }
+ }
+ }
+ }
+
+ /* Make a reverse-order worklist of all the blocks. */
+ nir_foreach_block(block, impl) {
+ nir_block_worklist_push_head(&state.worklist, block);
+ }
+
+ /* We're now ready to work through the worklist and update the liveness sets
+ * of each of the blocks. As long as we keep the worklist up-to-date as we
+ * go, everything will get covered.
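+ *
+ * E.g. for a reg written in both branches of an if and read afterwards, the
+ * read block's livein propagates into each branch's liveout, but stops there
+ * because each branch's def[] screens the channels off, and the defin/defout
+ * computed above keeps the live range from extending back above the if.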
+ */ + while (!nir_block_worklist_is_empty(&state.worklist)) { + /* We pop them off in the reverse order we pushed them on. This way + * the first walk of the instructions is backwards so we only walk + * once in the case of no control flow. + */ + nir_block *block = nir_block_worklist_pop_head(&state.worklist); + struct ntr_block *ntr_block = ntr_block_from_nir(c, block); + struct ntr_live_reg_block_state *bs = &state.blocks[block->index]; + + for (int i = 0; i < c->num_temps; i++) { + /* Collect livein from our successors to include in our liveout. */ + for (int j = 0; j < ARRAY_SIZE(block->successors); j++) { + nir_block *succ = block->successors[j]; + if (!succ || succ->index == impl->num_blocks) + continue; + struct ntr_live_reg_block_state *sbs = &state.blocks[succ->index]; + + uint8_t new_liveout = sbs->livein[i] & ~bs->liveout[i]; + if (new_liveout) { + if (state.blocks[block->index].defout[i]) + c->liveness[i].end = MAX2(c->liveness[i].end, ntr_block->end_ip); + bs->liveout[i] |= sbs->livein[i]; + } + } + + /* Propagate use requests from either our block's uses or our + * non-screened-off liveout up to our predecessors. + */ + uint8_t new_livein = ((bs->use[i] | (bs->liveout[i] & ~bs->def[i])) & + ~bs->livein[i]); + if (new_livein) { + bs->livein[i] |= new_livein; + set_foreach(block->predecessors, entry) { + nir_block *pred = (void *)entry->key; + nir_block_worklist_push_tail(&state.worklist, pred); + } + + if (new_livein & state.blocks[block->index].defin[i]) + c->liveness[i].start = MIN2(c->liveness[i].start, ntr_block->start_ip); + } + } + } + + ralloc_free(state.blocks); + nir_block_worklist_fini(&state.worklist); +} + +static void +ntr_ra_check(struct ntr_compile *c, unsigned *ra_map, BITSET_WORD *released, int ip, unsigned index) +{ + if (index < c->first_non_array_temp) + return; + + if (c->liveness[index].start == ip && ra_map[index] == ~0) + ra_map[index] = ureg_DECL_temporary(c->ureg).Index; + + if (c->liveness[index].end == ip && !BITSET_TEST(released, index)) { + ureg_release_temporary(c->ureg, ureg_dst_register(TGSI_FILE_TEMPORARY, ra_map[index])); + BITSET_SET(released, index); + } +} + +static void +ntr_allocate_regs(struct ntr_compile *c, nir_function_impl *impl) +{ + ntr_live_regs(c, impl); + + unsigned *ra_map = ralloc_array(c, unsigned, c->num_temps); + unsigned *released = rzalloc_array(c, BITSET_WORD, BITSET_WORDS(c->num_temps)); + + /* No RA on NIR array regs */ + for (int i = 0; i < c->first_non_array_temp; i++) + ra_map[i] = i; + + for (int i = c->first_non_array_temp; i < c->num_temps; i++) + ra_map[i] = ~0; + + int ip = 0; + nir_foreach_block(block, impl) { + struct ntr_block *ntr_block = ntr_block_from_nir(c, block); + + for (int i = 0; i < c->num_temps; i++) + ntr_ra_check(c, ra_map, released, ip, i); + + util_dynarray_foreach(&ntr_block->insns, struct ntr_insn, insn) { + const struct tgsi_opcode_info *opcode_info = + tgsi_get_opcode_info(insn->opcode); + + for (int i = 0; i < opcode_info->num_src; i++) { + if (insn->src[i].File == TGSI_FILE_TEMPORARY) { + ntr_ra_check(c, ra_map, released, ip, insn->src[i].Index); + insn->src[i].Index = ra_map[insn->src[i].Index]; + } + } + + if (insn->is_tex) { + for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) { + if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY) { + ntr_ra_check(c, ra_map, released, ip, insn->tex_offset[i].Index); + insn->tex_offset[i].Index = ra_map[insn->tex_offset[i].Index]; + } + } + } + + for (int i = 0; i < opcode_info->num_dst; i++) { + if (insn->dst[i].File == TGSI_FILE_TEMPORARY) { 
+ ntr_ra_check(c, ra_map, released, ip, insn->dst[i].Index); + insn->dst[i].Index = ra_map[insn->dst[i].Index]; + } + } + ip++; + } + + for (int i = 0; i < c->num_temps; i++) + ntr_ra_check(c, ra_map, released, ip, i); + } +} + +static void +ntr_allocate_regs_unoptimized(struct ntr_compile *c, nir_function_impl *impl) +{ + for (int i = c->first_non_array_temp; i < c->num_temps; i++) + ureg_DECL_temporary(c->ureg); +} + + +/** + * Try to find an iadd of a constant value with a non-constant value in the + * nir_src's first component, returning the constant offset and replacing *src + * with the non-constant component. + */ +static const uint32_t +ntr_extract_const_src_offset(nir_src *src) +{ + nir_scalar s = nir_get_scalar(src->ssa, 0); + + while (nir_scalar_is_alu(s)) { + nir_alu_instr *alu = nir_instr_as_alu(s.def->parent_instr); + + if (alu->op == nir_op_iadd) { + for (int i = 0; i < 2; i++) { + nir_const_value *v = nir_src_as_const_value(alu->src[i].src); + if (v != NULL) { + *src = alu->src[1 - i].src; + return v[alu->src[i].swizzle[s.comp]].u32; + } + } + + return 0; + } + + /* We'd like to reuse nir_scalar_chase_movs(), but it assumes SSA and that + * seems reasonable for something used in inner loops of the compiler. + */ + if (alu->op == nir_op_mov) { + s.def = alu->src[0].src.ssa; + s.comp = alu->src[0].swizzle[s.comp]; + } else if (nir_op_is_vec(alu->op)) { + s.def = alu->src[s.comp].src.ssa; + s.comp = alu->src[s.comp].swizzle[0]; + } else { + return 0; + } + } + + return 0; +} + +static const struct glsl_type * +ntr_shader_input_type(struct ntr_compile *c, + struct nir_variable *var) +{ + switch (c->s->info.stage) { + case MESA_SHADER_GEOMETRY: + case MESA_SHADER_TESS_EVAL: + case MESA_SHADER_TESS_CTRL: + if (glsl_type_is_array(var->type)) + return glsl_get_array_element(var->type); + else + return var->type; + default: + return var->type; + } +} + +static void +ntr_get_gl_varying_semantic(struct ntr_compile *c, unsigned location, + unsigned *semantic_name, unsigned *semantic_index) +{ + /* We want to use most of tgsi_get_gl_varying_semantic(), but the + * !texcoord shifting has already been applied, so avoid that. + */ + if (!c->needs_texcoord_semantic && + (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) { + *semantic_name = TGSI_SEMANTIC_GENERIC; + *semantic_index = location - VARYING_SLOT_VAR0; + return; + } + + tgsi_get_gl_varying_semantic(location, true, + semantic_name, semantic_index); +} + +/* TGSI varying declarations have a component usage mask associated (used by + * r600 and svga). + */ +static uint32_t +ntr_tgsi_usage_mask(unsigned start_component, unsigned num_components, + bool is_64) +{ + uint32_t usage_mask = + u_bit_consecutive(start_component, num_components); + + if (is_64) { + if (start_component >= 2) + usage_mask >>= 2; + + uint32_t tgsi_usage_mask = 0; + + if (usage_mask & TGSI_WRITEMASK_X) + tgsi_usage_mask |= TGSI_WRITEMASK_XY; + if (usage_mask & TGSI_WRITEMASK_Y) + tgsi_usage_mask |= TGSI_WRITEMASK_ZW; + + return tgsi_usage_mask; + } else { + return usage_mask; + } +} + +/* TGSI varying declarations have a component usage mask associated (used by + * r600 and svga). 
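+ *
+ * E.g. a vec2 at location_frac 1 yields a .yz (0b0110) usage mask, and
+ * 64-bit values double up channels: a double in .x reads as .xy, one in .y
+ * as .zw.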
+ */
+static uint32_t
+ntr_tgsi_var_usage_mask(const struct nir_variable *var)
+{
+ const struct glsl_type *type_without_array =
+ glsl_without_array(var->type);
+ unsigned num_components = glsl_get_vector_elements(type_without_array);
+ if (num_components == 0) /* structs */
+ num_components = 4;
+
+ return ntr_tgsi_usage_mask(var->data.location_frac, num_components,
+ glsl_type_is_64bit(type_without_array));
+}
+
+static struct ureg_dst
+ntr_output_decl(struct ntr_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
+{
+ nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
+ int base = nir_intrinsic_base(instr);
+ *frac = nir_intrinsic_component(instr);
+ bool is_64 = nir_src_bit_size(instr->src[0]) == 64;
+
+ struct ureg_dst out;
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+ unsigned semantic_name, semantic_index;
+ tgsi_get_gl_frag_result_semantic(semantics.location,
+ &semantic_name, &semantic_index);
+ semantic_index += semantics.dual_source_blend_index;
+
+ switch (semantics.location) {
+ case FRAG_RESULT_DEPTH:
+ *frac = 2; /* z write is to the .z channel in TGSI */
+ break;
+ case FRAG_RESULT_STENCIL:
+ *frac = 1;
+ break;
+ default:
+ break;
+ }
+
+ out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
+ } else {
+ unsigned semantic_name, semantic_index;
+
+ ntr_get_gl_varying_semantic(c, semantics.location,
+ &semantic_name, &semantic_index);
+
+ uint32_t usage_mask = ntr_tgsi_usage_mask(*frac,
+ instr->num_components,
+ is_64);
+ uint32_t gs_streams = semantics.gs_streams;
+ for (int i = 0; i < 4; i++) {
+ if (!(usage_mask & (1 << i)))
+ gs_streams &= ~(0x3 << 2 * i);
+ }
+
+ /* No driver appears to use array_id of outputs. */
+ unsigned array_id = 0;
+
+ /* This bit is lost in the i/o semantics, but it's unused in in-tree
+ * drivers.
+ */
+ bool invariant = semantics.invariant;
+
+ unsigned num_slots = semantics.num_slots;
+ if (semantics.location == VARYING_SLOT_TESS_LEVEL_INNER ||
+ semantics.location == VARYING_SLOT_TESS_LEVEL_OUTER) {
+ /* Compact vars get a num_slots in NIR as number of components, but we
+ * want the number of vec4 slots here.
+ */
+ num_slots = 1;
+ }
+
+ out = ureg_DECL_output_layout(c->ureg,
+ semantic_name, semantic_index,
+ gs_streams,
+ base,
+ usage_mask,
+ array_id,
+ num_slots,
+ invariant);
+ }
+
+ unsigned write_mask;
+ if (nir_intrinsic_has_write_mask(instr))
+ write_mask = nir_intrinsic_write_mask(instr);
+ else
+ write_mask = ((1 << instr->num_components) - 1) << *frac;
+
+ if (is_64) {
+ write_mask = ntr_64bit_write_mask(write_mask);
+ if (*frac >= 2)
+ write_mask = write_mask << 2;
+ } else {
+ write_mask = write_mask << *frac;
+ }
+ return ureg_writemask(out, write_mask);
+}
+
+static bool
+ntr_try_store_in_tgsi_output_with_use(struct ntr_compile *c,
+ struct ureg_dst *dst,
+ nir_src *src)
+{
+ *dst = ureg_dst_undef();
+
+ switch (c->s->info.stage) {
+ case MESA_SHADER_FRAGMENT:
+ case MESA_SHADER_VERTEX:
+ break;
+ default:
+ /* tgsi_exec (at least) requires that output stores happen per vertex
+ * emitted, you don't get to reuse a previous output value for the next
+ * vertex.
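+ *
+ * (E.g. in a geometry shader, an OUT[] value written before one EMIT
+ * can't be reused for the following vertex, so the store must be
+ * re-emitted each time rather than folded into a register write.)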
+ */ + return false; + } + + if (src->is_if) + return false; + + if (src->parent_instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src->parent_instr); + if (intr->intrinsic != nir_intrinsic_store_output || + !nir_src_is_const(intr->src[1])) { + return false; + } + + uint32_t frac; + *dst = ntr_output_decl(c, intr, &frac); + dst->Index += ntr_src_as_uint(c, intr->src[1]); + + return frac == 0; +} + +/* If this reg is used only for storing an output, then in the simple + * cases we can write directly to the TGSI output instead of having + * store_output emit its own MOV. + */ +static bool +ntr_try_store_reg_in_tgsi_output(struct ntr_compile *c, struct ureg_dst *dst, + nir_intrinsic_instr *reg_decl) +{ + assert(reg_decl->intrinsic == nir_intrinsic_decl_reg); + + *dst = ureg_dst_undef(); + + /* Look for a single use for try_store_in_tgsi_output */ + nir_src *use = NULL; + nir_foreach_reg_load(src, reg_decl) { + nir_intrinsic_instr *load = nir_instr_as_intrinsic(src->parent_instr); + nir_foreach_use_including_if(load_use, &load->def) { + /* We can only have one use */ + if (use != NULL) + return false; + + use = load_use; + } + } + + if (use == NULL) + return false; + + return ntr_try_store_in_tgsi_output_with_use(c, dst, use); +} + +/* If this SSA def is used only for storing an output, then in the simple + * cases we can write directly to the TGSI output instead of having + * store_output emit its own MOV. + */ +static bool +ntr_try_store_ssa_in_tgsi_output(struct ntr_compile *c, struct ureg_dst *dst, + nir_def *def) +{ + *dst = ureg_dst_undef(); + + if (!list_is_singular(&def->uses)) + return false; + + nir_foreach_use_including_if(use, def) { + return ntr_try_store_in_tgsi_output_with_use(c, dst, use); + } + unreachable("We have one use"); +} + +static void +ntr_setup_inputs(struct ntr_compile *c) +{ + if (c->s->info.stage != MESA_SHADER_FRAGMENT) + return; + + unsigned num_inputs = 0; + int num_input_arrays = 0; + + nir_foreach_shader_in_variable(var, c->s) { + const struct glsl_type *type = ntr_shader_input_type(c, var); + unsigned array_len = + glsl_count_attribute_slots(type, false); + + num_inputs = MAX2(num_inputs, var->data.driver_location + array_len); + } + + c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs); + + nir_foreach_shader_in_variable(var, c->s) { + const struct glsl_type *type = ntr_shader_input_type(c, var); + unsigned array_len = + glsl_count_attribute_slots(type, false); + + unsigned interpolation = TGSI_INTERPOLATE_CONSTANT; + unsigned sample_loc; + struct ureg_src decl; + + if (c->s->info.stage == MESA_SHADER_FRAGMENT) { + interpolation = + tgsi_get_interp_mode(var->data.interpolation, + var->data.location == VARYING_SLOT_COL0 || + var->data.location == VARYING_SLOT_COL1); + + if (var->data.location == VARYING_SLOT_POS) + interpolation = TGSI_INTERPOLATE_LINEAR; + } + + unsigned semantic_name, semantic_index; + ntr_get_gl_varying_semantic(c, var->data.location, + &semantic_name, &semantic_index); + + if (var->data.sample) { + sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE; + } else if (var->data.centroid) { + sample_loc = TGSI_INTERPOLATE_LOC_CENTROID; + c->centroid_inputs |= (BITSET_MASK(array_len) << + var->data.driver_location); + } else { + sample_loc = TGSI_INTERPOLATE_LOC_CENTER; + } + + unsigned array_id = 0; + if (glsl_type_is_array(type)) + array_id = ++num_input_arrays; + + uint32_t usage_mask = ntr_tgsi_var_usage_mask(var); + + decl = ureg_DECL_fs_input_centroid_layout(c->ureg, + 
semantic_name, + semantic_index, + interpolation, + sample_loc, + var->data.driver_location, + usage_mask, + array_id, array_len); + + if (semantic_name == TGSI_SEMANTIC_FACE) { + struct ureg_dst temp = ntr_temp(c); + if (c->native_integers) { + /* NIR is ~0 front and 0 back, while TGSI is +1 front */ + ntr_SGE(c, temp, decl, ureg_imm1f(c->ureg, 0)); + } else { + /* tgsi docs say that floating point FACE will be positive for + * frontface and negative for backface, but realistically + * GLSL-to-TGSI had been doing MOV_SAT to turn it into 0.0 vs 1.0. + * Copy that behavior, since some drivers (r300) have been doing a + * 0.0 vs 1.0 backface (and I don't think anybody has a non-1.0 + * front face). + */ + temp.Saturate = true; + ntr_MOV(c, temp, decl); + + } + decl = ureg_src(temp); + } + + for (unsigned i = 0; i < array_len; i++) { + c->input_index_map[var->data.driver_location + i] = decl; + c->input_index_map[var->data.driver_location + i].Index += i; + } + } +} + +static int +ntr_sort_by_location(const nir_variable *a, const nir_variable *b) +{ + return a->data.location - b->data.location; +} + +/** + * Workaround for virglrenderer requiring that TGSI FS output color variables + * are declared in order. Besides, it's a lot nicer to read the TGSI this way. + */ +static void +ntr_setup_outputs(struct ntr_compile *c) +{ + if (c->s->info.stage != MESA_SHADER_FRAGMENT) + return; + + nir_sort_variables_with_modes(c->s, ntr_sort_by_location, nir_var_shader_out); + + nir_foreach_shader_out_variable(var, c->s) { + if (var->data.location == FRAG_RESULT_COLOR) + ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1); + + unsigned semantic_name, semantic_index; + tgsi_get_gl_frag_result_semantic(var->data.location, + &semantic_name, &semantic_index); + + (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index); + } +} + +static enum tgsi_texture_type +tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow) +{ + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + if (is_shadow) + return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D; + else + return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D; + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_EXTERNAL: + if (is_shadow) + return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D; + else + return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D; + case GLSL_SAMPLER_DIM_3D: + return TGSI_TEXTURE_3D; + case GLSL_SAMPLER_DIM_CUBE: + if (is_shadow) + return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE; + else + return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE; + case GLSL_SAMPLER_DIM_RECT: + if (is_shadow) + return TGSI_TEXTURE_SHADOWRECT; + else + return TGSI_TEXTURE_RECT; + case GLSL_SAMPLER_DIM_MS: + return is_array ? 
TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA; + case GLSL_SAMPLER_DIM_BUF: + return TGSI_TEXTURE_BUFFER; + default: + unreachable("unknown sampler dim"); + } +} + +static enum tgsi_return_type +tgsi_return_type_from_base_type(enum glsl_base_type type) +{ + switch (type) { + case GLSL_TYPE_INT: + return TGSI_RETURN_TYPE_SINT; + case GLSL_TYPE_UINT: + return TGSI_RETURN_TYPE_UINT; + case GLSL_TYPE_FLOAT: + return TGSI_RETURN_TYPE_FLOAT; + default: + unreachable("unexpected texture type"); + } +} + +static void +ntr_setup_uniforms(struct ntr_compile *c) +{ + nir_foreach_uniform_variable(var, c->s) { + if (glsl_type_is_sampler(glsl_without_array(var->type)) || + glsl_type_is_texture(glsl_without_array(var->type))) { + /* Don't use this size for the check for samplers -- arrays of structs + * containing samplers should be ignored, and just the separate lowered + * sampler uniform decl used. + */ + int size = glsl_type_get_sampler_count(var->type) + + glsl_type_get_texture_count(var->type); + + const struct glsl_type *stype = glsl_without_array(var->type); + enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype), + glsl_sampler_type_is_array(stype), + glsl_sampler_type_is_shadow(stype)); + enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype)); + for (int i = 0; i < size; i++) { + ureg_DECL_sampler_view(c->ureg, var->data.binding + i, + target, ret_type, ret_type, ret_type, ret_type); + ureg_DECL_sampler(c->ureg, var->data.binding + i); + } + } else if (glsl_contains_atomic(var->type)) { + uint32_t offset = var->data.offset / 4; + uint32_t size = glsl_atomic_size(var->type) / 4; + ureg_DECL_hw_atomic(c->ureg, offset, offset + size - 1, var->data.binding, 0); + } + + /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0 + * size declaration happens with other UBOs below. + */ + } + + nir_foreach_image_variable(var, c->s) { + int image_count = glsl_type_get_image_count(var->type); + const struct glsl_type *itype = glsl_without_array(var->type); + enum tgsi_texture_type tex_type = + tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(itype), + glsl_sampler_type_is_array(itype), false); + + for (int i = 0; i < image_count; i++) { + c->images[var->data.binding] = ureg_DECL_image(c->ureg, + var->data.binding + i, + tex_type, + var->data.image.format, + !(var->data.access & ACCESS_NON_WRITEABLE), + false); + } + } + + c->first_ubo = ~0; + + unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0}; + nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) { + int ubo = var->data.driver_location; + if (ubo == -1) + continue; + + if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo)) + c->first_ubo = MIN2(c->first_ubo, ubo); + + unsigned size = glsl_get_explicit_size(var->interface_type, false); + + int array_size = 1; + if (glsl_type_is_interface(glsl_without_array(var->type))) + array_size = MAX2(1, glsl_get_aoa_size(var->type)); + + for (int i = 0; i < array_size; i++) { + /* Even if multiple NIR variables are in the same uniform block, their + * explicit size is the size of the block. 
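+ *
+ * E.g. two variables from one 48-byte block both report a size of 48 here,
+ * so the assert below just checks that the recorded sizes agree.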
+ */ + if (ubo_sizes[ubo + i]) + assert(ubo_sizes[ubo + i] == size); + + ubo_sizes[ubo + i] = size; + } + } + + for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) { + if (ubo_sizes[i]) + ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i); + } + + if (c->options->lower_ssbo_bindings) { + c->first_ssbo = 255; + nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ssbo) { + if (c->first_ssbo > var->data.binding) + c->first_ssbo = var->data.binding; + } + } else + c->first_ssbo = 0; + + /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic + * counters + */ + bool atomic = false; + for (int i = 0; i < c->s->info.num_ssbos; ++i) + ureg_DECL_buffer(c->ureg, c->first_ssbo + i, atomic); + +} + +static void +ntr_setup_registers(struct ntr_compile *c) +{ + assert(c->num_temps == 0); + + nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) { + /* Permanently allocate all the array regs at the start. */ + unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg); + unsigned index = nir_reg->def.index; + + if (num_array_elems != 0) { + struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, num_array_elems, true); + c->reg_temp[index] = decl; + assert(c->num_temps == decl.Index); + c->num_temps += num_array_elems; + } + } + c->first_non_array_temp = c->num_temps; + + /* After that, allocate non-array regs in our virtual space that we'll + * register-allocate before ureg emit. + */ + nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) { + unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg); + unsigned num_components = nir_intrinsic_num_components(nir_reg); + unsigned bit_size = nir_intrinsic_bit_size(nir_reg); + unsigned index = nir_reg->def.index; + + /* We already handled arrays */ + if (num_array_elems == 0) { + struct ureg_dst decl; + uint32_t write_mask = BITFIELD_MASK(num_components); + + if (!ntr_try_store_reg_in_tgsi_output(c, &decl, nir_reg)) { + if (bit_size == 64) { + if (num_components > 2) { + fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n", + num_components, index); + } + + write_mask = ntr_64bit_write_mask(write_mask); + } + + decl = ureg_writemask(ntr_temp(c), write_mask); + } + c->reg_temp[index] = decl; + } + } +} + +static struct ureg_src +ntr_get_load_const_src(struct ntr_compile *c, nir_load_const_instr *instr) +{ + int num_components = instr->def.num_components; + + if (!c->native_integers) { + float values[4]; + assert(instr->def.bit_size == 32); + for (int i = 0; i < num_components; i++) + values[i] = uif(instr->value[i].u32); + + return ureg_DECL_immediate(c->ureg, values, num_components); + } else { + uint32_t values[4]; + + if (instr->def.bit_size == 32) { + for (int i = 0; i < num_components; i++) + values[i] = instr->value[i].u32; + } else { + assert(num_components <= 2); + for (int i = 0; i < num_components; i++) { + values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff; + values[i * 2 + 1] = instr->value[i].u64 >> 32; + } + num_components *= 2; + } + + return ureg_DECL_immediate_uint(c->ureg, values, num_components); + } +} + +static struct ureg_src +ntr_reladdr(struct ntr_compile *c, struct ureg_src addr, int addr_index) +{ + assert(addr_index < ARRAY_SIZE(c->addr_reg)); + + for (int i = 0; i <= addr_index; i++) { + if (!c->addr_declared[i]) { + c->addr_reg[i] = ureg_writemask(ureg_DECL_address(c->ureg), + TGSI_WRITEMASK_X); + c->addr_declared[i] = true; + } + } + + if (c->native_integers) + ntr_UARL(c, c->addr_reg[addr_index], addr); + else + ntr_ARL(c, 
c->addr_reg[addr_index], addr); + return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0); +} + +/* Forward declare for recursion with indirects */ +static struct ureg_src +ntr_get_src(struct ntr_compile *c, nir_src src); + +static struct ureg_src +ntr_get_chased_src(struct ntr_compile *c, nir_legacy_src *src) +{ + if (src->is_ssa) { + if (src->ssa->parent_instr->type == nir_instr_type_load_const) + return ntr_get_load_const_src(c, nir_instr_as_load_const(src->ssa->parent_instr)); + + return c->ssa_temp[src->ssa->index]; + } else { + struct ureg_dst reg_temp = c->reg_temp[src->reg.handle->index]; + reg_temp.Index += src->reg.base_offset; + + if (src->reg.indirect) { + struct ureg_src offset = ntr_get_src(c, nir_src_for_ssa(src->reg.indirect)); + return ureg_src_indirect(ureg_src(reg_temp), + ntr_reladdr(c, offset, 0)); + } else { + return ureg_src(reg_temp); + } + } +} + +static struct ureg_src +ntr_get_src(struct ntr_compile *c, nir_src src) +{ + nir_legacy_src chased = nir_legacy_chase_src(&src); + return ntr_get_chased_src(c, &chased); +} + +static struct ureg_src +ntr_get_alu_src(struct ntr_compile *c, nir_alu_instr *instr, int i) +{ + /* We only support 32-bit float modifiers. The only other modifier type + * officially supported by TGSI is 32-bit integer negates, but even those are + * broken on virglrenderer, so skip lowering all integer and f64 float mods. + * + * The options->lower_fabs requests that we not have native source modifiers + * for fabs, and instead emit MAX(a,-a) for nir_op_fabs. + */ + nir_legacy_alu_src src = + nir_legacy_chase_alu_src(&instr->src[i], !c->options->lower_fabs); + struct ureg_src usrc = ntr_get_chased_src(c, &src.src); + + /* Expand double/dvec2 src references to TGSI swizzles using a pair of 32-bit + * channels. We skip this for undefs, as those don't get split to vec2s (but + * the specific swizzles from an undef don't matter) + */ + if (nir_src_bit_size(instr->src[i].src) == 64 && + !(src.src.is_ssa && src.src.ssa->parent_instr->type == nir_instr_type_undef)) { + int chan1 = 1; + if (nir_op_infos[instr->op].input_sizes[i] == 0) { + chan1 = instr->def.num_components > 1 ? 1 : 0; + } + usrc = ureg_swizzle(usrc, + src.swizzle[0] * 2, + src.swizzle[0] * 2 + 1, + src.swizzle[chan1] * 2, + src.swizzle[chan1] * 2 + 1); + } else { + usrc = ureg_swizzle(usrc, + src.swizzle[0], + src.swizzle[1], + src.swizzle[2], + src.swizzle[3]); + } + + if (src.fabs) + usrc = ureg_abs(usrc); + if (src.fneg) + usrc = ureg_negate(usrc); + + return usrc; +} + +/* Reswizzles a source so that the unset channels in the write mask still refer + * to one of the channels present in the write mask. + */ +static struct ureg_src +ntr_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask) +{ + assert(write_mask); + int first_chan = ffs(write_mask) - 1; + return ureg_swizzle(src, + (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan, + (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan, + (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan, + (write_mask & TGSI_WRITEMASK_W) ? 
TGSI_SWIZZLE_W : first_chan); +} + +static struct ureg_dst +ntr_get_ssa_def_decl(struct ntr_compile *c, nir_def *ssa) +{ + uint32_t writemask = BITSET_MASK(ssa->num_components); + if (ssa->bit_size == 64) + writemask = ntr_64bit_write_mask(writemask); + + struct ureg_dst dst; + if (!ntr_try_store_ssa_in_tgsi_output(c, &dst, ssa)) + dst = ntr_temp(c); + + c->ssa_temp[ssa->index] = ntr_swizzle_for_write_mask(ureg_src(dst), writemask); + + return ureg_writemask(dst, writemask); +} + +static struct ureg_dst +ntr_get_chased_dest_decl(struct ntr_compile *c, nir_legacy_dest *dest) +{ + if (dest->is_ssa) + return ntr_get_ssa_def_decl(c, dest->ssa); + else + return c->reg_temp[dest->reg.handle->index]; +} + +static struct ureg_dst +ntr_get_chased_dest(struct ntr_compile *c, nir_legacy_dest *dest) +{ + struct ureg_dst dst = ntr_get_chased_dest_decl(c, dest); + + if (!dest->is_ssa) { + dst.Index += dest->reg.base_offset; + + if (dest->reg.indirect) { + struct ureg_src offset = ntr_get_src(c, nir_src_for_ssa(dest->reg.indirect)); + dst = ureg_dst_indirect(dst, ntr_reladdr(c, offset, 0)); + } + } + + return dst; +} + +static struct ureg_dst +ntr_get_dest(struct ntr_compile *c, nir_def *def) +{ + nir_legacy_dest chased = nir_legacy_chase_dest(def); + return ntr_get_chased_dest(c, &chased); +} + +static struct ureg_dst +ntr_get_alu_dest(struct ntr_compile *c, nir_def *def) +{ + nir_legacy_alu_dest chased = nir_legacy_chase_alu_dest(def); + struct ureg_dst dst = ntr_get_chased_dest(c, &chased.dest); + + if (chased.fsat) + dst.Saturate = true; + + /* Only registers get write masks */ + if (chased.dest.is_ssa) + return dst; + + int dst_64 = def->bit_size == 64; + unsigned write_mask = chased.write_mask; + + if (dst_64) + return ureg_writemask(dst, ntr_64bit_write_mask(write_mask)); + else + return ureg_writemask(dst, write_mask); +} + +/* For an SSA dest being populated by a constant src, replace the storage with + * a copy of the ureg_src. + */ +static void +ntr_store_def(struct ntr_compile *c, nir_def *def, struct ureg_src src) +{ + if (!src.Indirect && !src.DimIndirect) { + switch (src.File) { + case TGSI_FILE_IMMEDIATE: + case TGSI_FILE_INPUT: + case TGSI_FILE_CONSTANT: + case TGSI_FILE_SYSTEM_VALUE: + c->ssa_temp[def->index] = src; + return; + } + } + + ntr_MOV(c, ntr_get_ssa_def_decl(c, def), src); +} + +static void +ntr_store(struct ntr_compile *c, nir_def *def, struct ureg_src src) +{ + nir_legacy_dest chased = nir_legacy_chase_dest(def); + + if (chased.is_ssa) + ntr_store_def(c, chased.ssa, src); + else { + struct ureg_dst dst = ntr_get_chased_dest(c, &chased); + ntr_MOV(c, dst, src); + } +} + +static void +ntr_emit_scalar(struct ntr_compile *c, unsigned tgsi_op, + struct ureg_dst dst, + struct ureg_src src0, + struct ureg_src src1) +{ + unsigned i; + + /* POW is the only 2-operand scalar op. 
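+ *
+ * Each enabled dst channel gets its own instruction, e.g. an RCP with a
+ * .xy writemask expands to:
+ *
+ *    RCP dst.x, src.xxxx
+ *    RCP dst.y, src.yyyy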
*/ + if (tgsi_op != TGSI_OPCODE_POW) + src1 = src0; + + for (i = 0; i < 4; i++) { + if (dst.WriteMask & (1 << i)) { + ntr_insn(c, tgsi_op, + ureg_writemask(dst, 1 << i), + ureg_scalar(src0, i), + ureg_scalar(src1, i), + ureg_src_undef(), ureg_src_undef()); + } + } +} + +static void +ntr_emit_alu(struct ntr_compile *c, nir_alu_instr *instr) +{ + struct ureg_src src[4]; + struct ureg_dst dst; + unsigned i; + int dst_64 = instr->def.bit_size == 64; + int src_64 = nir_src_bit_size(instr->src[0].src) == 64; + int num_srcs = nir_op_infos[instr->op].num_inputs; + + /* Don't try to translate folded fsat since their source won't be valid */ + if (instr->op == nir_op_fsat && nir_legacy_fsat_folds(instr)) + return; + + c->precise = instr->exact; + + assert(num_srcs <= ARRAY_SIZE(src)); + for (i = 0; i < num_srcs; i++) + src[i] = ntr_get_alu_src(c, instr, i); + for (; i < ARRAY_SIZE(src); i++) + src[i] = ureg_src_undef(); + + dst = ntr_get_alu_dest(c, &instr->def); + + static enum tgsi_opcode op_map[][2] = { + [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV }, + + /* fabs/fneg 32-bit are special-cased below. */ + [nir_op_fabs] = { 0, TGSI_OPCODE_DABS }, + [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG }, + + [nir_op_fdot2] = { TGSI_OPCODE_DP2 }, + [nir_op_fdot3] = { TGSI_OPCODE_DP3 }, + [nir_op_fdot4] = { TGSI_OPCODE_DP4 }, + [nir_op_fdot2_replicated] = { TGSI_OPCODE_DP2 }, + [nir_op_fdot3_replicated] = { TGSI_OPCODE_DP3 }, + [nir_op_fdot4_replicated] = { TGSI_OPCODE_DP4 }, + [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR }, + [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC }, + [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL }, + [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND }, + [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV }, + [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV }, + [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV }, + + [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP }, + [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ }, + [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT }, + + /* The conversions will have one combination of src and dst bitsize. 
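+ * (E.g. nir_op_f2f32 only ever sees a 64-bit source here, so only its
+ * 64-bit slot is filled; nir_op_f2f64 only its 32-bit slot.)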
*/ + [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F }, + [nir_op_f2f64] = { TGSI_OPCODE_F2D }, + [nir_op_i2i64] = { TGSI_OPCODE_I2I64 }, + + [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I }, + [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 }, + [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U }, + [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 }, + [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F }, + [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D }, + [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F }, + [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D }, + + [nir_op_slt] = { TGSI_OPCODE_SLT }, + [nir_op_sge] = { TGSI_OPCODE_SGE }, + [nir_op_seq] = { TGSI_OPCODE_SEQ }, + [nir_op_sne] = { TGSI_OPCODE_SNE }, + + [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT }, + [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE }, + [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ }, + [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE }, + + [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT }, + [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE }, + [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ }, + [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE }, + + [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT }, + [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE }, + + [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS }, + [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG }, + [nir_op_fsign] = { TGSI_OPCODE_SSG, TGSI_OPCODE_DSSG }, + [nir_op_isign] = { TGSI_OPCODE_ISSG, TGSI_OPCODE_I64SSG }, + [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC }, + [nir_op_fddx] = { TGSI_OPCODE_DDX }, + [nir_op_fddy] = { TGSI_OPCODE_DDY }, + [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX }, + [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY }, + [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE }, + [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE }, + [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H }, + [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H }, + [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE }, + [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE }, + [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI }, + [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV }, + [nir_op_bit_count] = { TGSI_OPCODE_POPC }, + [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB }, + [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB }, + [nir_op_find_lsb] = { TGSI_OPCODE_LSB }, + [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD }, + [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD }, + [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL }, + [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL }, + [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD }, + [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD }, + [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI }, + [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI }, + [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL }, + [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR }, + [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR }, + + /* These bitwise ops don't care about 32 vs 64 types, so they have the + * same TGSI op. 
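+ * (A 64-bit value is stored as a pair of 32-bit channels, and NOT/AND/OR/
+ * XOR work bitwise per channel, so the 32-bit opcodes are already correct
+ * for both.)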
+ */ + [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT }, + [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND }, + [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR }, + [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR }, + + [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN }, + [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN }, + [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN }, + [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX }, + [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX }, + [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX }, + [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD }, + [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 }, + }; + + if (src_64 && !dst_64) { + if (num_srcs == 2 || nir_op_infos[instr->op].output_type == nir_type_bool32) { + /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead + * of .xy. + */ + assert(!(dst.WriteMask & TGSI_WRITEMASK_YW)); + } else { + /* TGSI 64bit-to-32-bit conversions only generate results in the .xy + * channels and will need to get fixed up. + */ + assert(!(dst.WriteMask & TGSI_WRITEMASK_ZW)); + } + } + + bool table_op64 = src_64; + if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) { + /* The normal path for NIR to TGSI ALU op translation */ + ntr_insn(c, op_map[instr->op][table_op64], + dst, src[0], src[1], src[2], src[3]); + } else { + /* Special cases for NIR to TGSI ALU op translation. */ + + /* TODO: Use something like the ntr_store() path for the MOV calls so we + * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm. + */ + + switch (instr->op) { + case nir_op_u2u64: + ntr_AND(c, dst, ureg_swizzle(src[0], + TGSI_SWIZZLE_X, TGSI_SWIZZLE_X, + TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y), + ureg_imm4u(c->ureg, ~0, 0, ~0, 0)); + break; + + case nir_op_i2i32: + case nir_op_u2u32: + assert(src_64); + ntr_MOV(c, dst, ureg_swizzle(src[0], + TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z, + TGSI_SWIZZLE_X, TGSI_SWIZZLE_X)); + break; + + case nir_op_fabs: + /* Try to eliminate */ + if (!c->options->lower_fabs && nir_legacy_float_mod_folds(instr)) + break; + + if (c->options->lower_fabs) + ntr_MAX(c, dst, src[0], ureg_negate(src[0])); + else + ntr_MOV(c, dst, ureg_abs(src[0])); + break; + + case nir_op_fsat: + if (dst_64) { + ntr_MIN(c, dst, src[0], ntr_64bit_1f(c)); + ntr_MAX(c, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0)); + } else { + ntr_MOV(c, ureg_saturate(dst), src[0]); + } + break; + + case nir_op_fneg: + /* Try to eliminate */ + if (nir_legacy_float_mod_folds(instr)) + break; + + ntr_MOV(c, dst, ureg_negate(src[0])); + break; + + /* NOTE: TGSI 32-bit math ops have the old "one source channel + * replicated to all dst channels" behavior, while 64 is normal mapping + * of src channels to dst. 
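+ *
+ * E.g. "RCP dst.xy, src" would write RCP(src.x) to both dst.x and dst.y,
+ * which is why the cases below go through ntr_emit_scalar() with the
+ * source swizzled per channel instead of emitting one vector op.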
+ */ + case nir_op_frcp: + assert(!dst_64); + ntr_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef()); + break; + + case nir_op_frsq: + assert(!dst_64); + ntr_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef()); + break; + + case nir_op_fsqrt: + assert(!dst_64); + ntr_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], ureg_src_undef()); + break; + + case nir_op_fexp2: + assert(!dst_64); + ntr_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef()); + break; + + case nir_op_flog2: + assert(!dst_64); + ntr_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef()); + break; + + case nir_op_b2f32: + ntr_AND(c, dst, src[0], ureg_imm1f(c->ureg, 1.0)); + break; + + case nir_op_b2f64: + ntr_AND(c, dst, + ureg_swizzle(src[0], + TGSI_SWIZZLE_X, TGSI_SWIZZLE_X, + TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y), + ntr_64bit_1f(c)); + break; + + case nir_op_b2i32: + ntr_AND(c, dst, src[0], ureg_imm1u(c->ureg, 1)); + break; + + case nir_op_b2i64: + ntr_AND(c, dst, + ureg_swizzle(src[0], + TGSI_SWIZZLE_X, TGSI_SWIZZLE_X, + TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y), + ureg_imm4u(c->ureg, 1, 0, 1, 0)); + break; + + case nir_op_fsin: + ntr_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef()); + break; + + case nir_op_fcos: + ntr_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef()); + break; + + case nir_op_fsub: + assert(!dst_64); + ntr_ADD(c, dst, src[0], ureg_negate(src[1])); + break; + + case nir_op_isub: + assert(!dst_64); + ntr_UADD(c, dst, src[0], ureg_negate(src[1])); + break; + + case nir_op_fmod: + unreachable("should be handled by .lower_fmod = true"); + break; + + case nir_op_fpow: + ntr_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]); + break; + + case nir_op_flrp: + ntr_LRP(c, dst, src[2], src[1], src[0]); + break; + + case nir_op_pack_64_2x32_split: + ntr_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_XZ), + ureg_swizzle(src[0], + TGSI_SWIZZLE_X, TGSI_SWIZZLE_X, + TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y)); + ntr_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_YW), + ureg_swizzle(src[1], + TGSI_SWIZZLE_X, TGSI_SWIZZLE_X, + TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y)); + break; + + case nir_op_unpack_64_2x32_split_x: + ntr_MOV(c, dst, ureg_swizzle(src[0], + TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z, + TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z)); + break; + + case nir_op_unpack_64_2x32_split_y: + ntr_MOV(c, dst, ureg_swizzle(src[0], + TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W, + TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W)); + break; + + case nir_op_b32csel: + if (nir_src_bit_size(instr->src[1].src) == 64) { + ntr_UCMP(c, dst, ureg_swizzle(src[0], + TGSI_SWIZZLE_X, TGSI_SWIZZLE_X, + TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y), + src[1], src[2]); + } else { + ntr_UCMP(c, dst, src[0], src[1], src[2]); + } + break; + + case nir_op_fcsel: + /* If CMP isn't supported, then the flags that enable NIR to generate + * this opcode should also not be set. + */ + assert(!c->options->lower_cmp); + + /* Implement this as CMP(-abs(src0), src1, src2). */ + ntr_CMP(c, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]); + break; + + case nir_op_fcsel_gt: + /* If CMP isn't supported, then the flags that enable NIR to generate + * these opcodes should also not be set. + */ + assert(!c->options->lower_cmp); + + ntr_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]); + break; + + case nir_op_fcsel_ge: + /* If CMP isn't supported, then the flags that enable NIR to generate + * these opcodes should also not be set. + */ + assert(!c->options->lower_cmp); + + /* Implement this as if !(src0 < 0.0) was identical to src0 >= 0.0. 
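+ *
+ * CMP selects src1 where src0 < 0.0 and src2 elsewhere, so passing
+ * (src[2], src[1]) makes the result src[1] exactly when src0 >= 0.0
+ * (ignoring the NaN difference between the two forms).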
*/
+ ntr_CMP(c, dst, src[0], src[2], src[1]);
+ break;
+
+ case nir_op_frexp_sig:
+ case nir_op_frexp_exp:
+ unreachable("covered by nir_lower_frexp()");
+ break;
+
+ case nir_op_ldexp:
+ assert(dst_64); /* 32bit handled in table. */
+ ntr_DLDEXP(c, dst, src[0],
+ ureg_swizzle(src[1],
+ TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
+ TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
+ break;
+
+ case nir_op_vec4:
+ case nir_op_vec3:
+ case nir_op_vec2:
+ unreachable("covered by nir_lower_vec_to_movs()");
+
+ default:
+ fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
+ unreachable("Unknown NIR opcode");
+ }
+ }
+
+ c->precise = false;
+}
+
+static struct ureg_src
+ntr_ureg_src_indirect(struct ntr_compile *c, struct ureg_src usrc,
+ nir_src src, int addr_reg)
+{
+ if (nir_src_is_const(src)) {
+ usrc.Index += ntr_src_as_uint(c, src);
+ return usrc;
+ } else {
+ return ureg_src_indirect(usrc, ntr_reladdr(c, ntr_get_src(c, src), addr_reg));
+ }
+}
+
+static struct ureg_dst
+ntr_ureg_dst_indirect(struct ntr_compile *c, struct ureg_dst dst,
+ nir_src src)
+{
+ if (nir_src_is_const(src)) {
+ dst.Index += ntr_src_as_uint(c, src);
+ return dst;
+ } else {
+ return ureg_dst_indirect(dst, ntr_reladdr(c, ntr_get_src(c, src), 0));
+ }
+}
+
+static struct ureg_src
+ntr_ureg_src_dimension_indirect(struct ntr_compile *c, struct ureg_src usrc,
+ nir_src src)
+{
+ if (nir_src_is_const(src)) {
+ return ureg_src_dimension(usrc, ntr_src_as_uint(c, src));
+ } else {
+ return ureg_src_dimension_indirect(usrc,
+ ntr_reladdr(c, ntr_get_src(c, src), 1),
+ 0);
+ }
+}
+
+static struct ureg_dst
+ntr_ureg_dst_dimension_indirect(struct ntr_compile *c, struct ureg_dst udst,
+ nir_src src)
+{
+ if (nir_src_is_const(src)) {
+ return ureg_dst_dimension(udst, ntr_src_as_uint(c, src));
+ } else {
+ return ureg_dst_dimension_indirect(udst,
+ ntr_reladdr(c, ntr_get_src(c, src), 1),
+ 0);
+ }
+}
+/* Some load operations in NIR will have a fractional offset that we need to
+ * swizzle down before storing to the result register.
+ */
+static struct ureg_src
+ntr_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
+{
+ return ureg_swizzle(src,
+ frac,
+ frac + MIN2(num_components - 1, 1),
+ frac + MIN2(num_components - 1, 2),
+ frac + MIN2(num_components - 1, 3));
+}
+
+
+static void
+ntr_emit_load_ubo(struct ntr_compile *c, nir_intrinsic_instr *instr)
+{
+ int bit_size = instr->def.bit_size;
+ assert(bit_size == 32 || instr->num_components <= 2);
+
+ struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);
+
+ struct ureg_dst addr_temp = ureg_dst_undef();
+
+ if (nir_src_is_const(instr->src[0])) {
+ src = ureg_src_dimension(src, ntr_src_as_uint(c, instr->src[0]));
+ } else {
+ /* virglrenderer requires that indirect UBO references have the UBO
+ * array's base index in the Index field, not added to the indirect
+ * address.
+ *
+ * Many nir intrinsics have a base address const value for the start of
+ * their array indirection, but load_ubo doesn't. We fake it by
+ * subtracting it off here.
+ */
+ addr_temp = ntr_temp(c);
+ ntr_UADD(c, addr_temp, ntr_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
+ src = ureg_src_dimension_indirect(src,
+ ntr_reladdr(c, ureg_src(addr_temp), 1),
+ c->first_ubo);
+ }
+
+ if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
+ /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
+ * file.
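+ *
+ * E.g. base 1 plus a constant vec4 offset of 2 becomes a plain CONST[3]
+ * (with the buffer index in the dimension), and the component offset is
+ * folded into the swizzle by ntr_shift_by_frac() below.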
+ */ + src.Index = nir_intrinsic_base(instr); + + if (nir_src_is_const(instr->src[1])) { + src.Index += ntr_src_as_uint(c, instr->src[1]); + } else { + src = ureg_src_indirect(src, ntr_reladdr(c, ntr_get_src(c, instr->src[1]), 0)); + } + + int start_component = nir_intrinsic_component(instr); + if (bit_size == 64) + start_component *= 2; + + src = ntr_shift_by_frac(src, start_component, + instr->num_components * bit_size / 32); + + ntr_store(c, &instr->def, src); + } else { + /* PIPE_CAP_LOAD_CONSTBUF: Not necessarily vec4 aligned, emit a + * TGSI_OPCODE_LOAD instruction from the const file. + */ + struct ntr_insn *insn = + ntr_insn(c, TGSI_OPCODE_LOAD, + ntr_get_dest(c, &instr->def), + src, ntr_get_src(c, instr->src[1]), + ureg_src_undef(), ureg_src_undef()); + insn->is_mem = true; + insn->tex_target = 0; + insn->mem_qualifier = 0; + insn->mem_format = 0; /* unused */ + } +} + +static unsigned +ntr_get_access_qualifier(nir_intrinsic_instr *instr) +{ + enum gl_access_qualifier access = nir_intrinsic_access(instr); + unsigned qualifier = 0; + + if (access & ACCESS_COHERENT) + qualifier |= TGSI_MEMORY_COHERENT; + if (access & ACCESS_VOLATILE) + qualifier |= TGSI_MEMORY_VOLATILE; + if (access & ACCESS_RESTRICT) + qualifier |= TGSI_MEMORY_RESTRICT; + + return qualifier; +} + +static unsigned +ntr_translate_atomic_op(nir_atomic_op op) +{ + switch (op) { + case nir_atomic_op_iadd: return TGSI_OPCODE_ATOMUADD; + case nir_atomic_op_fadd: return TGSI_OPCODE_ATOMFADD; + case nir_atomic_op_imin: return TGSI_OPCODE_ATOMIMIN; + case nir_atomic_op_imax: return TGSI_OPCODE_ATOMIMAX; + case nir_atomic_op_umin: return TGSI_OPCODE_ATOMUMIN; + case nir_atomic_op_umax: return TGSI_OPCODE_ATOMUMAX; + case nir_atomic_op_iand: return TGSI_OPCODE_ATOMAND; + case nir_atomic_op_ixor: return TGSI_OPCODE_ATOMXOR; + case nir_atomic_op_ior: return TGSI_OPCODE_ATOMOR; + case nir_atomic_op_xchg: return TGSI_OPCODE_ATOMXCHG; + default: unreachable("invalid atomic"); + } +} + +static void +ntr_emit_mem(struct ntr_compile *c, nir_intrinsic_instr *instr, + nir_variable_mode mode) +{ + bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo || + instr->intrinsic == nir_intrinsic_store_shared); + bool is_load = (instr->intrinsic == nir_intrinsic_atomic_counter_read || + instr->intrinsic == nir_intrinsic_load_ssbo || + instr->intrinsic == nir_intrinsic_load_shared); + unsigned opcode; + struct ureg_src src[4]; + int num_src = 0; + int next_src; + struct ureg_dst addr_temp = ureg_dst_undef(); + + struct ureg_src memory; + switch (mode) { + case nir_var_mem_ssbo: + memory = ntr_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER, + c->first_ssbo), + instr->src[is_store ? 
1 : 0], 2); + next_src = 1; + break; + case nir_var_mem_shared: + memory = ureg_src_register(TGSI_FILE_MEMORY, 0); + next_src = 0; + break; + case nir_var_uniform: { /* HW atomic buffers */ + nir_src src = instr->src[0]; + uint32_t offset = (ntr_extract_const_src_offset(&src) + + nir_intrinsic_range_base(instr)) / 4; + + memory = ureg_src_register(TGSI_FILE_HW_ATOMIC, offset); + /* ntr_ureg_src_indirect, except dividing by 4 */ + if (nir_src_is_const(src)) { + memory.Index += nir_src_as_uint(src) / 4; + } else { + addr_temp = ntr_temp(c); + ntr_USHR(c, addr_temp, ntr_get_src(c, src), ureg_imm1i(c->ureg, 2)); + memory = ureg_src_indirect(memory, ntr_reladdr(c, ureg_src(addr_temp), 2)); + } + memory = ureg_src_dimension(memory, nir_intrinsic_base(instr)); + next_src = 0; + break; + } + + default: + unreachable("unknown memory type"); + } + + if (is_store) { + src[num_src++] = ntr_get_src(c, instr->src[next_src + 1]); /* offset */ + src[num_src++] = ntr_get_src(c, instr->src[0]); /* value */ + } else { + src[num_src++] = memory; + if (instr->intrinsic != nir_intrinsic_get_ssbo_size) { + src[num_src++] = ntr_get_src(c, instr->src[next_src++]); /* offset */ + switch (instr->intrinsic) { + case nir_intrinsic_atomic_counter_inc: + src[num_src++] = ureg_imm1i(c->ureg, 1); + break; + case nir_intrinsic_atomic_counter_post_dec: + src[num_src++] = ureg_imm1i(c->ureg, -1); + break; + default: + if (!is_load) + src[num_src++] = ntr_get_src(c, instr->src[next_src++]); /* value */ + break; + } + } + } + + + switch (instr->intrinsic) { + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_shared_atomic: + opcode = ntr_translate_atomic_op(nir_intrinsic_atomic_op(instr)); + break; + case nir_intrinsic_atomic_counter_add: + case nir_intrinsic_atomic_counter_inc: + case nir_intrinsic_atomic_counter_post_dec: + opcode = TGSI_OPCODE_ATOMUADD; + break; + case nir_intrinsic_atomic_counter_min: + opcode = TGSI_OPCODE_ATOMIMIN; + break; + case nir_intrinsic_atomic_counter_max: + opcode = TGSI_OPCODE_ATOMIMAX; + break; + case nir_intrinsic_atomic_counter_and: + opcode = TGSI_OPCODE_ATOMAND; + break; + case nir_intrinsic_atomic_counter_or: + opcode = TGSI_OPCODE_ATOMOR; + break; + case nir_intrinsic_atomic_counter_xor: + opcode = TGSI_OPCODE_ATOMXOR; + break; + case nir_intrinsic_atomic_counter_exchange: + opcode = TGSI_OPCODE_ATOMXCHG; + break; + case nir_intrinsic_atomic_counter_comp_swap: + case nir_intrinsic_ssbo_atomic_swap: + case nir_intrinsic_shared_atomic_swap: + opcode = TGSI_OPCODE_ATOMCAS; + src[num_src++] = ntr_get_src(c, instr->src[next_src++]); + break; + case nir_intrinsic_atomic_counter_read: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_shared: + opcode = TGSI_OPCODE_LOAD; + break; + case nir_intrinsic_store_ssbo: + case nir_intrinsic_store_shared: + opcode = TGSI_OPCODE_STORE; + break; + case nir_intrinsic_get_ssbo_size: + opcode = TGSI_OPCODE_RESQ; + break; + default: + unreachable("unknown memory op"); + } + + unsigned qualifier = 0; + if (mode == nir_var_mem_ssbo && + instr->intrinsic != nir_intrinsic_get_ssbo_size) { + qualifier = ntr_get_access_qualifier(instr); + } + + struct ureg_dst dst; + if (is_store) { + dst = ureg_dst(memory); + + unsigned write_mask = nir_intrinsic_write_mask(instr); + if (nir_src_bit_size(instr->src[0]) == 64) + write_mask = ntr_64bit_write_mask(write_mask); + dst = ureg_writemask(dst, write_mask); + } else { + dst = ntr_get_dest(c, &instr->def); + } + + struct ntr_insn *insn = ntr_insn(c, opcode, dst, src[0], src[1], src[2], src[3]); + insn->tex_target = 
TGSI_TEXTURE_BUFFER; + insn->mem_qualifier = qualifier; + insn->mem_format = 0; /* unused */ + insn->is_mem = true; +} + +static void +ntr_emit_image_load_store(struct ntr_compile *c, nir_intrinsic_instr *instr) +{ + unsigned op; + struct ureg_src srcs[4]; + int num_src = 0; + enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); + bool is_array = nir_intrinsic_image_array(instr); + + struct ureg_dst temp = ureg_dst_undef(); + + enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(dim, is_array, false); + + struct ureg_src resource; + switch (instr->intrinsic) { + case nir_intrinsic_bindless_image_load: + case nir_intrinsic_bindless_image_store: + case nir_intrinsic_bindless_image_size: + case nir_intrinsic_bindless_image_samples: + case nir_intrinsic_bindless_image_atomic: + case nir_intrinsic_bindless_image_atomic_swap: + resource = ntr_get_src(c, instr->src[0]); + break; + default: + resource = ntr_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0), + instr->src[0], 2); + resource.Index += nir_intrinsic_range_base(instr); + } + + struct ureg_dst dst; + if (instr->intrinsic == nir_intrinsic_image_store || + instr->intrinsic == nir_intrinsic_bindless_image_store) { + dst = ureg_dst(resource); + } else { + srcs[num_src++] = resource; + dst = ntr_get_dest(c, &instr->def); + } + struct ureg_dst opcode_dst = dst; + + if (instr->intrinsic != nir_intrinsic_image_size && + instr->intrinsic != nir_intrinsic_image_samples && + instr->intrinsic != nir_intrinsic_bindless_image_size && + instr->intrinsic != nir_intrinsic_bindless_image_samples) { + struct ureg_src coord = ntr_get_src(c, instr->src[1]); + + if (dim == GLSL_SAMPLER_DIM_MS) { + temp = ntr_temp(c); + ntr_MOV(c, temp, coord); + ntr_MOV(c, ureg_writemask(temp, TGSI_WRITEMASK_W), + ureg_scalar(ntr_get_src(c, instr->src[2]), TGSI_SWIZZLE_X)); + coord = ureg_src(temp); + } + srcs[num_src++] = coord; + + if (instr->intrinsic != nir_intrinsic_image_load && + instr->intrinsic != nir_intrinsic_bindless_image_load) { + srcs[num_src++] = ntr_get_src(c, instr->src[3]); /* data */ + if (instr->intrinsic == nir_intrinsic_image_atomic_swap || + instr->intrinsic == nir_intrinsic_bindless_image_atomic_swap) + srcs[num_src++] = ntr_get_src(c, instr->src[4]); /* data2 */ + } + } + + switch (instr->intrinsic) { + case nir_intrinsic_image_load: + case nir_intrinsic_bindless_image_load: + op = TGSI_OPCODE_LOAD; + break; + case nir_intrinsic_image_store: + case nir_intrinsic_bindless_image_store: + op = TGSI_OPCODE_STORE; + break; + case nir_intrinsic_image_size: + case nir_intrinsic_bindless_image_size: + op = TGSI_OPCODE_RESQ; + break; + case nir_intrinsic_image_samples: + case nir_intrinsic_bindless_image_samples: + op = TGSI_OPCODE_RESQ; + opcode_dst = ureg_writemask(ntr_temp(c), TGSI_WRITEMASK_W); + break; + case nir_intrinsic_image_atomic: + case nir_intrinsic_bindless_image_atomic: + op = ntr_translate_atomic_op(nir_intrinsic_atomic_op(instr)); + break; + case nir_intrinsic_image_atomic_swap: + case nir_intrinsic_bindless_image_atomic_swap: + op = TGSI_OPCODE_ATOMCAS; + break; + default: + unreachable("bad op"); + } + + struct ntr_insn *insn = ntr_insn(c, op, opcode_dst, srcs[0], srcs[1], srcs[2], srcs[3]); + insn->tex_target = target; + insn->mem_qualifier = ntr_get_access_qualifier(instr); + insn->mem_format = nir_intrinsic_format(instr); + insn->is_mem = true; + + if (instr->intrinsic == nir_intrinsic_image_samples || + instr->intrinsic == nir_intrinsic_bindless_image_samples) + ntr_MOV(c, dst, 
ureg_scalar(ureg_src(opcode_dst), 3)); +} + +static void +ntr_emit_load_input(struct ntr_compile *c, nir_intrinsic_instr *instr) +{ + uint32_t frac = nir_intrinsic_component(instr); + uint32_t num_components = instr->num_components; + unsigned base = nir_intrinsic_base(instr); + struct ureg_src input; + nir_io_semantics semantics = nir_intrinsic_io_semantics(instr); + bool is_64 = instr->def.bit_size == 64; + + if (c->s->info.stage == MESA_SHADER_VERTEX) { + input = ureg_DECL_vs_input(c->ureg, base); + for (int i = 1; i < semantics.num_slots; i++) + ureg_DECL_vs_input(c->ureg, base + i); + } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) { + unsigned semantic_name, semantic_index; + ntr_get_gl_varying_semantic(c, semantics.location, + &semantic_name, &semantic_index); + + /* XXX: ArrayID is used in r600 gs inputs */ + uint32_t array_id = 0; + + input = ureg_DECL_input_layout(c->ureg, + semantic_name, + semantic_index, + base, + ntr_tgsi_usage_mask(frac, + instr->num_components, + is_64), + array_id, + semantics.num_slots); + } else { + input = c->input_index_map[base]; + } + + if (is_64) + num_components *= 2; + + input = ntr_shift_by_frac(input, frac, num_components); + + switch (instr->intrinsic) { + case nir_intrinsic_load_input: + input = ntr_ureg_src_indirect(c, input, instr->src[0], 0); + ntr_store(c, &instr->def, input); + break; + + case nir_intrinsic_load_per_vertex_input: + input = ntr_ureg_src_indirect(c, input, instr->src[1], 0); + input = ntr_ureg_src_dimension_indirect(c, input, instr->src[0]); + ntr_store(c, &instr->def, input); + break; + + case nir_intrinsic_load_interpolated_input: { + input = ntr_ureg_src_indirect(c, input, instr->src[1], 0); + + nir_intrinsic_instr *bary_instr = + nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); + + switch (bary_instr->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_sample: + /* For these, we know that the barycentric load matches the + * interpolation on the input declaration, so we can use it directly. + */ + ntr_store(c, &instr->def, input); + break; + + case nir_intrinsic_load_barycentric_centroid: + /* If the input was declared centroid, then there's no need to + * emit the extra TGSI interp instruction, we can just read the + * input. + */ + if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) { + ntr_store(c, &instr->def, input); + } else { + ntr_INTERP_CENTROID(c, ntr_get_dest(c, &instr->def), input); + } + break; + + case nir_intrinsic_load_barycentric_at_sample: + /* We stored the sample in the fake "bary" dest. */ + ntr_INTERP_SAMPLE(c, ntr_get_dest(c, &instr->def), input, + ntr_get_src(c, instr->src[0])); + break; + + case nir_intrinsic_load_barycentric_at_offset: + /* We stored the offset in the fake "bary" dest. */ + ntr_INTERP_OFFSET(c, ntr_get_dest(c, &instr->def), input, + ntr_get_src(c, instr->src[0])); + break; + + default: + unreachable("bad barycentric interp intrinsic\n"); + } + break; + } + + default: + unreachable("bad load input intrinsic\n"); + } +} + +static void +ntr_emit_store_output(struct ntr_compile *c, nir_intrinsic_instr *instr) +{ + struct ureg_src src = ntr_get_src(c, instr->src[0]); + + if (src.File == TGSI_FILE_OUTPUT) { + /* If our src is the output file, that's an indication that we were able + * to emit the output stores in the generating instructions and we have + * nothing to do here. 
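+ * (ntr_try_store_in_tgsi_output() pointed the generating instruction's + * destination at the OUT register directly, so emitting a MOV here would be + * redundant.)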
+ */ + return; + } + + uint32_t frac; + struct ureg_dst out = ntr_output_decl(c, instr, &frac); + + if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) { + out = ntr_ureg_dst_indirect(c, out, instr->src[2]); + out = ntr_ureg_dst_dimension_indirect(c, out, instr->src[1]); + } else { + out = ntr_ureg_dst_indirect(c, out, instr->src[1]); + } + + uint8_t swizzle[4] = { 0, 0, 0, 0 }; + for (int i = frac; i < 4; i++) { + if (out.WriteMask & (1 << i)) + swizzle[i] = i - frac; + } + + src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]); + + ntr_MOV(c, out, src); +} + +static void +ntr_emit_load_output(struct ntr_compile *c, nir_intrinsic_instr *instr) +{ + nir_io_semantics semantics = nir_intrinsic_io_semantics(instr); + + /* ntr_try_store_in_tgsi_output() optimization is not valid if normal + * load_output is present. + */ + assert(c->s->info.stage != MESA_SHADER_VERTEX && + (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output)); + + uint32_t frac; + struct ureg_dst out = ntr_output_decl(c, instr, &frac); + + if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) { + out = ntr_ureg_dst_indirect(c, out, instr->src[1]); + out = ntr_ureg_dst_dimension_indirect(c, out, instr->src[0]); + } else { + out = ntr_ureg_dst_indirect(c, out, instr->src[0]); + } + + struct ureg_dst dst = ntr_get_dest(c, &instr->def); + struct ureg_src out_src = ureg_src(out); + + /* Don't swizzle unavailable channels of the output into the writemasked-out + * components. Avoids compile failures in virglrenderer with + * TESS_LEVEL_INNER. + */ + int fill_channel = ffs(dst.WriteMask) - 1; + uint8_t swizzles[4] = { 0, 1, 2, 3 }; + for (int i = 0; i < 4; i++) + if (!(dst.WriteMask & (1 << i))) + swizzles[i] = fill_channel; + out_src = ureg_swizzle(out_src, swizzles[0], swizzles[1], swizzles[2], swizzles[3]); + + if (semantics.fb_fetch_output) + ntr_FBFETCH(c, dst, out_src); + else + ntr_MOV(c, dst, out_src); +} + +static void +ntr_emit_load_sysval(struct ntr_compile *c, nir_intrinsic_instr *instr) +{ + gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic); + enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval); + struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0); + + /* virglrenderer doesn't like references to channels of the sysval that + * aren't defined, even if they aren't really read. (GLSL compile fails on + * gl_NumWorkGroups.w, for example). + */ + uint32_t write_mask = BITSET_MASK(instr->def.num_components); + sv = ntr_swizzle_for_write_mask(sv, write_mask); + + /* TGSI and NIR define these intrinsics as always loading ints, but they can + * still appear on hardware with non-native-integers fragment shaders using + * the draw path (i915g). In that case, having called nir_lower_int_to_float + * means that we actually want floats instead.
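+ * The !native_integers switch below patches this up with U2F for the integer + * sysvals that can occur on that path (vertex and instance IDs).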
+ */ + if (!c->native_integers) { + switch (instr->intrinsic) { + case nir_intrinsic_load_vertex_id: + case nir_intrinsic_load_instance_id: + ntr_U2F(c, ntr_get_dest(c, &instr->def), sv); + return; + + default: + break; + } + } + + ntr_store(c, &instr->def, sv); +} + +static void +ntr_emit_barrier(struct ntr_compile *c, nir_intrinsic_instr *intr) +{ + bool compute = gl_shader_stage_is_compute(c->s->info.stage); + + if (nir_intrinsic_memory_scope(intr) != SCOPE_NONE) { + nir_variable_mode modes = nir_intrinsic_memory_modes(intr); + unsigned membar = 0; + + if (modes & nir_var_image) + membar |= TGSI_MEMBAR_SHADER_IMAGE; + + if (modes & nir_var_mem_shared) + membar |= TGSI_MEMBAR_SHARED; + + /* Atomic counters are lowered to SSBOs, there's no NIR mode corresponding + * exactly to atomics. Take the closest match. + */ + if (modes & nir_var_mem_ssbo) + membar |= TGSI_MEMBAR_SHADER_BUFFER | TGSI_MEMBAR_ATOMIC_BUFFER; + + if (modes & nir_var_mem_global) + membar |= TGSI_MEMBAR_SHADER_BUFFER; + + /* If we only need workgroup scope (not device-scope), we might be able to + * optimize a bit. + */ + if (membar && compute && + nir_intrinsic_memory_scope(intr) == SCOPE_WORKGROUP) { + + membar |= TGSI_MEMBAR_THREAD_GROUP; + } + + /* Only emit a memory barrier if there are any relevant modes */ + if (membar) + ntr_MEMBAR(c, ureg_imm1u(c->ureg, membar)); + } + + if (nir_intrinsic_execution_scope(intr) != SCOPE_NONE) { + assert(compute || c->s->info.stage == MESA_SHADER_TESS_CTRL); + ntr_BARRIER(c); + } +} + +static void +ntr_emit_intrinsic(struct ntr_compile *c, nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ubo_vec4: + ntr_emit_load_ubo(c, instr); + break; + + /* Vertex */ + case nir_intrinsic_load_vertex_id: + case nir_intrinsic_load_vertex_id_zero_base: + case nir_intrinsic_load_base_vertex: + case nir_intrinsic_load_base_instance: + case nir_intrinsic_load_instance_id: + case nir_intrinsic_load_draw_id: + case nir_intrinsic_load_invocation_id: + case nir_intrinsic_load_frag_coord: + case nir_intrinsic_load_point_coord: + case nir_intrinsic_load_front_face: + case nir_intrinsic_load_sample_id: + case nir_intrinsic_load_sample_pos: + case nir_intrinsic_load_sample_mask_in: + case nir_intrinsic_load_helper_invocation: + case nir_intrinsic_load_tess_coord: + case nir_intrinsic_load_patch_vertices_in: + case nir_intrinsic_load_primitive_id: + case nir_intrinsic_load_tess_level_outer: + case nir_intrinsic_load_tess_level_inner: + case nir_intrinsic_load_local_invocation_id: + case nir_intrinsic_load_workgroup_id: + case nir_intrinsic_load_num_workgroups: + case nir_intrinsic_load_workgroup_size: + case nir_intrinsic_load_subgroup_size: + case nir_intrinsic_load_subgroup_invocation: + case nir_intrinsic_load_subgroup_eq_mask: + case nir_intrinsic_load_subgroup_ge_mask: + case nir_intrinsic_load_subgroup_gt_mask: + case nir_intrinsic_load_subgroup_lt_mask: + case nir_intrinsic_load_subgroup_le_mask: + ntr_emit_load_sysval(c, instr); + break; + + case nir_intrinsic_load_input: + case nir_intrinsic_load_per_vertex_input: + case nir_intrinsic_load_interpolated_input: + ntr_emit_load_input(c, instr); + break; + + case nir_intrinsic_store_output: + case nir_intrinsic_store_per_vertex_output: + ntr_emit_store_output(c, instr); + break; + + case nir_intrinsic_load_output: + case nir_intrinsic_load_per_vertex_output: + ntr_emit_load_output(c, instr); + break; + + case nir_intrinsic_demote: + ntr_DEMOTE(c); + break; + + case nir_intrinsic_discard: 
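+ /* Unconditional discard maps to TGSI KILL, which takes no condition; the + * conditional form below uses KILL_IF, killing where the source is negative. + */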
+ ntr_KILL(c); + break; + + case nir_intrinsic_discard_if: { + struct ureg_src cond = ureg_scalar(ntr_get_src(c, instr->src[0]), 0); + + if (c->native_integers) { + struct ureg_dst temp = ureg_writemask(ntr_temp(c), 1); + ntr_AND(c, temp, cond, ureg_imm1f(c->ureg, 1.0)); + ntr_KILL_IF(c, ureg_scalar(ureg_negate(ureg_src(temp)), 0)); + } else { + /* For !native_integers, the bool got lowered to 1.0 or 0.0. */ + ntr_KILL_IF(c, ureg_negate(cond)); + } + break; + } + + case nir_intrinsic_is_helper_invocation: + ntr_READ_HELPER(c, ntr_get_dest(c, &instr->def)); + break; + + case nir_intrinsic_vote_all: + ntr_VOTE_ALL(c, ntr_get_dest(c, &instr->def), ntr_get_src(c, instr->src[0])); + return; + case nir_intrinsic_vote_any: + ntr_VOTE_ANY(c, ntr_get_dest(c, &instr->def), ntr_get_src(c, instr->src[0])); + return; + case nir_intrinsic_vote_ieq: + ntr_VOTE_EQ(c, ntr_get_dest(c, &instr->def), ntr_get_src(c, instr->src[0])); + return; + case nir_intrinsic_ballot: + ntr_BALLOT(c, ntr_get_dest(c, &instr->def), ntr_get_src(c, instr->src[0])); + return; + case nir_intrinsic_read_first_invocation: + ntr_READ_FIRST(c, ntr_get_dest(c, &instr->def), ntr_get_src(c, instr->src[0])); + return; + case nir_intrinsic_read_invocation: + ntr_READ_INVOC(c, ntr_get_dest(c, &instr->def), ntr_get_src(c, instr->src[0]), ntr_get_src(c, instr->src[1])); + return; + + case nir_intrinsic_load_ssbo: + case nir_intrinsic_store_ssbo: + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + case nir_intrinsic_get_ssbo_size: + ntr_emit_mem(c, instr, nir_var_mem_ssbo); + break; + + case nir_intrinsic_load_shared: + case nir_intrinsic_store_shared: + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + ntr_emit_mem(c, instr, nir_var_mem_shared); + break; + + case nir_intrinsic_atomic_counter_read: + case nir_intrinsic_atomic_counter_add: + case nir_intrinsic_atomic_counter_inc: + case nir_intrinsic_atomic_counter_post_dec: + case nir_intrinsic_atomic_counter_min: + case nir_intrinsic_atomic_counter_max: + case nir_intrinsic_atomic_counter_and: + case nir_intrinsic_atomic_counter_or: + case nir_intrinsic_atomic_counter_xor: + case nir_intrinsic_atomic_counter_exchange: + case nir_intrinsic_atomic_counter_comp_swap: + ntr_emit_mem(c, instr, nir_var_uniform); + break; + case nir_intrinsic_atomic_counter_pre_dec: + unreachable("Should be lowered by ntr_lower_atomic_pre_dec()"); + break; + + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_size: + case nir_intrinsic_image_samples: + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + case nir_intrinsic_bindless_image_load: + case nir_intrinsic_bindless_image_store: + case nir_intrinsic_bindless_image_size: + case nir_intrinsic_bindless_image_samples: + case nir_intrinsic_bindless_image_atomic: + case nir_intrinsic_bindless_image_atomic_swap: + ntr_emit_image_load_store(c, instr); + break; + + case nir_intrinsic_barrier: + ntr_emit_barrier(c, instr); + break; + + case nir_intrinsic_end_primitive: + ntr_ENDPRIM(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr))); + break; + + case nir_intrinsic_emit_vertex: + ntr_EMIT(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr))); + break; + + /* In TGSI we don't actually generate the barycentric coords, and emit + * interp intrinsics later. However, we do need to store the + * load_barycentric_at_* argument so that we can use it at that point.
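+ * e.g. for load_barycentric_at_sample the sample index gets stored to this + * def here, and ntr_emit_load_input() later passes it as the extra + * INTERP_SAMPLE operand.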
+ */ + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + break; + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: + ntr_store(c, &instr->def, ntr_get_src(c, instr->src[0])); + break; + + case nir_intrinsic_shader_clock: + ntr_CLOCK(c, ntr_get_dest(c, &instr->def)); + break; + + case nir_intrinsic_decl_reg: + case nir_intrinsic_load_reg: + case nir_intrinsic_load_reg_indirect: + case nir_intrinsic_store_reg: + case nir_intrinsic_store_reg_indirect: + /* fully consumed */ + break; + + default: + fprintf(stderr, "Unknown intrinsic: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + break; + } +} + +struct ntr_tex_operand_state { + struct ureg_src srcs[4]; + unsigned i; +}; + +static void +ntr_push_tex_arg(struct ntr_compile *c, + nir_tex_instr *instr, + nir_tex_src_type tex_src_type, + struct ntr_tex_operand_state *s) +{ + int tex_src = nir_tex_instr_src_index(instr, tex_src_type); + if (tex_src < 0) + return; + + nir_src *src = &instr->src[tex_src].src; + + /* virglrenderer workaround that's hard to do in tgsi_translate: Make sure + * that TG4's immediate offset arg is float-typed. + */ + if (instr->op == nir_texop_tg4 && tex_src_type == nir_tex_src_backend2 && + nir_src_is_const(*src)) { + nir_const_value *consts = nir_src_as_const_value(*src); + s->srcs[s->i++] = ureg_imm4f(c->ureg, + consts[0].f32, + consts[1].f32, + consts[2].f32, + consts[3].f32); + return; + } + + s->srcs[s->i++] = ntr_get_src(c, *src); +} + +static void +ntr_emit_texture(struct ntr_compile *c, nir_tex_instr *instr) +{ + struct ureg_dst dst = ntr_get_dest(c, &instr->def); + enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow); + unsigned tex_opcode; + + int tex_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_texture_handle); + int sampler_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle); + + struct ureg_src sampler; + if (tex_handle_src >= 0 && sampler_handle_src >= 0) { + /* It seems we can't get separate tex/sampler on GL, just use one of the handles */ + sampler = ntr_get_src(c, instr->src[tex_handle_src].src); + assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1); + } else { + assert(tex_handle_src == -1 && sampler_handle_src == -1); + sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index); + int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset); + if (sampler_src >= 0) { + struct ureg_src reladdr = ntr_get_src(c, instr->src[sampler_src].src); + sampler = ureg_src_indirect(sampler, ntr_reladdr(c, reladdr, 2)); + } + } + + switch (instr->op) { + case nir_texop_tex: + if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) > + MAX2(instr->coord_components, 2) + instr->is_shadow) + tex_opcode = TGSI_OPCODE_TXP; + else + tex_opcode = TGSI_OPCODE_TEX; + break; + case nir_texop_txf: + case nir_texop_txf_ms: + tex_opcode = TGSI_OPCODE_TXF; + + if (c->has_txf_lz) { + int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod); + if (lod_src >= 0 && + nir_src_is_const(instr->src[lod_src].src) && + ntr_src_as_uint(c, instr->src[lod_src].src) == 0) { + tex_opcode = TGSI_OPCODE_TXF_LZ; + } + } + break; + case nir_texop_txl: + tex_opcode = TGSI_OPCODE_TXL; + break; + case nir_texop_txb: + tex_opcode = TGSI_OPCODE_TXB; + break; + case nir_texop_txd: + tex_opcode = TGSI_OPCODE_TXD; + break; + 
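/* txs and query_levels both become TXQ, which returns the selected LOD's + * size in .xyz and the texture's level count in .w (query_levels reads .w + * below). + */ +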
case nir_texop_txs: + tex_opcode = TGSI_OPCODE_TXQ; + break; + case nir_texop_tg4: + tex_opcode = TGSI_OPCODE_TG4; + break; + case nir_texop_query_levels: + tex_opcode = TGSI_OPCODE_TXQ; + break; + case nir_texop_lod: + tex_opcode = TGSI_OPCODE_LODQ; + break; + case nir_texop_texture_samples: + tex_opcode = TGSI_OPCODE_TXQS; + break; + default: + unreachable("unsupported tex op"); + } + + struct ntr_tex_operand_state s = { .i = 0 }; + ntr_push_tex_arg(c, instr, nir_tex_src_backend1, &s); + ntr_push_tex_arg(c, instr, nir_tex_src_backend2, &s); + + /* non-coord arg for TXQ */ + if (tex_opcode == TGSI_OPCODE_TXQ) { + ntr_push_tex_arg(c, instr, nir_tex_src_lod, &s); + /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's + * scalar + */ + s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0); + } + + if (s.i > 1) { + if (tex_opcode == TGSI_OPCODE_TEX) + tex_opcode = TGSI_OPCODE_TEX2; + if (tex_opcode == TGSI_OPCODE_TXB) + tex_opcode = TGSI_OPCODE_TXB2; + if (tex_opcode == TGSI_OPCODE_TXL) + tex_opcode = TGSI_OPCODE_TXL2; + } + + if (instr->op == nir_texop_txd) { + /* Derivs appear in their own src args */ + int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx); + int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy); + s.srcs[s.i++] = ntr_get_src(c, instr->src[ddx].src); + s.srcs[s.i++] = ntr_get_src(c, instr->src[ddy].src); + } + + if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) { + if (c->screen->get_param(c->screen, + PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) { + sampler = ureg_scalar(sampler, instr->component); + s.srcs[s.i++] = ureg_src_undef(); + } else { + s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component); + } + } + + s.srcs[s.i++] = sampler; + + enum tgsi_return_type tex_type; + switch (instr->dest_type) { + case nir_type_float32: + tex_type = TGSI_RETURN_TYPE_FLOAT; + break; + case nir_type_int32: + tex_type = TGSI_RETURN_TYPE_SINT; + break; + case nir_type_uint32: + tex_type = TGSI_RETURN_TYPE_UINT; + break; + default: + unreachable("unknown texture type"); + } + + struct ureg_dst tex_dst; + if (instr->op == nir_texop_query_levels) + tex_dst = ureg_writemask(ntr_temp(c), TGSI_WRITEMASK_W); + else + tex_dst = dst; + + while (s.i < 4) + s.srcs[s.i++] = ureg_src_undef(); + + struct ntr_insn *insn = ntr_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]); + insn->tex_target = target; + insn->tex_return_type = tex_type; + insn->is_tex = true; + + int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset); + if (tex_offset_src >= 0) { + struct ureg_src offset = ntr_get_src(c, instr->src[tex_offset_src].src); + + insn->tex_offset[0].File = offset.File; + insn->tex_offset[0].Index = offset.Index; + insn->tex_offset[0].SwizzleX = offset.SwizzleX; + insn->tex_offset[0].SwizzleY = offset.SwizzleY; + insn->tex_offset[0].SwizzleZ = offset.SwizzleZ; + insn->tex_offset[0].Padding = 0; + } + + if (nir_tex_instr_has_explicit_tg4_offsets(instr)) { + for (uint8_t i = 0; i < 4; ++i) { + struct ureg_src imm = ureg_imm2i(c->ureg, instr->tg4_offsets[i][0], instr->tg4_offsets[i][1]); + insn->tex_offset[i].File = imm.File; + insn->tex_offset[i].Index = imm.Index; + insn->tex_offset[i].SwizzleX = imm.SwizzleX; + insn->tex_offset[i].SwizzleY = imm.SwizzleY; + insn->tex_offset[i].SwizzleZ = imm.SwizzleZ; + } + } + + if (instr->op == nir_texop_query_levels) + ntr_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3)); +} + +static void +ntr_emit_jump(struct ntr_compile *c, nir_jump_instr *jump) +{ + switch (jump->type) { + case 
nir_jump_break: + ntr_BRK(c); + break; + + case nir_jump_continue: + ntr_CONT(c); + break; + + default: + fprintf(stderr, "Unknown jump instruction: "); + nir_print_instr(&jump->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } +} + +static void +ntr_emit_ssa_undef(struct ntr_compile *c, nir_undef_instr *instr) +{ + /* Nothing to do but make sure that we have some storage to deref. */ + (void)ntr_get_ssa_def_decl(c, &instr->def); +} + +static void +ntr_emit_instr(struct ntr_compile *c, nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_deref: + /* ignored, will be walked by nir_intrinsic_image_*_deref. */ + break; + + case nir_instr_type_alu: + ntr_emit_alu(c, nir_instr_as_alu(instr)); + break; + + case nir_instr_type_intrinsic: + ntr_emit_intrinsic(c, nir_instr_as_intrinsic(instr)); + break; + + case nir_instr_type_load_const: + /* Nothing to do here, as load consts are done directly from + * ntr_get_src() (since many constant NIR srcs will often get folded + * directly into a register file index instead of as a TGSI src). + */ + break; + + case nir_instr_type_tex: + ntr_emit_texture(c, nir_instr_as_tex(instr)); + break; + + case nir_instr_type_jump: + ntr_emit_jump(c, nir_instr_as_jump(instr)); + break; + + case nir_instr_type_undef: + ntr_emit_ssa_undef(c, nir_instr_as_undef(instr)); + break; + + default: + fprintf(stderr, "Unknown NIR instr type: "); + nir_print_instr(instr, stderr); + fprintf(stderr, "\n"); + abort(); + } +} + +static void +ntr_emit_if(struct ntr_compile *c, nir_if *if_stmt) +{ + if (c->native_integers) + ntr_UIF(c, c->if_cond); + else + ntr_IF(c, c->if_cond); + + ntr_emit_cf_list(c, &if_stmt->then_list); + + if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) { + ntr_ELSE(c); + ntr_emit_cf_list(c, &if_stmt->else_list); + } + + ntr_ENDIF(c); +} + +static void +ntr_emit_loop(struct ntr_compile *c, nir_loop *loop) +{ + assert(!nir_loop_has_continue_construct(loop)); + ntr_BGNLOOP(c); + ntr_emit_cf_list(c, &loop->body); + ntr_ENDLOOP(c); +} + +static void +ntr_emit_block(struct ntr_compile *c, nir_block *block) +{ + struct ntr_block *ntr_block = ntr_block_from_nir(c, block); + c->cur_block = ntr_block; + + nir_foreach_instr(instr, block) { + ntr_emit_instr(c, instr); + + /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntr_OPCODE(). */ + if (ureg_get_instruction_number(c->ureg) != 0) { + fprintf(stderr, "Emitted ureg insn during: "); + nir_print_instr(instr, stderr); + fprintf(stderr, "\n"); + unreachable("emitted ureg insn"); + } + } + + /* Set up the if condition for ntr_emit_if(), which we have to do before + * freeing up the temps (the "if" is treated as inside the block for liveness + * purposes, despite not being an instruction) + * + * Note that, while IF and UIF are supposed to look at only .x, virglrenderer + * looks at all of .xyzw. No harm in working around the bug. 
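+ * ureg_scalar() below replicates .x into all four channels, which satisfies + * both the spec'd behavior and virglrenderer's.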
+ */ + nir_if *nif = nir_block_get_following_if(block); + if (nif) + c->if_cond = ureg_scalar(ntr_get_src(c, nif->condition), TGSI_SWIZZLE_X); +} + +static void +ntr_emit_cf_list(struct ntr_compile *c, struct exec_list *list) +{ + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: + ntr_emit_block(c, nir_cf_node_as_block(node)); + break; + + case nir_cf_node_if: + ntr_emit_if(c, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + ntr_emit_loop(c, nir_cf_node_as_loop(node)); + break; + + default: + unreachable("unknown CF type"); + } + } +} + +static void +ntr_emit_block_ureg(struct ntr_compile *c, struct nir_block *block) +{ + struct ntr_block *ntr_block = ntr_block_from_nir(c, block); + + /* Emit the ntr insns to tgsi_ureg. */ + util_dynarray_foreach(&ntr_block->insns, struct ntr_insn, insn) { + const struct tgsi_opcode_info *opcode_info = + tgsi_get_opcode_info(insn->opcode); + + switch (insn->opcode) { + case TGSI_OPCODE_UIF: + ureg_UIF(c->ureg, insn->src[0], &c->cf_label); + break; + + case TGSI_OPCODE_IF: + ureg_IF(c->ureg, insn->src[0], &c->cf_label); + break; + + case TGSI_OPCODE_ELSE: + ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg)); + ureg_ELSE(c->ureg, &c->cf_label); + c->current_if_else = c->cf_label; + break; + + case TGSI_OPCODE_ENDIF: + ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg)); + ureg_ENDIF(c->ureg); + break; + + case TGSI_OPCODE_BGNLOOP: + /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx + * does reference BGNLOOP's. Follow the former behavior unless something comes up + * with a need. + */ + ureg_BGNLOOP(c->ureg, &c->cf_label); + break; + + case TGSI_OPCODE_ENDLOOP: + ureg_ENDLOOP(c->ureg, &c->cf_label); + break; + + default: + if (insn->is_tex) { + int num_offsets = 0; + for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) { + if (insn->tex_offset[i].File != TGSI_FILE_NULL) + num_offsets = i + 1; + } + ureg_tex_insn(c->ureg, insn->opcode, + insn->dst, opcode_info->num_dst, + insn->tex_target, insn->tex_return_type, + insn->tex_offset, + num_offsets, + insn->src, opcode_info->num_src); + } else if (insn->is_mem) { + ureg_memory_insn(c->ureg, insn->opcode, + insn->dst, opcode_info->num_dst, + insn->src, opcode_info->num_src, + insn->mem_qualifier, + insn->tex_target, + insn->mem_format); + } else { + ureg_insn(c->ureg, insn->opcode, + insn->dst, opcode_info->num_dst, + insn->src, opcode_info->num_src, + insn->precise); + } + } + } +} + +static void +ntr_emit_if_ureg(struct ntr_compile *c, nir_if *if_stmt) +{ + /* Note: the last block emitted our IF opcode. */ + + int if_stack = c->current_if_else; + c->current_if_else = c->cf_label; + + /* Either the then or else block includes the ENDIF, which will fix up the + * IF(/ELSE)'s label for jumping + */ + ntr_emit_cf_list_ureg(c, &if_stmt->then_list); + ntr_emit_cf_list_ureg(c, &if_stmt->else_list); + + c->current_if_else = if_stack; +} + +static void +ntr_emit_cf_list_ureg(struct ntr_compile *c, struct exec_list *list) +{ + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: + ntr_emit_block_ureg(c, nir_cf_node_as_block(node)); + break; + + case nir_cf_node_if: + ntr_emit_if_ureg(c, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx + * does reference BGNLOOP's. 
Follow the former behavior unless something comes up + * with a need. + */ + ntr_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body); + break; + + default: + unreachable("unknown CF type"); + } + } +} + +static void +ntr_emit_impl(struct ntr_compile *c, nir_function_impl *impl) +{ + c->impl = impl; + + c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc); + c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc); + + /* Set up the struct ntr_blocks to put insns in */ + c->blocks = _mesa_pointer_hash_table_create(c); + nir_foreach_block(block, impl) { + struct ntr_block *ntr_block = rzalloc(c->blocks, struct ntr_block); + util_dynarray_init(&ntr_block->insns, ntr_block); + _mesa_hash_table_insert(c->blocks, block, ntr_block); + } + + ntr_setup_registers(c); + + c->cur_block = ntr_block_from_nir(c, nir_start_block(impl)); + ntr_setup_inputs(c); + ntr_setup_outputs(c); + ntr_setup_uniforms(c); + + /* Emit the ntr insns */ + ntr_emit_cf_list(c, &impl->body); + + /* Don't do optimized RA if the driver requests it, unless the number of + * temps is too large to be covered by the 16-bit signed int that TGSI + * allocates for the register index. */ + if (!c->options->unoptimized_ra || c->num_temps > 0x7fff) + ntr_allocate_regs(c, impl); + else + ntr_allocate_regs_unoptimized(c, impl); + + /* Turn the ntr insns into actual TGSI tokens */ + ntr_emit_cf_list_ureg(c, &impl->body); + + ralloc_free(c->liveness); + c->liveness = NULL; +} + +static int +type_size(const struct glsl_type *type, bool bindless) +{ + return glsl_count_attribute_slots(type, false); +} + +/* Allow vectorizing of ALU instructions, but avoid vectorizing past what we + * can handle for 64-bit values in TGSI. + */ +static uint8_t +ntr_should_vectorize_instr(const nir_instr *instr, const void *data) +{ + if (instr->type != nir_instr_type_alu) + return 0; + + nir_alu_instr *alu = nir_instr_as_alu(instr); + + switch (alu->op) { + case nir_op_ibitfield_extract: + case nir_op_ubitfield_extract: + case nir_op_bitfield_insert: + /* virglrenderer only looks at the .x channel of the offset/bits operands + * when translating to GLSL. tgsi.rst doesn't seem to require scalar + * offset/bits operands. + * + * https://gitlab.freedesktop.org/virgl/virglrenderer/-/issues/195 + */ + return 1; + + default: + break; + } + + int src_bit_size = nir_src_bit_size(alu->src[0].src); + int dst_bit_size = alu->def.bit_size; + + if (src_bit_size == 64 || dst_bit_size == 64) { + /* Avoid vectorizing 64-bit instructions at all. Despite tgsi.rst + * claiming support, virglrenderer generates bad shaders on the host when + * presented with them. Maybe we can make virgl avoid tickling the + * virglrenderer bugs, but given that glsl-to-TGSI didn't generate vector + * 64-bit instrs in the first place, I don't see much reason to care about + * this. + */ + return 1; + } + + return 4; +} + +static bool +ntr_should_vectorize_io(unsigned align, unsigned bit_size, + unsigned num_components, unsigned high_offset, + nir_intrinsic_instr *low, nir_intrinsic_instr *high, + void *data) +{ + if (bit_size != 32) + return false; + + /* Our offset alignment should always be at least 4 bytes */ + if (align < 4) + return false; + + /* No wrapping off the end of a TGSI reg. We could do a bit better by + * looking at low's actual offset. XXX: With LOAD_CONSTBUF maybe we don't + * need this restriction. + */ + unsigned worst_start_component = align == 4 ?
3 : align / 4; + if (worst_start_component + num_components > 4) + return false; + + return true; +} + +static nir_variable_mode +ntr_no_indirects_mask(nir_shader *s, struct pipe_screen *screen) +{ + unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage); + unsigned indirect_mask = 0; + + if (!screen->get_shader_param(screen, pipe_stage, + PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) { + indirect_mask |= nir_var_shader_in; + } + + if (!screen->get_shader_param(screen, pipe_stage, + PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) { + indirect_mask |= nir_var_shader_out; + } + + if (!screen->get_shader_param(screen, pipe_stage, + PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) { + indirect_mask |= nir_var_function_temp; + } + + return indirect_mask; +} + +static void +ntr_optimize_nir(struct nir_shader *s, struct pipe_screen *screen, + const struct nir_to_rc_options *options) +{ + bool progress; + unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage); + unsigned control_flow_depth = + screen->get_shader_param(screen, pipe_stage, + PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH); + do { + progress = false; + + NIR_PASS_V(s, nir_lower_vars_to_ssa); + NIR_PASS_V(s, nir_split_64bit_vec3_and_vec4); + + NIR_PASS(progress, s, nir_copy_prop); + NIR_PASS(progress, s, nir_opt_algebraic); + NIR_PASS(progress, s, nir_opt_constant_folding); + NIR_PASS(progress, s, nir_opt_remove_phis); + NIR_PASS(progress, s, nir_opt_conditional_discard); + NIR_PASS(progress, s, nir_opt_dce); + NIR_PASS(progress, s, nir_opt_dead_cf); + NIR_PASS(progress, s, nir_opt_cse); + NIR_PASS(progress, s, nir_opt_find_array_copies); + NIR_PASS(progress, s, nir_opt_copy_prop_vars); + NIR_PASS(progress, s, nir_opt_dead_write_vars); + + NIR_PASS(progress, s, nir_opt_if, nir_opt_if_aggressive_last_continue | nir_opt_if_optimize_phi_true_false); + NIR_PASS(progress, s, nir_opt_peephole_select, + control_flow_depth == 0 ? ~0 : 8, true, true); + NIR_PASS(progress, s, nir_opt_algebraic); + NIR_PASS(progress, s, nir_opt_constant_folding); + nir_load_store_vectorize_options vectorize_opts = { + .modes = nir_var_mem_ubo, + .callback = ntr_should_vectorize_io, + .robust_modes = 0, + }; + NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts); + NIR_PASS(progress, s, nir_opt_shrink_stores, true); + NIR_PASS(progress, s, nir_opt_shrink_vectors); + NIR_PASS(progress, s, nir_opt_trivial_continues); + NIR_PASS(progress, s, nir_opt_vectorize, ntr_should_vectorize_instr, NULL); + NIR_PASS(progress, s, nir_opt_undef); + NIR_PASS(progress, s, nir_opt_loop_unroll); + + /* Try to fold addressing math into ubo_vec4's base to avoid load_consts + * and ALU ops for it. + */ + nir_opt_offsets_options offset_options = { + .ubo_vec4_max = ~0, + + /* No const offset in TGSI for shared accesses. */ + .shared_max = 0, + + /* unused intrinsics */ + .uniform_max = 0, + .buffer_max = 0, + }; + + if (options->ubo_vec4_max) + offset_options.ubo_vec4_max = options->ubo_vec4_max; + + NIR_PASS(progress, s, nir_opt_offsets, &offset_options); + } while (progress); + + NIR_PASS_V(s, nir_lower_var_copies); +} + +/* Scalarizes all 64-bit ALU ops. Note that we only actually need to + * scalarize vec3/vec4s, should probably fix that. 
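+ * (A 64-bit vec2 already fits in a single TGSI register, which holds at most + * a dvec2, so scalarizing vec2s would buy nothing.)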
+ */ +static bool +scalarize_64bit(const nir_instr *instr, const void *data) +{ + const nir_alu_instr *alu = nir_instr_as_alu(instr); + + return (alu->def.bit_size == 64 || + nir_src_bit_size(alu->src[0].src) == 64); +} + +static bool +nir_to_rc_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr) +{ + b->cursor = nir_after_instr(&instr->instr); + + switch (instr->intrinsic) { + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ubo_vec4: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_input: + case nir_intrinsic_load_interpolated_input: + case nir_intrinsic_load_per_vertex_input: + case nir_intrinsic_store_output: + case nir_intrinsic_store_per_vertex_output: + case nir_intrinsic_store_ssbo: + break; + default: + return false; + } + + if (instr->num_components <= 2) + return false; + + bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest; + if (has_dest) { + if (instr->def.bit_size != 64) + return false; + } else { + if (nir_src_bit_size(instr->src[0]) != 64) + return false; + } + + nir_intrinsic_instr *first = + nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr)); + nir_intrinsic_instr *second = + nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr)); + + switch (instr->intrinsic) { + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ubo_vec4: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_store_ssbo: + break; + + default: { + nir_io_semantics semantics = nir_intrinsic_io_semantics(second); + semantics.location++; + semantics.num_slots--; + nir_intrinsic_set_io_semantics(second, semantics); + + nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1); + break; + } + } + + first->num_components = 2; + second->num_components -= 2; + if (has_dest) { + first->def.num_components = 2; + second->def.num_components -= 2; + } + + nir_builder_instr_insert(b, &first->instr); + nir_builder_instr_insert(b, &second->instr); + + if (has_dest) { + /* Merge the two loads' results back into a vector. */ + nir_scalar channels[4] = { + nir_get_scalar(&first->def, 0), + nir_get_scalar(&first->def, 1), + nir_get_scalar(&second->def, 0), + nir_get_scalar(&second->def, second->num_components > 1 ? 1 : 0), + }; + nir_def *new = nir_vec_scalars(b, channels, instr->num_components); + nir_def_rewrite_uses(&instr->def, new); + } else { + /* Split the src value across the two stores. 
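+ * The first store keeps components 0-1 and the low two write mask bits; the + * second takes the remaining components, with the write mask shifted down by + * two.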
*/ + b->cursor = nir_before_instr(&instr->instr); + + nir_def *src0 = instr->src[0].ssa; + nir_scalar channels[4] = { 0 }; + for (int i = 0; i < instr->num_components; i++) + channels[i] = nir_get_scalar(src0, i); + + nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3); + nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2); + + nir_src_rewrite(&first->src[0], nir_vec_scalars(b, channels, 2)); + nir_src_rewrite(&second->src[0], + nir_vec_scalars(b, &channels[2], second->num_components)); + } + + int offset_src = -1; + uint32_t offset_amount = 16; + + switch (instr->intrinsic) { + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_ubo: + offset_src = 1; + break; + case nir_intrinsic_load_ubo_vec4: + offset_src = 1; + offset_amount = 1; + break; + case nir_intrinsic_store_ssbo: + offset_src = 2; + break; + default: + break; + } + if (offset_src != -1) { + b->cursor = nir_before_instr(&second->instr); + nir_def *second_offset = + nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount); + nir_src_rewrite(&second->src[offset_src], second_offset); + } + + /* DCE stores we generated with no writemask (nothing else does this + * currently). + */ + if (!has_dest) { + if (nir_intrinsic_write_mask(first) == 0) + nir_instr_remove(&first->instr); + if (nir_intrinsic_write_mask(second) == 0) + nir_instr_remove(&second->instr); + } + + nir_instr_remove(&instr->instr); + + return true; +} + +static bool +nir_to_rc_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr) +{ + int num_components = instr->def.num_components; + + if (instr->def.bit_size != 64 || num_components <= 2) + return false; + + b->cursor = nir_before_instr(&instr->instr); + + nir_load_const_instr *first = + nir_load_const_instr_create(b->shader, 2, 64); + nir_load_const_instr *second = + nir_load_const_instr_create(b->shader, num_components - 2, 64); + + first->value[0] = instr->value[0]; + first->value[1] = instr->value[1]; + second->value[0] = instr->value[2]; + if (num_components == 4) + second->value[1] = instr->value[3]; + + nir_builder_instr_insert(b, &first->instr); + nir_builder_instr_insert(b, &second->instr); + + nir_def *channels[4] = { + nir_channel(b, &first->def, 0), + nir_channel(b, &first->def, 1), + nir_channel(b, &second->def, 0), + num_components == 4 ? 
nir_channel(b, &second->def, 1) : NULL, + }; + nir_def *new = nir_vec(b, channels, num_components); + nir_def_rewrite_uses(&instr->def, new); + nir_instr_remove(&instr->instr); + + return true; +} + +static bool +nir_to_rc_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr, + void *data) +{ + switch (instr->type) { + case nir_instr_type_load_const: + return nir_to_rc_lower_64bit_load_const(b, nir_instr_as_load_const(instr)); + + case nir_instr_type_intrinsic: + return nir_to_rc_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr)); + default: + return false; + } +} + +static bool +nir_to_rc_lower_64bit_to_vec2(nir_shader *s) +{ + return nir_shader_instructions_pass(s, + nir_to_rc_lower_64bit_to_vec2_instr, + nir_metadata_block_index | + nir_metadata_dominance, + NULL); +} + +struct ntr_lower_tex_state { + nir_scalar channels[8]; + unsigned i; +}; + +static void +nir_to_rc_lower_tex_instr_arg(nir_builder *b, + nir_tex_instr *instr, + nir_tex_src_type tex_src_type, + struct ntr_lower_tex_state *s) +{ + int tex_src = nir_tex_instr_src_index(instr, tex_src_type); + if (tex_src < 0) + return; + + nir_def *def = instr->src[tex_src].src.ssa; + for (int i = 0; i < def->num_components; i++) { + s->channels[s->i++] = nir_get_scalar(def, i); + } + + nir_tex_instr_remove_src(instr, tex_src); +} + +/** + * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex + * src. This lets NIR handle the coalescing of the vec4 rather than trying to + * manage it on our own, and may lead to more vectorization. + */ +static bool +nir_to_rc_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data) +{ + if (instr->type != nir_instr_type_tex) + return false; + + nir_tex_instr *tex = nir_instr_as_tex(instr); + + if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0) + return false; + + b->cursor = nir_before_instr(instr); + + struct ntr_lower_tex_state s = {0}; + + nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s); + /* We always have at least two slots for the coordinate, even on 1D. */ + s.i = MAX2(s.i, 2); + + nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s); + s.i = MAX2(s.i, 3); + + nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s); + + /* XXX: LZ */ + nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s); + nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s); + nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s); + + /* No need to pack undefs in unused channels of the tex instr */ + while (!s.channels[s.i - 1].def) + s.i--; + + /* Instead of putting undefs in the unused slots of the vecs, just put in + * another used channel. Otherwise, we'll get unnecessary moves into + * registers. 
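+ * e.g. a 1D shadow sample packs (coord x, comparator) as (x, x, compare) + * rather than (x, undef, compare).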
+ */ + assert(s.channels[0].def != NULL); + for (int i = 1; i < s.i; i++) { + if (!s.channels[i].def) + s.channels[i] = s.channels[0]; + } + + nir_tex_instr_add_src(tex, nir_tex_src_backend1, + nir_vec_scalars(b, s.channels, MIN2(s.i, 4))); + if (s.i > 4) + nir_tex_instr_add_src(tex, nir_tex_src_backend2, + nir_vec_scalars(b, &s.channels[4], s.i - 4)); + + return true; +} + +static bool +nir_to_rc_lower_tex(nir_shader *s) +{ + return nir_shader_instructions_pass(s, + nir_to_rc_lower_tex_instr, + nir_metadata_block_index | + nir_metadata_dominance, + NULL); +} + +static void +ntr_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s, + const struct nir_to_rc_options *ntr_options) +{ + const struct nir_shader_compiler_options *options = s->options; + bool lower_fsqrt = + !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage), + PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED); + + bool force_indirect_unrolling_sampler = + screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL) < 400; + + nir_variable_mode no_indirects_mask = ntr_no_indirects_mask(s, screen); + + if (!options->lower_extract_byte || + !options->lower_extract_word || + !options->lower_insert_byte || + !options->lower_insert_word || + !options->lower_fdph || + !options->lower_flrp64 || + !options->lower_fmod || + !options->lower_rotate || + !options->lower_uadd_carry || + !options->lower_usub_borrow || + !options->lower_uadd_sat || + !options->lower_usub_sat || + !options->lower_uniforms_to_ubo || + !options->lower_vector_cmp || + options->lower_fsqrt != lower_fsqrt || + options->force_indirect_unrolling != no_indirects_mask || + force_indirect_unrolling_sampler) { + nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options); + *new_options = *s->options; + + new_options->lower_extract_byte = true; + new_options->lower_extract_word = true; + new_options->lower_insert_byte = true; + new_options->lower_insert_word = true; + new_options->lower_fdph = true; + new_options->lower_flrp64 = true; + new_options->lower_fmod = true; + new_options->lower_rotate = true; + new_options->lower_uadd_carry = true; + new_options->lower_usub_borrow = true; + new_options->lower_uadd_sat = true; + new_options->lower_usub_sat = true; + new_options->lower_uniforms_to_ubo = true; + new_options->lower_vector_cmp = true; + new_options->lower_fsqrt = lower_fsqrt; + new_options->force_indirect_unrolling = no_indirects_mask; + new_options->force_indirect_unrolling_sampler = force_indirect_unrolling_sampler; + + s->options = new_options; + } +} + +static bool +ntr_lower_atomic_pre_dec_filter(const nir_instr *instr, const void *_data) +{ + return (instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_atomic_counter_pre_dec); +} + +static nir_def * +ntr_lower_atomic_pre_dec_lower(nir_builder *b, nir_instr *instr, void *_data) +{ + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + nir_def *old_result = &intr->def; + intr->intrinsic = nir_intrinsic_atomic_counter_post_dec; + + return nir_iadd_imm(b, old_result, -1); +} + +static bool +ntr_lower_atomic_pre_dec(nir_shader *s) +{ + return nir_shader_lower_instructions(s, + ntr_lower_atomic_pre_dec_filter, + ntr_lower_atomic_pre_dec_lower, NULL); +} + +/* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. 
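+ * + * TXP divides the other components of its single vec4 operand by .w before + * sampling, so it can only cover a plain nir_texop_tex whose coordinate, + * comparator, and projector all fit in that vec4; the loop below lowers the + * projector away for everything else.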
*/ +static void +nir_to_rc_lower_txp(nir_shader *s) +{ + nir_lower_tex_options lower_tex_options = { + .lower_txp = 0, + }; + + nir_foreach_block(block, nir_shader_get_entrypoint(s)) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_tex) + continue; + nir_tex_instr *tex = nir_instr_as_tex(instr); + + if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0) + continue; + + bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0; + bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT; + bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0; + + /* We can do TXP for any tex (not txg) where we can fit all the + * coordinates and comparator and projector in one vec4 without any + * other modifiers to add on. + * + * nir_lower_tex() only handles the lowering on a sampler-dim basis, so + * if we get any funny projectors then we just blow them all away. + */ + if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare)) + lower_tex_options.lower_txp |= 1 << tex->sampler_dim; + } + } + + /* nir_lower_tex must be run even if no options are set, because we need the + * LOD to be set for query_levels and for non-fragment shaders. + */ + NIR_PASS_V(s, nir_lower_tex, &lower_tex_options); +} + +static bool +nir_lower_primid_sysval_to_input_filter(const nir_instr *instr, const void *_data) +{ + return (instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_primitive_id); +} + +static nir_def * +nir_lower_primid_sysval_to_input_lower(nir_builder *b, nir_instr *instr, void *data) +{ + nir_variable *var = nir_get_variable_with_location(b->shader, nir_var_shader_in, + VARYING_SLOT_PRIMITIVE_ID, glsl_uint_type()); + + nir_io_semantics semantics = { + .location = var->data.location, + .num_slots = 1 + }; + return nir_load_input(b, 1, 32, nir_imm_int(b, 0), + .base = var->data.driver_location, + .io_semantics = semantics); +} + +static bool +nir_lower_primid_sysval_to_input(nir_shader *s) +{ + return nir_shader_lower_instructions(s, + nir_lower_primid_sysval_to_input_filter, + nir_lower_primid_sysval_to_input_lower, NULL); +} + +const void * +nir_to_rc(struct nir_shader *s, + struct pipe_screen *screen) +{ + static const struct nir_to_rc_options default_ntr_options = {0}; + return nir_to_rc_options(s, screen, &default_ntr_options); +} + +/* Prevent lower_vec_to_mov from coalescing 64-to-32 conversions and comparisons + * into unsupported channels of registers. + */ +static bool +ntr_vec_to_mov_writemask_cb(const nir_instr *instr, unsigned writemask, UNUSED const void *_data) +{ + if (instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *alu = nir_instr_as_alu(instr); + int dst_32 = alu->def.bit_size == 32; + int src_64 = nir_src_bit_size(alu->src[0].src) == 64; + + if (src_64 && dst_32) { + int num_srcs = nir_op_infos[alu->op].num_inputs; + + if (num_srcs == 2 || nir_op_infos[alu->op].output_type == nir_type_bool32) { + /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz + * instead of .xy. Just support scalar compares storing to .x, + * GLSL-to-TGSI only ever emitted scalar ops anyway. + */ + if (writemask != TGSI_WRITEMASK_X) + return false; + } else { + /* TGSI's 64-to-32-bit conversions can only store to .xy (since a TGSI + * register can only store a dvec2). Don't try to coalesce to write to + * .zw. 
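+ * (e.g. D2F consumes a dvec2 that spans all four 32-bit channels of its + * source but produces only two 32-bit values, which land in .xy.)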
+          */
+         if (writemask & ~(TGSI_WRITEMASK_XY))
+            return false;
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Translates the NIR shader to TGSI.
+ *
+ * This requires some lowering of the NIR shader to prepare it for translation.
+ * We take ownership of the NIR shader passed, returning a reference to the new
+ * TGSI tokens instead. If you need to keep the NIR, then pass us a clone.
+ */
+const void *nir_to_rc_options(struct nir_shader *s,
+                              struct pipe_screen *screen,
+                              const struct nir_to_rc_options *options)
+{
+   struct ntr_compile *c;
+   const void *tgsi_tokens;
+   nir_variable_mode no_indirects_mask = ntr_no_indirects_mask(s, screen);
+   bool native_integers = screen->get_shader_param(screen,
+                                                   pipe_shader_type_from_mesa(s->info.stage),
+                                                   PIPE_SHADER_CAP_INTEGERS);
+   const struct nir_shader_compiler_options *original_options = s->options;
+
+   ntr_fix_nir_options(screen, s, options);
+
+   /* Lower array indexing on FS inputs. Since we don't set
+    * ureg->supports_any_inout_decl_range, the TGSI input decls will be split to
+    * elements by ureg, and so dynamically indexing them would be invalid.
+    * Ideally we would set that ureg flag based on
+    * PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE, but can't due to mesa/st
+    * splitting NIR VS outputs to elements even if the FS doesn't get the
+    * corresponding splitting, and virgl depends on TGSI across link boundaries
+    * having matching declarations.
+    */
+   if (s->info.stage == MESA_SHADER_FRAGMENT) {
+      NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
+      NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_in, NULL);
+   }
+
+   NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
+              type_size, (nir_lower_io_options)0);
+
+   nir_to_rc_lower_txp(s);
+   NIR_PASS_V(s, nir_to_rc_lower_tex);
+
+   /* While TGSI can represent PRIMID as either an input or a system value,
+    * glsl-to-tgsi had the GS (not TCS or TES) primid as an input, and drivers
+    * depend on that.
+    */
+   if (s->info.stage == MESA_SHADER_GEOMETRY)
+      NIR_PASS_V(s, nir_lower_primid_sysval_to_input);
+
+   if (s->info.num_abos)
+      NIR_PASS_V(s, ntr_lower_atomic_pre_dec);
+
+   if (!original_options->lower_uniforms_to_ubo) {
+      NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
+                 screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
+                 !native_integers);
+   }
+
+   /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
+    * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
+    * duplication logic we just make it so that we only see vec2s.
+    */
+   NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
+   NIR_PASS_V(s, nir_to_rc_lower_64bit_to_vec2);
+
+   if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
+      NIR_PASS_V(s, nir_lower_ubo_vec4);
+
+   ntr_optimize_nir(s, screen, options);
+
+   NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);
+
+   /* Lower demote_if to if (cond) { demote } because TGSI doesn't have a DEMOTE_IF.
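+    * In NIR terms the rewrite is roughly (a sketch):
+    *
+    *    demote_if cond   ->   if (cond) { demote }
+    *
+    * which TGSI can then express with plain IF/DEMOTE/ENDIF.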
+    */
+   NIR_PASS_V(s, nir_lower_discard_if, nir_lower_demote_if_to_cf);
+
+   NIR_PASS_V(s, nir_lower_frexp);
+
+   bool progress;
+   do {
+      progress = false;
+      NIR_PASS(progress, s, nir_opt_algebraic_late);
+      if (progress) {
+         NIR_PASS_V(s, nir_copy_prop);
+         NIR_PASS_V(s, nir_opt_dce);
+         NIR_PASS_V(s, nir_opt_cse);
+      }
+   } while (progress);
+
+   NIR_PASS_V(s, nir_opt_combine_barriers, NULL, NULL);
+
+   if (screen->get_shader_param(screen,
+                                pipe_shader_type_from_mesa(s->info.stage),
+                                PIPE_SHADER_CAP_INTEGERS)) {
+      NIR_PASS_V(s, nir_lower_bool_to_int32);
+   } else {
+      NIR_PASS_V(s, nir_lower_int_to_float);
+      NIR_PASS_V(s, nir_lower_bool_to_float,
+                 !options->lower_cmp && !options->lower_fabs);
+      /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
+      NIR_PASS_V(s, nir_copy_prop);
+      NIR_PASS_V(s, nir_opt_dce);
+   }
+
+   nir_move_options move_all =
+      nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
+      nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
+
+   NIR_PASS_V(s, nir_opt_move, move_all);
+
+   NIR_PASS_V(s, nir_convert_from_ssa, true);
+   NIR_PASS_V(s, nir_lower_vec_to_regs, ntr_vec_to_mov_writemask_cb, NULL);
+
+   /* locals_to_reg_intrinsics will leave dead derefs that are good to clean up.
+    */
+   NIR_PASS_V(s, nir_lower_locals_to_regs, 32);
+   NIR_PASS_V(s, nir_opt_dce);
+
+   /* See comment in ntr_get_alu_src for supported modifiers */
+   NIR_PASS_V(s, nir_legacy_trivialize, !options->lower_fabs);
+
+   if (NIR_DEBUG(TGSI)) {
+      fprintf(stderr, "NIR before translation to TGSI:\n");
+      nir_print_shader(s, stderr);
+   }
+
+   c = rzalloc(NULL, struct ntr_compile);
+   c->screen = screen;
+   c->options = options;
+
+   c->needs_texcoord_semantic =
+      screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
+   c->has_txf_lz =
+      screen->get_param(screen, PIPE_CAP_TGSI_TEX_TXF_LZ);
+
+   c->s = s;
+   c->native_integers = native_integers;
+   c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
+   ureg_setup_shader_info(c->ureg, &s->info);
+   if (s->info.use_legacy_math_rules && screen->get_param(screen, PIPE_CAP_LEGACY_MATH_RULES))
+      ureg_property(c->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1);
+
+   if (s->info.stage == MESA_SHADER_FRAGMENT) {
+      /* The draw module's polygon stipple layer doesn't respect the chosen
+       * coordinate mode, so leave it as unspecified unless we're actually
+       * reading the position in the shader already. See
+       * gl-2.1-polygon-stipple-fs on softpipe.
+       */
+      if ((s->info.inputs_read & VARYING_BIT_POS) ||
+          BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
+         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
+                       s->info.fs.origin_upper_left ?
+                       TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
+                       TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
+
+         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
+                       s->info.fs.pixel_center_integer ?
+                       TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
+                       TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
+      }
+   }
+   /* Emit the main function */
+   nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
+   ntr_emit_impl(c, impl);
+   ureg_END(c->ureg);
+
+   tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
+
+   if (NIR_DEBUG(TGSI)) {
+      fprintf(stderr, "TGSI after translation from NIR:\n");
+      tgsi_dump(tgsi_tokens, 0);
+   }
+
+   ureg_destroy(c->ureg);
+
+   ralloc_free(c);
+   ralloc_free(s);
+
+   return tgsi_tokens;
+}
+
+static const nir_shader_compiler_options nir_to_rc_compiler_options = {
+   .fdot_replicates = true,
+   .fuse_ffma32 = true,
+   .fuse_ffma64 = true,
+   .lower_extract_byte = true,
+   .lower_extract_word = true,
+   .lower_insert_byte = true,
+   .lower_insert_word = true,
+   .lower_fdph = true,
+   .lower_flrp64 = true,
+   .lower_fmod = true,
+   .lower_rotate = true,
+   .lower_uniforms_to_ubo = true,
+   .lower_uadd_carry = true,
+   .lower_usub_borrow = true,
+   .lower_uadd_sat = true,
+   .lower_usub_sat = true,
+   .lower_vector_cmp = true,
+   .lower_int64_options = nir_lower_imul_2x32_64,
+   .use_interpolated_input_intrinsics = true,
+
+   /* TGSI doesn't have a semantic for local or global index, just local and
+    * workgroup id.
+    */
+   .lower_cs_local_index_to_id = true,
+};
+
+/* Returns the default compiler options for drivers with only nir-to-rc-based
+ * NIR support.
+ */
+const void *
+nir_to_rc_get_compiler_options(struct pipe_screen *pscreen,
+                               enum pipe_shader_ir ir,
+                               unsigned shader)
+{
+   assert(ir == PIPE_SHADER_IR_NIR);
+   return &nir_to_rc_compiler_options;
+}
+
+/** Helper for getting TGSI tokens to store for a pipe_shader_state CSO. */
+const void *
+r300_pipe_shader_state_to_tgsi_tokens(struct pipe_screen *screen,
+                                      const struct pipe_shader_state *cso)
+{
+   if (cso->type == PIPE_SHADER_IR_NIR) {
+      return nir_to_rc((nir_shader *)cso->ir.nir, screen);
+   } else {
+      assert(cso->type == PIPE_SHADER_IR_TGSI);
+      /* we need to keep a local copy of the tokens */
+      return tgsi_dup_tokens(cso->tokens);
+   }
+}
diff --git a/src/gallium/drivers/r300/compiler/nir_to_rc.h b/src/gallium/drivers/r300/compiler/nir_to_rc.h
new file mode 100644
index 00000000000..0df5cd9a74e
--- /dev/null
+++ b/src/gallium/drivers/r300/compiler/nir_to_rc.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef NIR_TO_RC_H
+#define NIR_TO_RC_H
+
+#include <stdbool.h>
+#include "pipe/p_defines.h"
+
+struct nir_shader;
+struct pipe_screen;
+struct pipe_shader_state;
+
+struct nir_to_rc_options {
+   bool lower_cmp;
+   /* Emit MAX(a,-a) instead of the abs src modifier. */
+   bool lower_fabs;
+   bool unoptimized_ra;
+   bool lower_ssbo_bindings;
+   uint32_t ubo_vec4_max;
+};
+
+const void *nir_to_rc(struct nir_shader *s,
+                      struct pipe_screen *screen);
+
+const void *nir_to_rc_options(struct nir_shader *s,
+                              struct pipe_screen *screen,
+                              const struct nir_to_rc_options *ntr_options);
+
+const void *
+nir_to_rc_get_compiler_options(struct pipe_screen *pscreen,
+                               enum pipe_shader_ir ir,
+                               unsigned shader);
+
+const void *
+r300_pipe_shader_state_to_tgsi_tokens(struct pipe_screen *screen,
+                                      const struct pipe_shader_state *cso);
+
+#endif /* NIR_TO_RC_H */
diff --git a/src/gallium/drivers/r300/meson.build b/src/gallium/drivers/r300/meson.build
index 136f7d2d93f..ec675f4ade8 100644
--- a/src/gallium/drivers/r300/meson.build
+++ b/src/gallium/drivers/r300/meson.build
@@ -62,6 +62,8 @@ files_r300 = files(
   'r300_vs.h',
   'compiler/memory_pool.c',
   'compiler/memory_pool.h',
+  'compiler/nir_to_rc.c',
+  'compiler/nir_to_rc.h',
   'compiler/r300_fragprog.c',
   'compiler/r300_fragprog_emit.c',
   'compiler/r300_fragprog.h',
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index 0bc86c584e5..a0177280444 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -47,7 +47,7 @@
 #include "r300_texture.h"
 #include "r300_vs.h"
 #include "compiler/r300_nir.h"
-#include "nir/nir_to_tgsi.h"
+#include "compiler/nir_to_rc.h"
 
 /* r300_state: Functions used to initialize state context by translating
  * Gallium state objects into semi-native r300 state objects.
  */
@@ -1047,7 +1047,7 @@ static void* r300_create_fs_state(struct pipe_context* pipe,
     if (fs->state.type == PIPE_SHADER_IR_NIR) {
         if (r300->screen->caps.is_r500)
             NIR_PASS_V(shader->ir.nir, r300_transform_fs_trig_input);
-        fs->state.tokens = nir_to_tgsi(shader->ir.nir, pipe->screen);
+        fs->state.tokens = nir_to_rc(shader->ir.nir, pipe->screen);
     } else {
         assert(fs->state.type == PIPE_SHADER_IR_TGSI);
         /* we need to keep a local copy of the tokens */
@@ -1945,21 +1945,21 @@ static void* r300_create_vs_state(struct pipe_context* pipe,
     vs->state = *shader;
 
     if (vs->state.type == PIPE_SHADER_IR_NIR) {
-        static const struct nir_to_tgsi_options swtcl_options = {0};
-        static const struct nir_to_tgsi_options hwtcl_r300_options = {
+        static const struct nir_to_rc_options swtcl_options = {0};
+        static const struct nir_to_rc_options hwtcl_r300_options = {
            .lower_cmp = true,
            .lower_fabs = true,
            .ubo_vec4_max = 0x00ff,
            .unoptimized_ra = true,
        };
-        static const struct nir_to_tgsi_options hwtcl_r500_options = {
+        static const struct nir_to_rc_options hwtcl_r500_options = {
            .ubo_vec4_max = 0x00ff,
            .unoptimized_ra = true,
        };
-        const struct nir_to_tgsi_options *ntt_options;
+        const struct nir_to_rc_options *ntr_options;
 
         if (r300->screen->caps.has_tcl) {
             if (r300->screen->caps.is_r500) {
-                ntt_options = &hwtcl_r500_options;
+                ntr_options = &hwtcl_r500_options;
                 /* Only nine should set both NTT shader name and
                  * use_legacy_math_rules and D3D9 already mandates
@@ -1972,12 +1972,12 @@
                 }
             } else
-                ntt_options = &hwtcl_r300_options;
+                ntr_options = &hwtcl_r300_options;
         } else {
-            ntt_options = &swtcl_options;
+            ntr_options = &swtcl_options;
         }
 
-        vs->state.tokens = nir_to_tgsi_options(shader->ir.nir, pipe->screen,
-                                               ntt_options);
+        vs->state.tokens = nir_to_rc_options(shader->ir.nir, pipe->screen,
+                                             ntr_options);
     } else {
         assert(vs->state.type == PIPE_SHADER_IR_TGSI);
         /* we need to keep a local copy of the tokens */
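For reference, a minimal sketch of how a driver consumes the new entry points
(editor's illustration, not part of the patch: translate_swtcl_vs() is a
hypothetical helper, and the zeroed options mirror swtcl_options in
r300_create_vs_state above). Per the nir_to_rc_options() doc comment, the
translator takes ownership of the nir_shader, so pass a clone if the NIR is
still needed afterwards:

   #include "compiler/nir_to_rc.h"

   static const void *
   translate_swtcl_vs(struct pipe_screen *screen, struct nir_shader *nir)
   {
      /* Default (zeroed) options, as used for the swtcl path. */
      static const struct nir_to_rc_options opts = {0};

      /* Consumes `nir` and returns heap-allocated TGSI tokens. */
      return nir_to_rc_options(nir, screen, &opts);
   }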