diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h index 0bbbd66662b..ef81f69740c 100644 --- a/src/gallium/drivers/i915/i915_context.h +++ b/src/gallium/drivers/i915/i915_context.h @@ -88,8 +88,15 @@ struct i915_winsys_batchbuffer; #define I915_MAX_CONSTANT 32 -/** See constant_flags[] below */ -#define I915_CONSTFLAG_USER 0x1f +/** + * Per-channel flags for constant_flags[]. + * Bits 0-3: channel has a compiler immediate. + * Bits 4-7: channel has a user (UBO) value uploaded at draw time. + * A channel is available when neither bit is set. + */ +#define I915_CONSTFLAG_IMM(ch) (1 << (ch)) /* immediate bit for channel ch (0-3) */ +#define I915_CONSTFLAG_USER_CH(ch) (1 << ((ch) + 4)) /* user-value bit for channel ch (0-3) */ +#define I915_CONSTFLAG_USER 0xf0 /* all four user-channel bits (4-7) set; the old value was 0x1f */ /** * Subclass of pipe_shader_state @@ -103,6 +110,10 @@ struct i915_fragment_shader { uint32_t *program; uint32_t program_len; + uint32_t nr_alu_insn; + uint32_t nr_tex_insn; + uint32_t nr_tex_indirect; + uint32_t nr_temps; /** * constants introduced during translation. @@ -134,12 +145,15 @@ struct i915_fragment_shader { } texcoords[I915_TEX_UNITS]; bool reads_pntc; + bool writes_z; + + unsigned num_inputs; + uint8_t input_semantic_name[PIPE_MAX_SHADER_INPUTS]; + uint8_t input_semantic_index[PIPE_MAX_SHADER_INPUTS]; - /* Set if the shader is an internal (blit, etc.) shader that shouldn't debug - * log by default. 
*/ bool internal; - char *error; /* Any error message from compiling this shader (or NULL) */ + char *error; }; struct i915_cache_context; diff --git a/src/gallium/drivers/i915/i915_fpc.h b/src/gallium/drivers/i915/i915_fpc.h index d234042dea2..9e3e4b8ee63 100644 --- a/src/gallium/drivers/i915/i915_fpc.h +++ b/src/gallium/drivers/i915/i915_fpc.h @@ -136,6 +136,15 @@ swizzle(int reg, uint32_t x, uint32_t y, uint32_t z, uint32_t w) CHANNEL_SRC(GET_CHANNEL_SRC(reg, w), 3)); } +static inline int +negate(int reg, int x, int y, int z, int w) +{ /* XORs the per-channel negate bit into reg for each of x/y/z/w that is 1, so negating an already negated channel clears it; the callers in view pass only 0 or 1 */ + return reg ^ (x << UREG_CHANNEL_X_NEGATE_SHIFT | + y << UREG_CHANNEL_Y_NEGATE_SHIFT | + z << UREG_CHANNEL_Z_NEGATE_SHIFT | + w << UREG_CHANNEL_W_NEGATE_SHIFT); +} + #define A0_DEST(reg) (((reg)&UREG_TYPE_NR_MASK) >> UREG_A0_DEST_SHIFT_LEFT) #define D0_DEST(reg) (((reg)&UREG_TYPE_NR_MASK) >> UREG_A0_DEST_SHIFT_LEFT) #define T0_DEST(reg) (((reg)&UREG_TYPE_NR_MASK) >> UREG_A0_DEST_SHIFT_LEFT) @@ -173,8 +182,21 @@ swizzle(int reg, uint32_t x, uint32_t y, uint32_t z, uint32_t w) */ extern void i915_translate_fragment_program(struct i915_context *i915, struct i915_fragment_shader *fs); +struct corm_compile_opts { + bool deferred_const; /* emit_load_const(): postpone placing scalar constants other than 0, 1 and -1 until an ALU instruction uses them */ + bool seq_sne_opt; /* emit_alu(): use the two-instruction -abs(x) form for seq/sne when one operand is the zero swizzle */ + bool late_scalar; /* NOTE(review): not referenced by the code visible in this patch excerpt - confirm its use */ +}; + +extern void i915_translate_fragment_program_nir(struct i915_context *i915, + struct i915_fragment_shader *ifs, + struct nir_shader *s, + const struct corm_compile_opts *opts); +extern void i915_use_passthrough_shader(struct i915_fragment_shader *fs); +extern void i915_program_error(struct i915_fp_compile *p, const char *msg, ...); extern uint32_t i915_get_temp(struct i915_fp_compile *p); +extern void i915_release_temp(struct i915_fp_compile *p, int reg); /* clears bit reg in p->temp_flag; exported because i915_fpc_emit.c drops its static qualifier */ extern uint32_t i915_get_utemp(struct i915_fp_compile *p); extern void i915_release_utemps(struct i915_fp_compile *p); @@ -191,6 +213,8 @@ extern uint32_t i915_emit_decl(struct i915_fp_compile *p, uint32_t type, uint32_t nr, uint32_t d0_flags); extern uint32_t i915_emit_const1f(struct i915_fp_compile *p, 
float c0); +extern uint32_t i915_emit_const1f_prefer(struct i915_fp_compile *p, float c0, + int preferred_reg); extern uint32_t i915_emit_const2f(struct i915_fp_compile *p, float c0, float c1); diff --git a/src/gallium/drivers/i915/i915_fpc_emit.c b/src/gallium/drivers/i915/i915_fpc_emit.c index 603c79e089f..aeace4396ca 100644 --- a/src/gallium/drivers/i915/i915_fpc_emit.c +++ b/src/gallium/drivers/i915/i915_fpc_emit.c @@ -25,11 +25,45 @@ * **************************************************************************/ +#include + +#include "util/ralloc.h" #include "util/u_math.h" +#include "util/u_memory.h" #include "i915_context.h" #include "i915_fpc.h" #include "i915_reg.h" +void +i915_program_error(struct i915_fp_compile *p, const char *msg, ...) +{ + va_list args; + va_start(args, msg); + ralloc_vasprintf_append(&p->error, msg, args); + va_end(args); +} + +static const unsigned passthrough_program[] = { + _3DSTATE_PIXEL_SHADER_PROGRAM | ((1 * 3) - 1), + (A0_MOV | (REG_TYPE_OC << A0_DEST_TYPE_SHIFT) | A0_DEST_CHANNEL_ALL | + (REG_TYPE_R << A0_SRC0_TYPE_SHIFT) | (0 << A0_SRC0_NR_SHIFT)), + ((SRC_ONE << A1_SRC0_CHANNEL_X_SHIFT) | + (SRC_ZERO << A1_SRC0_CHANNEL_Y_SHIFT) | + (SRC_ZERO << A1_SRC0_CHANNEL_Z_SHIFT) | + (SRC_ONE << A1_SRC0_CHANNEL_W_SHIFT)), + 0}; + +void +i915_use_passthrough_shader(struct i915_fragment_shader *fs) +{ + fs->program = (uint32_t *)MALLOC(sizeof(passthrough_program)); + if (fs->program) { + memcpy(fs->program, passthrough_program, sizeof(passthrough_program)); + fs->program_len = ARRAY_SIZE(passthrough_program); + } + fs->num_constants = 0; +} + uint32_t i915_get_temp(struct i915_fp_compile *p) { @@ -43,7 +77,7 @@ i915_get_temp(struct i915_fp_compile *p) return bit - 1; } -static void +void i915_release_temp(struct i915_fp_compile *p, int reg) { p->temp_flag &= ~(1 << reg); @@ -179,8 +213,6 @@ i915_emit_texld(struct i915_fp_compile *p, uint32_t dest, uint32_t destmask, { const uint32_t k = UREG(GET_UREG_TYPE(coord), GET_UREG_NR(coord)); - 
int temp = -1; - uint32_t coord_used = 0xf << UREG_CHANNEL_X_SHIFT; if (coord_mask & TGSI_WRITEMASK_Y) coord_used |= 0xf << UREG_CHANNEL_Y_SHIFT; @@ -191,13 +223,10 @@ i915_emit_texld(struct i915_fp_compile *p, uint32_t dest, uint32_t destmask, if ((coord & coord_used) != (k & coord_used) || GET_UREG_TYPE(coord) == REG_TYPE_CONST) { - /* texcoord is swizzled or negated. Need to allocate a new temporary - * register (a utemp / unpreserved temp) won't do. + /* texcoord is swizzled or negated. Need a temporary to hold it. + * Use a utemp so it doesn't create a tex indirect phase boundary. */ - uint32_t tempReg; - - temp = i915_get_temp(p); /* get temp reg index */ - tempReg = UREG(REG_TYPE_R, temp); /* make i915 register */ + uint32_t tempReg = i915_get_utemp(p); /* the returned value is used directly as the MOV destination (no UREG() wrapping); this function no longer releases it, since the i915_release_temp() call is removed further down */ i915_emit_arith(p, A0_MOV, tempReg, A0_DEST_CHANNEL_ALL, /* dest reg, writemask */ @@ -227,11 +256,21 @@ i915_emit_texld(struct i915_fp_compile *p, uint32_t dest, uint32_t destmask, p->nr_tex_indirect++; /* Reading from an r# register whose contents depend on output of the - * current phase defines a phase boundary. + * current phase defines a phase boundary. Prefer just bumping the + * phase count (free), but if we'd exceed the HW limit, copy to a + * utemp instead (costs 1 ALU instruction). 
*/ if (GET_UREG_TYPE(coord) == REG_TYPE_R && - p->register_phases[GET_UREG_NR(coord)] == p->nr_tex_indirect) - p->nr_tex_indirect++; + p->register_phases[GET_UREG_NR(coord)] == p->nr_tex_indirect) { + if (p->nr_tex_indirect + 1 < I915_MAX_TEX_INDIRECT) { + p->nr_tex_indirect++; + } else { + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_ALL, 0, + coord, 0, 0); + coord = tmp; /* every later use of coord in this function now reads the utemp copy */ + } + } if (p->csr < p->program + I915_PROGRAM_SIZE) { *(p->csr++) = (opcode | T0_DEST(dest) | T0_SAMPLER(sampler)); @@ -246,40 +285,75 @@ i915_emit_texld(struct i915_fp_compile *p, uint32_t dest, uint32_t destmask, p->nr_tex_insn++; } - if (temp >= 0) - i915_release_temp(p, temp); - return dest; } +static uint32_t +i915_try_const1f_in_reg(struct i915_fp_compile *p, float c0, unsigned reg) +{ /* Tries to place scalar c0 in one channel of constant register reg; returns a swizzled CONST ureg, or UREG_BAD when no channel can take it */ + struct i915_fragment_shader *ifs = p->shader; + + for (unsigned idx = 0; idx < 4; idx++) { + if (ifs->constant_flags[reg] & I915_CONSTFLAG_USER_CH(idx)) + continue; /* channel is flagged as holding a user value */ + if (!(ifs->constant_flags[reg] & I915_CONSTFLAG_IMM(idx)) || + ifs->constants[reg][idx] == c0) { /* channel has no immediate yet, or its stored value already equals c0 */ + ifs->constants[reg][idx] = c0; + ifs->constant_flags[reg] |= I915_CONSTFLAG_IMM(idx); + if (reg + 1 > ifs->num_constants) + ifs->num_constants = reg + 1; + return swizzle(UREG(REG_TYPE_CONST, reg), idx, ZERO, ZERO, ONE); /* channel idx is routed to X; the other channels read ZERO, ZERO, ONE */ + } + } + return UREG_BAD; +} + +static uint32_t +i915_try_emit_const1f(struct i915_fp_compile *p, float c0, int preferred_reg) +{ /* Tries preferred_reg first when it is >= 0, then registers 0..I915_MAX_CONSTANT-1 in order; on failure records a program error and returns 0. NOTE(review): preferred_reg is not range-checked against I915_MAX_CONSTANT here */ + if (preferred_reg >= 0) { + uint32_t r = i915_try_const1f_in_reg(p, c0, preferred_reg); + if (r != UREG_BAD) + return r; + } + + for (unsigned reg = 0; reg < I915_MAX_CONSTANT; reg++) { + uint32_t r = i915_try_const1f_in_reg(p, c0, reg); + if (r != UREG_BAD) + return r; + } + + i915_program_error(p, "i915_emit_const1f: out of constants"); + return 0; +} + uint32_t i915_emit_const1f(struct i915_fp_compile *p, float c0) { - struct i915_fragment_shader *ifs = p->shader; - unsigned reg, idx; - if (c0 == 0.0) return swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, 
ZERO); if (c0 == 1.0) return swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE); + if (c0 == -1.0) + return negate(swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE), + 1, 1, 1, 1); /* -1.0 takes no constant slot: the ONE swizzle with all four channels negated */ - for (reg = 0; reg < I915_MAX_CONSTANT; reg++) { - if (ifs->constant_flags[reg] == I915_CONSTFLAG_USER) - continue; - for (idx = 0; idx < 4; idx++) { - if (!(ifs->constant_flags[reg] & (1 << idx)) || - ifs->constants[reg][idx] == c0) { - ifs->constants[reg][idx] = c0; - ifs->constant_flags[reg] |= 1 << idx; - if (reg + 1 > ifs->num_constants) - ifs->num_constants = reg + 1; - return swizzle(UREG(REG_TYPE_CONST, reg), idx, ZERO, ZERO, ONE); - } - } - } + return i915_try_emit_const1f(p, c0, -1); +} - i915_program_error(p, "i915_emit_const1f: out of constants"); - return 0; +uint32_t +i915_emit_const1f_prefer(struct i915_fp_compile *p, float c0, + int preferred_reg) +{ /* Same 0 / 1 / -1 shortcuts as i915_emit_const1f(); any other value is placed with preferred_reg tried first (a negative preferred_reg means no preference) */ + if (c0 == 0.0) + return swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO); + if (c0 == 1.0) + return swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE); + if (c0 == -1.0) + return negate(swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE), + 1, 1, 1, 1); + + return i915_try_emit_const1f(p, c0, preferred_reg); } uint32_t @@ -301,14 +375,15 @@ i915_emit_const2f(struct i915_fp_compile *p, float c0, float c1) // XXX emit swizzle here for 0, 1, -1 and any combination thereof // we can use swizzle + neg for that for (reg = 0; reg < I915_MAX_CONSTANT; reg++) { - if (ifs->constant_flags[reg] == 0xf || - ifs->constant_flags[reg] == I915_CONSTFLAG_USER) + uint8_t occupied = (ifs->constant_flags[reg] & 0xf) | + (ifs->constant_flags[reg] >> 4); /* per-channel OR of the immediate bits (0-3) and the user bits (4-7) */ + if (occupied == 0xf) continue; for (idx = 0; idx < 3; idx++) { - if (!(ifs->constant_flags[reg] & (3 << idx))) { + if (!(occupied & (3 << idx))) { ifs->constants[reg][idx + 0] = c0; ifs->constants[reg][idx + 1] = c1; - ifs->constant_flags[reg] |= 3 << idx; + ifs->constant_flags[reg] |= (3 << idx); /* immediate bits */ if (reg + 1 > ifs->num_constants) ifs->num_constants = reg + 1; return 
swizzle(UREG(REG_TYPE_CONST, reg), idx, idx + 1, ZERO, ONE); @@ -330,9 +405,9 @@ i915_emit_const4f(struct i915_fp_compile *p, float c0, float c1, float c2, // XXX emit swizzle here for 0, 1, -1 and any combination thereof // we can use swizzle + neg for that for (reg = 0; reg < I915_MAX_CONSTANT; reg++) { - if (ifs->constant_flags[reg] == 0xf && ifs->constants[reg][0] == c0 && - ifs->constants[reg][1] == c1 && ifs->constants[reg][2] == c2 && - ifs->constants[reg][3] == c3) { + if ((ifs->constant_flags[reg] & 0x0f) == 0x0f && + ifs->constants[reg][0] == c0 && ifs->constants[reg][1] == c1 && + ifs->constants[reg][2] == c2 && ifs->constants[reg][3] == c3) { /* reuse a register whose four immediate bits are set and whose stored values all match */ return UREG(REG_TYPE_CONST, reg); } else if (ifs->constant_flags[reg] == 0) { @@ -340,7 +415,7 @@ i915_emit_const4f(struct i915_fp_compile *p, float c0, float c1, float c2, ifs->constants[reg][1] = c1; ifs->constants[reg][2] = c2; ifs->constants[reg][3] = c3; - ifs->constant_flags[reg] = 0xf; + ifs->constant_flags[reg] = 0x0f; /* immediate bits for all four channels, no user bits */ if (reg + 1 > ifs->num_constants) ifs->num_constants = reg + 1; return UREG(REG_TYPE_CONST, reg); diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c new file mode 100644 index 00000000000..d1835800413 --- /dev/null +++ b/src/gallium/drivers/i915/i915_fpc_nir.c @@ -0,0 +1,1310 @@ +/* + * Copyright 2025 Red Hat, Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#include "compiler/nir/nir.h" +#include "tgsi/tgsi_from_mesa.h" +#include "util/log.h" +#include "util/ralloc.h" +#include "util/u_memory.h" + +#include "i915_context.h" +#include "i915_debug.h" +#include "i915_debug_private.h" +#include "i915_fpc.h" +#include "i915_reg.h" + +struct nir_to_i915 { + struct corm_compile_opts opts; + struct i915_fp_compile *p; + struct i915_fragment_shader *ifs; + + uint32_t *ureg_map; /* indexed by SSA def index: ureg holding the def; UREG_BAD while a scalar constant is still deferred */ + uint32_t **def_csr; /* indexed by SSA def index: start of the instruction that produced the def, recorded when exactly one instruction was emitted for it */ + float *deferred_const; /* indexed by SSA def index: value of a deferred scalar constant */ + unsigned ureg_map_size; /* bound used when indexing ureg_map and last_use */ + + int *last_use; /* indexed by SSA def index: ip of the last instruction that reads the def */ + int ip; /* running instruction counter */ +}; + +static bool +mark_last_use_cb(nir_src *src, void *state) +{ /* nir_foreach_src callback: stores the current ip for the source's def; later readers overwrite earlier ones */ + struct nir_to_i915 *c = state; + if (src->ssa->index < c->ureg_map_size) + c->last_use[src->ssa->index] = c->ip; + return true; +} + +static void +compute_last_use(struct nir_to_i915 *c, nir_function_impl *impl) +{ /* walks every instruction of impl in order, numbering them from 0 and filling c->last_use */ + c->ip = 0; + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + nir_foreach_src(instr, mark_last_use_cb, c); + c->ip++; + } + } +} + +static bool +release_if_last_use_cb(nir_src *src, void *state) +{ /* nir_foreach_src callback: frees the R temp behind a source when c->ip is its recorded last use */ + struct nir_to_i915 *c = state; + unsigned idx = src->ssa->index; + if (idx < c->ureg_map_size && c->last_use[idx] == c->ip) { + uint32_t ureg = c->ureg_map[idx]; + if (GET_UREG_TYPE(ureg) == REG_TYPE_R) + i915_release_temp(c->p, GET_UREG_NR(ureg)); + } + return true; +} + +static void +release_dead_temps(struct nir_to_i915 *c, nir_instr *instr) +{ + nir_foreach_src(instr, release_if_last_use_cb, c); +} + +static void +set_ureg(struct nir_to_i915 *c, nir_def *def, uint32_t ureg) +{ /* records ureg as the location of def */ + assert(def->index < c->ureg_map_size); + c->ureg_map[def->index] = ureg; +} + +static bool +is_deferred(struct nir_to_i915 *c, unsigned ssa_index) +{ /* true while the def is marked UREG_BAD; ssa_index is not bounds-checked here */ + return c->ureg_map[ssa_index] == UREG_BAD; +} + +static uint32_t +resolve_const(struct nir_to_i915 *c, unsigned ssa_index, int preferred_reg) +{ /* places the deferred scalar constant, trying preferred_reg first, and records the resulting ureg */ + uint32_t ureg = i915_emit_const1f_prefer(c->p, + c->deferred_const[ssa_index], + preferred_reg); + c->ureg_map[ssa_index] = ureg; + return 
ureg; +} + +static uint32_t +src_ureg(struct nir_to_i915 *c, nir_src *src) +{ /* returns the ureg backing src, first placing its constant if it is still deferred */ + assert(src->ssa->index < c->ureg_map_size); + if (c->ureg_map[src->ssa->index] == UREG_BAD) + resolve_const(c, src->ssa->index, -1); /* -1: no preferred constant register */ + return c->ureg_map[src->ssa->index]; +} + +static uint32_t +alu_src_ureg(struct nir_to_i915 *c, nir_alu_src *src) +{ /* applies the NIR source swizzle on top of the backing ureg; all four swizzle entries are read regardless of component count */ + uint32_t ureg = src_ureg(c, &src->src); + return swizzle(ureg, + src->swizzle[0], src->swizzle[1], + src->swizzle[2], src->swizzle[3]); +} + +static uint32_t +def_mask(nir_def *def) +{ /* destination channel mask covering the first num_components channels (at most XYZW) */ + uint32_t mask = 0; + if (def->num_components >= 1) mask |= A0_DEST_CHANNEL_X; + if (def->num_components >= 2) mask |= A0_DEST_CHANNEL_Y; + if (def->num_components >= 3) mask |= A0_DEST_CHANNEL_Z; + if (def->num_components >= 4) mask |= A0_DEST_CHANNEL_W; + return mask; +} + +static uint32_t +writemask_to_mask(unsigned wm) +{ /* converts a 4-bit write mask (bit 0 = X ... bit 3 = W) to A0_DEST_CHANNEL_* bits */ + uint32_t mask = 0; + if (wm & 1) mask |= A0_DEST_CHANNEL_X; + if (wm & 2) mask |= A0_DEST_CHANNEL_Y; + if (wm & 4) mask |= A0_DEST_CHANNEL_Z; + if (wm & 8) mask |= A0_DEST_CHANNEL_W; + return mask; +} + +static uint32_t +get_texcoord_mapping(struct i915_fragment_shader *fs, + unsigned semantic, int index) +{ /* returns the texcoords[] slot for (semantic, index), claiming the first slot whose semantic is -1 when the pair is not found before it */ + for (int i = 0; i < I915_TEX_UNITS; i++) { + if (fs->texcoords[i].semantic == -1) { + fs->texcoords[i].semantic = semantic; + fs->texcoords[i].index = index; + return i; + } + if (fs->texcoords[i].semantic == (int)semantic && + fs->texcoords[i].index == index) + return i; + } + return 0; /* NOTE(review): every slot is taken by another input - slot 0 is returned without reporting an error */ +} + +static uint32_t +emit_input(struct nir_to_i915 *c, unsigned location) +{ + struct i915_fp_compile *p = c->p; + struct i915_fragment_shader *ifs = c->ifs; + unsigned sem_name, sem_index; + + tgsi_get_gl_varying_semantic((gl_varying_slot)location, true, + &sem_name, &sem_index); + + switch (sem_name) { + case TGSI_SEMANTIC_GENERIC: + case TGSI_SEMANTIC_TEXCOORD: + case TGSI_SEMANTIC_PCOORD: + case TGSI_SEMANTIC_POSITION: { + if (sem_name == TGSI_SEMANTIC_PCOORD) + ifs->reads_pntc = true; + int tc = get_texcoord_mapping(ifs, sem_name, 
sem_index); + return i915_emit_decl(p, REG_TYPE_T, T_TEX0 + tc, D0_CHANNEL_ALL); + } + case TGSI_SEMANTIC_COLOR: + if (sem_index == 0) { + return i915_emit_decl(p, REG_TYPE_T, T_DIFFUSE, D0_CHANNEL_ALL); + } else { + return swizzle( + i915_emit_decl(p, REG_TYPE_T, T_SPECULAR, D0_CHANNEL_XYZ), + X, Y, Z, ONE); /* only XYZ is declared for the secondary colour; W reads the ONE swizzle */ + } + case TGSI_SEMANTIC_FOG: + return swizzle( + i915_emit_decl(p, REG_TYPE_T, T_FOG_W, D0_CHANNEL_W), + W, W, W, W); /* only the W channel is declared, so it is replicated to every channel */ + case TGSI_SEMANTIC_FACE: { + int tc = get_texcoord_mapping(ifs, sem_name, sem_index); + return i915_emit_decl(p, REG_TYPE_T, T_TEX0 + tc, D0_CHANNEL_X); + } + default: + i915_program_error(p, "Bad input location %d (semantic %d)", + location, sem_name); + return 0; + } +} + +static void +emit_load_const(struct nir_to_i915 *c, nir_load_const_instr *load) +{ /* maps a load_const to a ureg: swizzle constants for 0/1/-1, a constant-register slot otherwise, or a deferred marker for scalars */ + struct i915_fp_compile *p = c->p; + + switch (load->def.num_components) { + case 1: { + float val = load->value[0].f32; + if (c->opts.deferred_const && + val != 0.0f && val != 1.0f && val != -1.0f) { + c->deferred_const[load->def.index] = val; + set_ureg(c, &load->def, UREG_BAD); /* UREG_BAD marks the def as deferred; the slot is chosen later by resolve_const() */ + } else { + set_ureg(c, &load->def, i915_emit_const1f(p, val)); + } + break; + } + case 2: + set_ureg(c, &load->def, + i915_emit_const2f(p, load->value[0].f32, + load->value[1].f32)); + break; + case 3: + case 4: { + unsigned n = load->def.num_components; + float v[4] = { + load->value[0].f32, + n > 1 ? load->value[1].f32 : 0.0f, + n > 2 ? load->value[2].f32 : 0.0f, + n > 3 ? 
load->value[3].f32 : 0.0f, + }; + + uint32_t ch[4] = { X, Y, Z, W }; + int ng[4] = { 0, 0, 0, 0 }; + bool all_swizzle = true; + for (unsigned i = 0; i < n; i++) { + if (v[i] == 0.0f) + ch[i] = ZERO; + else if (v[i] == 1.0f) + ch[i] = ONE; + else if (v[i] == -1.0f) { + ch[i] = ONE; + ng[i] = 1; + } else { + all_swizzle = false; + break; + } + } + + if (all_swizzle) { + set_ureg(c, &load->def, + negate(swizzle(UREG(REG_TYPE_R, 0), + ch[0], ch[1], ch[2], ch[3]), + ng[0], ng[1], ng[2], ng[3])); + } else { + set_ureg(c, &load->def, i915_emit_const4fv(p, v)); + } + break; + } + default: + i915_program_error(p, "load_const with %d components", + load->def.num_components); + break; + } +} + +static void +coalesce_constants(struct nir_to_i915 *c, nir_alu_instr *alu) +{ /* Places every still-deferred scalar constant read by alu, steering them toward one constant register: the first CONST register already used by another source, else the register the first placed constant lands in */ + unsigned n = nir_op_infos[alu->op].num_inputs; + unsigned deferred[NIR_MAX_VEC_COMPONENTS]; /* one slot per possible ALU source: emit_alu() calls this for every op, and vec4 alone has four sources, which overflowed the former deferred[3] */ + unsigned nr_deferred = 0; + int preferred = -1; + + for (unsigned i = 0; i < n; i++) { + unsigned idx = alu->src[i].src.ssa->index; + if (is_deferred(c, idx)) { + deferred[nr_deferred++] = idx; /* a def read by several sources is listed once per source; re-resolving it is harmless */ + } else { + uint32_t ureg = c->ureg_map[idx]; + if (GET_UREG_TYPE(ureg) == REG_TYPE_CONST && preferred < 0) + preferred = GET_UREG_NR(ureg); + } + } + + if (nr_deferred == 0) + return; + + for (unsigned i = 0; i < nr_deferred; i++) { + uint32_t ureg = resolve_const(c, deferred[i], preferred); + if (preferred < 0 && GET_UREG_TYPE(ureg) == REG_TYPE_CONST) + preferred = GET_UREG_NR(ureg); /* later constants of this instruction follow the first one */ + } +} + +static void +emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu) +{ + struct i915_fp_compile *p = c->p; + nir_def *def = &alu->def; + + if (def->index < c->ureg_map_size && c->ureg_map[def->index] != 0) + return; + + uint32_t mask = def_mask(def); + uint32_t dest = UREG(REG_TYPE_R, i915_get_temp(p)); + set_ureg(c, def, dest); + + coalesce_constants(c, alu); + + uint32_t src0 = 0, src1 = 0, src2 = 0; + if (nir_op_infos[alu->op].num_inputs >= 1) + src0 = alu_src_ureg(c, &alu->src[0]); + if (nir_op_infos[alu->op].num_inputs >= 2) + src1 = alu_src_ureg(c, 
&alu->src[1]); + if (nir_op_infos[alu->op].num_inputs >= 3) + src2 = alu_src_ureg(c, &alu->src[2]); + + uint32_t *pre_csr = p->csr; /* remembered so the tail can tell whether exactly one instruction was emitted */ + + switch (alu->op) { + case nir_op_mov: + case nir_op_fcanonicalize: + case nir_op_fneg: { + i915_release_temp(p, GET_UREG_NR(dest)); /* no instruction is emitted: def aliases the (possibly negated) source, so the temp taken for dest is returned */ + set_ureg(c, def, alu->op == nir_op_fneg ? negate(src0, 1, 1, 1, 1) + : src0); + unsigned src_idx = alu->src[0].src.ssa->index; + if (c->last_use[src_idx] == c->ip) + c->last_use[src_idx] = c->last_use[def->index]; /* the source register now also backs def, so its last use is pushed out to def's */ + return; + } + case nir_op_fabs: + i915_emit_arith(p, A0_MAX, dest, mask, 0, + src0, negate(src0, 1, 1, 1, 1), 0); /* |x| = max(x, -x) */ + break; + case nir_op_fsat: { + nir_def *src_def = alu->src[0].src.ssa; + uint32_t *prev = c->def_csr[src_def->index]; + if (prev && list_is_singular(&src_def->uses)) { + prev[0] |= A0_DEST_SATURATE; /* fold the saturate into the single-use producer instead of emitting a MOV */ + i915_release_temp(p, GET_UREG_NR(dest)); + set_ureg(c, def, src_ureg(c, &alu->src[0].src)); + c->def_csr[def->index] = prev; + unsigned src_idx = alu->src[0].src.ssa->index; + if (c->last_use[src_idx] == c->ip) + c->last_use[src_idx] = c->last_use[def->index]; + return; + } + i915_emit_arith(p, A0_MOV, dest, mask, A0_DEST_SATURATE, src0, 0, 0); + break; + } + case nir_op_fadd: + i915_emit_arith(p, A0_ADD, dest, mask, 0, src0, src1, 0); + break; + case nir_op_fmul: + i915_emit_arith(p, A0_MUL, dest, mask, 0, src0, src1, 0); + break; + case nir_op_ffma: + i915_emit_arith(p, A0_MAD, dest, mask, 0, src0, src1, src2); + break; + case nir_op_fmin: + case nir_op_imin: + case nir_op_umin: + i915_emit_arith(p, A0_MIN, dest, mask, 0, src0, src1, 0); + break; + case nir_op_fmax: + case nir_op_imax: + case nir_op_umax: + i915_emit_arith(p, A0_MAX, dest, mask, 0, src0, src1, 0); + break; + case nir_op_ffloor: + i915_emit_arith(p, A0_FLR, dest, mask, 0, src0, 0, 0); + break; + case nir_op_ffract: + i915_emit_arith(p, A0_FRC, dest, mask, 0, src0, 0, 0); + break; + case nir_op_ftrunc: + i915_emit_arith(p, A0_TRC, dest, mask, 0, src0, 0, 0); + break; + case nir_op_fceil: { + uint32_t tmp = 
i915_get_utemp(p); + i915_emit_arith(p, A0_FLR, tmp, mask, 0, + negate(src0, 1, 1, 1, 1), 0, 0); + i915_emit_arith(p, A0_MOV, dest, mask, 0, + negate(tmp, 1, 1, 1, 1), 0, 0); /* ceil(x) = -floor(-x) */ + break; + } + case nir_op_frcp: + i915_emit_arith(p, A0_RCP, dest, mask, 0, + swizzle(src0, X, X, X, X), 0, 0); + break; + case nir_op_frsq: + i915_emit_arith(p, A0_RSQ, dest, mask, 0, + swizzle(src0, X, X, X, X), 0, 0); + break; + case nir_op_fsqrt: { + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_RSQ, tmp, A0_DEST_CHANNEL_X, 0, + swizzle(src0, X, X, X, X), 0, 0); + i915_emit_arith(p, A0_MUL, dest, mask, 0, + src0, swizzle(tmp, X, X, X, X), 0); /* sqrt(x) computed as x * rsq(x) */ + break; + } + case nir_op_fexp2: + i915_emit_arith(p, A0_EXP, dest, mask, 0, + swizzle(src0, X, X, X, X), 0, 0); + break; + case nir_op_flog2: + i915_emit_arith(p, A0_LOG, dest, mask, 0, + swizzle(src0, X, X, X, X), 0, 0); + break; + case nir_op_fdot2: + case nir_op_fdot2_replicated: + i915_emit_arith(p, A0_DP3, dest, mask, 0, + swizzle(src0, X, Y, ZERO, ZERO), src1, 0); /* a 2-component dot product is a DP3 with src0's Z and W forced to ZERO */ + break; + case nir_op_fdot3: + case nir_op_fdot3_replicated: + i915_emit_arith(p, A0_DP3, dest, mask, 0, src0, src1, 0); + break; + case nir_op_fdot4: + case nir_op_fdot4_replicated: + i915_emit_arith(p, A0_DP4, dest, mask, 0, src0, src1, 0); + break; + case nir_op_slt: + i915_emit_arith(p, A0_SLT, dest, mask, 0, src0, src1, 0); + break; + case nir_op_sge: + i915_emit_arith(p, A0_SGE, dest, mask, 0, src0, src1, 0); + break; + case nir_op_seq: { + const uint32_t zero = + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO); + if (c->opts.seq_sne_opt && + ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK) || + (src1 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))) { /* an operand counts as zero when all four of its channel selectors match the ZERO swizzle */ + if ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK)) + src0 = src1; /* keep the non-zero operand in src0 */ + /* x == 0 <-> -abs(x) >= 0: 2 insns instead of 3 */ + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_MAX, tmp, A0_DEST_CHANNEL_ALL, 0, + src0, negate(src0, 1, 1, 1, 1), 0); + 
i915_emit_arith(p, A0_SGE, dest, mask, 0, + negate(tmp, 1, 1, 1, 1), zero, 0); + } else { + /* seq(a,b) = sge(a,b) * sge(b,a) */ + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_SGE, tmp, A0_DEST_CHANNEL_ALL, 0, + src0, src1, 0); + i915_emit_arith(p, A0_SGE, dest, mask, 0, src1, src0, 0); + i915_emit_arith(p, A0_MUL, dest, mask, 0, dest, tmp, 0); + } + break; + } + case nir_op_sne: { + const uint32_t zero = + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO); + if (c->opts.seq_sne_opt && + ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK) || + (src1 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK))) { + if ((src0 & UREG_XYZW_CHANNEL_MASK) == (zero & UREG_XYZW_CHANNEL_MASK)) + src0 = src1; /* keep the non-zero operand in src0 */ + /* x != 0 <-> -abs(x) < 0: 2 insns instead of 3 */ + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_MAX, tmp, A0_DEST_CHANNEL_ALL, 0, + src0, negate(src0, 1, 1, 1, 1), 0); + i915_emit_arith(p, A0_SLT, dest, mask, 0, + negate(tmp, 1, 1, 1, 1), zero, 0); + } else { + /* sne(a,b) = slt(a,b) + slt(b,a) */ + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0, + src0, src1, 0); + i915_emit_arith(p, A0_SLT, dest, mask, 0, src1, src0, 0); + i915_emit_arith(p, A0_ADD, dest, mask, 0, dest, tmp, 0); + } + break; + } + case nir_op_fpow: { + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_LOG, tmp, A0_DEST_CHANNEL_X, 0, + swizzle(src0, X, X, X, X), 0, 0); + i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_X, 0, tmp, src1, 0); + i915_emit_arith(p, A0_EXP, dest, mask, 0, + swizzle(tmp, X, X, X, X), 0, 0); /* pow(x, y) = exp(y * log(x)), using the same LOG/EXP opcodes as flog2/fexp2 */ + break; + } + case nir_op_bcsel: + i915_emit_arith(p, A0_CMP, dest, mask, 0, + negate(src0, 1, 1, 1, 1), src2, src1); /* same operand pattern as fcsel_gt below; NOTE(review): assumes the condition is a 0.0/1.0 float - confirm against the lowering passes */ + break; + case nir_op_fcsel_ge: + i915_emit_arith(p, A0_CMP, dest, mask, 0, src0, src1, src2); + break; + case nir_op_fcsel_gt: + i915_emit_arith(p, A0_CMP, dest, mask, 0, + negate(src0, 1, 1, 1, 1), src2, src1); /* a > 0 expressed as !(-a >= 0): negated condition with the two results swapped */ + break; + case nir_op_vec2: + case nir_op_vec3: + case 
nir_op_vec4: { + unsigned n = nir_op_infos[alu->op].num_inputs; + uint32_t srcs[4] = { 0 }; + for (unsigned i = 0; i < n; i++) + srcs[i] = alu_src_ureg(c, &alu->src[i]); + + bool same_reg = true; + for (unsigned i = 1; i < n; i++) { + if ((srcs[i] & UREG_TYPE_NR_MASK) != (srcs[0] & UREG_TYPE_NR_MASK)) { + same_reg = false; + break; + } + } + + if (same_reg) { /* every component comes from one register: the vec becomes a swizzle/negate of it and no instruction is emitted */ + uint32_t base = UREG(GET_UREG_TYPE(srcs[0]), GET_UREG_NR(srcs[0])); + uint32_t ch[4] = { X, Y, Z, W }; + int ng[4] = { 0, 0, 0, 0 }; + for (unsigned i = 0; i < n; i++) { + ch[i] = (srcs[i] >> UREG_CHANNEL_X_SHIFT) & 0x7; /* channel selector that source i feeds into X */ + ng[i] = (srcs[i] >> UREG_CHANNEL_X_NEGATE_SHIFT) & 0x1; + } + i915_release_temp(p, GET_UREG_NR(dest)); + set_ureg(c, def, negate(swizzle(base, ch[0], ch[1], ch[2], ch[3]), + ng[0], ng[1], ng[2], ng[3])); + return; + } + + /* If this vec's only consumer is a store_output, write directly + * to the output register instead of going through a temp. + * If it's a tex instruction, use a utemp to avoid phase boundaries. + */ + if (list_is_singular(&def->uses)) { + nir_src *use = list_first_entry(&def->uses, nir_src, use_link); + nir_instr *use_instr = nir_src_use_instr(use); + if (use_instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *store = + nir_instr_as_intrinsic(use_instr); + if (store->intrinsic == nir_intrinsic_store_output && + nir_intrinsic_component(store) == 0) { + nir_io_semantics sem = nir_intrinsic_io_semantics(store); + uint32_t out = sem.location == FRAG_RESULT_DEPTH + ? 
UREG(REG_TYPE_OD, 0) : UREG(REG_TYPE_OC, 0); + i915_release_temp(p, GET_UREG_NR(dest)); + dest = out; + set_ureg(c, def, dest); + } + } else if (use_instr->type == nir_instr_type_tex) { + i915_release_temp(p, GET_UREG_NR(dest)); + uint32_t utemp = i915_get_utemp(p); + dest = utemp; + set_ureg(c, def, dest); + } + } + + static const uint32_t chan_mask[] = { + A0_DEST_CHANNEL_X, A0_DEST_CHANNEL_Y, + A0_DEST_CHANNEL_Z, A0_DEST_CHANNEL_W, + }; + bool emitted[4] = { false }; /* emitted[i]: destination channel i has already been produced */ + uint32_t ch_sel[4]; + int neg_sel[4] = { 0, 0, 0, 0 }; + for (unsigned i = 0; i < n; i++) { + ch_sel[i] = (srcs[i] >> UREG_CHANNEL_X_SHIFT) & 0x7; + neg_sel[i] = (srcs[i] >> UREG_CHANNEL_X_NEGATE_SHIFT) & 0x1; + } + + /* ALU dest folding: if a vec source is a single-use ALU result in a + * temp with identity swizzle, patch that instruction to write + * directly into our dest with the right channel mask. + */ + for (unsigned i = 0; i < n; i++) { + nir_def *src_def = alu->src[i].src.ssa; + uint32_t *prev_csr = c->def_csr[src_def->index]; + if (!prev_csr) + continue; + if (GET_UREG_TYPE(srcs[i]) != REG_TYPE_R) + continue; + unsigned nc = src_def->num_components; + if (i + nc > n) + continue; + bool identity = true; + for (unsigned j = 0; j < nc && identity; j++) + identity = (j == 0 || alu->src[i + j].src.ssa == src_def) && + (alu->src[i + j].swizzle[0] == j); + if (!identity) + continue; + bool all_from_this_vec = true; + nir_foreach_use(use, src_def) { + if (nir_src_use_instr(use) != &alu->instr) { + all_from_this_vec = false; + break; + } + } + if (!all_from_this_vec) + continue; + + uint32_t fold_mask = 0; + for (unsigned j = 0; j < nc; j++) + fold_mask |= chan_mask[i + j]; + + prev_csr[0] = (prev_csr[0] & ~(A0_DEST_CHANNEL_ALL | + (0x1ff << A0_DEST_NR_SHIFT))) | + A0_DEST(dest) | fold_mask; /* retarget the producer: its destination register and channel mask in dword 0 are replaced */ + + i915_release_temp(p, GET_UREG_NR(srcs[i])); + c->ureg_map[src_def->index] = dest; + for (unsigned j = 0; j < nc; j++) + emitted[i + j] = true; + } + + /* ALU consumer fusion: if this vec feeds a single 
binary ALU op + * and the other ALU source is a single register, emit the ALU op + * per-group with partial writemasks instead of MOV+ALU. + */ + if (list_is_singular(&def->uses)) { + nir_src *use = list_first_entry(&def->uses, nir_src, use_link); + nir_instr *use_instr = nir_src_use_instr(use); + if (use_instr->type == nir_instr_type_alu) { + nir_alu_instr *consumer = nir_instr_as_alu(use_instr); + unsigned nargs = nir_op_infos[consumer->op].num_inputs; + int vec_arg = -1; /* index of the consumer source that reads this vec */ + for (unsigned a = 0; a < nargs; a++) { + if (consumer->src[a].src.ssa == def) { + vec_arg = a; + break; + } + } + uint32_t hw_op = 0; + bool can_fuse = (vec_arg >= 0); + if (can_fuse) { + switch (consumer->op) { + case nir_op_fmul: hw_op = A0_MUL; break; + case nir_op_fadd: hw_op = A0_ADD; break; + case nir_op_ffma: hw_op = A0_MAD; break; + case nir_op_fmin: case nir_op_imin: case nir_op_umin: + hw_op = A0_MIN; break; + case nir_op_fmax: case nir_op_imax: case nir_op_umax: + hw_op = A0_MAX; break; + default: can_fuse = false; break; + } + } + /* check the non-vec sources are single registers */ + uint32_t other_srcs[3] = { 0, 0, 0 }; + if (can_fuse) { + for (unsigned a = 0; a < nargs; a++) { + if ((int)a == vec_arg) + continue; + nir_def *od = consumer->src[a].src.ssa; + if (od->index >= c->ureg_map_size || + c->ureg_map[od->index] == UREG_BAD) { /* a still-deferred constant source blocks the fusion */ + can_fuse = false; + break; + } + other_srcs[a] = alu_src_ureg(c, &consumer->src[a]); + } + } + if (can_fuse) { + nir_def *cdef = &consumer->def; + uint32_t cdest = dest; /* the consumer's result is written to the register chosen for this vec */ + uint32_t cmask = def_mask(cdef); + + for (unsigned i = 0; i < n; i++) { + if (emitted[i]) + continue; + uint32_t base = UREG(GET_UREG_TYPE(srcs[i]), + GET_UREG_NR(srcs[i])); + uint32_t group_mask = chan_mask[i]; + uint32_t ch[4] = { X, Y, Z, W }; + int ng[4] = { 0, 0, 0, 0 }; + ch[i] = ch_sel[i]; + ng[i] = neg_sel[i]; + for (unsigned j = i + 1; j < n; j++) { + if (!emitted[j] && + (ch_sel[j] >= SRC_ZERO || + (srcs[j] & UREG_TYPE_NR_MASK) == + (srcs[i] & UREG_TYPE_NR_MASK))) { + 
group_mask |= chan_mask[j]; + ch[j] = ch_sel[j]; + ng[j] = neg_sel[j]; + emitted[j] = true; + } + } + uint32_t fused_src = negate( + swizzle(base, ch[0], ch[1], ch[2], ch[3]), + ng[0], ng[1], ng[2], ng[3]); + uint32_t s[3]; + for (unsigned a = 0; a < nargs; a++) + s[a] = ((int)a == vec_arg) ? fused_src + : other_srcs[a]; + i915_emit_arith(p, hw_op, cdest, + group_mask & cmask, 0, + s[0], nargs > 1 ? s[1] : 0, + nargs > 2 ? s[2] : 0); + emitted[i] = true; + } + + set_ureg(c, cdef, cdest); /* marks the consumer as already translated: emit_alu() returns early for a def whose ureg_map entry is non-zero */ + c->def_csr[cdef->index] = p->csr - 3; /* the three dwords just before p->csr, taken as the last fused instruction */ + break; + } + } + } + + /* Process real-register sources first, folding in any ZERO/ONE + * const-swizzle sources that can piggyback on the same MOV. + * Use the unswizzled base register since swizzle() composes. + */ + for (unsigned i = 0; i < n; i++) { + if (emitted[i] || ch_sel[i] >= SRC_ZERO) + continue; + uint32_t base = UREG(GET_UREG_TYPE(srcs[i]), GET_UREG_NR(srcs[i])); + uint32_t group_mask = chan_mask[i]; + uint32_t ch[4] = { X, Y, Z, W }; + int ng[4] = { 0, 0, 0, 0 }; + ch[i] = ch_sel[i]; + ng[i] = neg_sel[i]; + for (unsigned j = i + 1; j < n; j++) { + if (!emitted[j] && + (ch_sel[j] >= SRC_ZERO || + (srcs[j] & UREG_TYPE_NR_MASK) == + (srcs[i] & UREG_TYPE_NR_MASK))) { + group_mask |= chan_mask[j]; + ch[j] = ch_sel[j]; + ng[j] = neg_sel[j]; + emitted[j] = true; + } + } + i915_emit_arith(p, A0_MOV, dest, group_mask & mask, 0, + negate(swizzle(base, ch[0], ch[1], ch[2], ch[3]), + ng[0], ng[1], ng[2], ng[3]), + 0, 0); /* one MOV per distinct source register, covering every channel that register (or a ZERO/ONE selector) supplies */ + emitted[i] = true; + } + /* Any remaining const-swizzle-only sources */ + for (unsigned i = 0; i < n; i++) { + if (emitted[i]) + continue; + uint32_t group_mask = chan_mask[i]; + uint32_t ch[4] = { X, Y, Z, W }; + int ng[4] = { 0, 0, 0, 0 }; + ch[i] = ch_sel[i]; + ng[i] = neg_sel[i]; + for (unsigned j = i + 1; j < n; j++) { + if (!emitted[j]) { + group_mask |= chan_mask[j]; + ch[j] = ch_sel[j]; + ng[j] = neg_sel[j]; + emitted[j] = true; + } + } + i915_emit_arith(p, A0_MOV, dest, group_mask & mask, 0, + 
negate(swizzle(srcs[i], ch[0], ch[1], ch[2], ch[3]), + ng[0], ng[1], ng[2], ng[3]), + 0, 0); + emitted[i] = true; + } + break; + } + case nir_op_fsign: { + uint32_t tmp = i915_get_utemp(p); + const uint32_t zero = swizzle(UREG(REG_TYPE_R, 0), + ZERO, ZERO, ZERO, ZERO); + i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0, + src0, zero, 0); + i915_emit_arith(p, A0_SLT, dest, mask, 0, zero, src0, 0); + i915_emit_arith(p, A0_ADD, dest, mask, 0, + dest, negate(tmp, 1, 1, 1, 1), 0); + break; + } + default: + i915_program_error(p, "unsupported NIR ALU op: %s", + nir_op_infos[alu->op].name); + break; + } + + if (p->csr == pre_csr + 3) + c->def_csr[def->index] = pre_csr; + + uint32_t save = 0; + if (GET_UREG_TYPE(dest) == REG_TYPE_U) + save = p->utemp_flag & (1 << GET_UREG_NR(dest)); + i915_release_utemps(p); + p->utemp_flag |= save; +} + +static uint32_t +translate_tex_type(struct i915_fp_compile *p, enum glsl_sampler_dim dim) +{ + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_EXTERNAL: + return D0_SAMPLE_TYPE_2D; + case GLSL_SAMPLER_DIM_3D: + return D0_SAMPLE_TYPE_VOLUME; + case GLSL_SAMPLER_DIM_CUBE: + return D0_SAMPLE_TYPE_CUBE; + default: + i915_program_error(p, "unsupported sampler dim %d", dim); + return D0_SAMPLE_TYPE_2D; + } +} + +static uint32_t +tex_coord_mask(nir_tex_instr *tex) +{ + uint32_t mask = TGSI_WRITEMASK_X; + + switch (tex->sampler_dim) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_EXTERNAL: + mask = TGSI_WRITEMASK_XY; + break; + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + mask = TGSI_WRITEMASK_XYZ; + break; + default: + break; + } + + if (tex->is_shadow) + mask |= TGSI_WRITEMASK_Z; + + if (tex->op == nir_texop_txb) + mask |= TGSI_WRITEMASK_W; + + for (unsigned i = 0; i < tex->num_srcs; i++) { + if (tex->src[i].src_type == nir_tex_src_projector) { + mask |= TGSI_WRITEMASK_W; + break; + } + } 
+ + return mask; +} + +static void +emit_tex(struct nir_to_i915 *c, nir_tex_instr *tex) +{ + struct i915_fp_compile *p = c->p; + nir_def *def = &tex->def; + uint32_t dest = UREG(REG_TYPE_R, i915_get_temp(p)); + set_ureg(c, def, dest); + + uint32_t hw_tex = translate_tex_type(p, tex->sampler_dim); + uint32_t sampler = i915_emit_decl(p, REG_TYPE_S, tex->sampler_index, hw_tex); + + uint32_t coord = 0; + uint32_t bias_or_proj = 0; + uint32_t shadow = 0; + bool has_bias = false, has_proj = false, has_shadow = false; + + for (unsigned i = 0; i < tex->num_srcs; i++) { + switch (tex->src[i].src_type) { + case nir_tex_src_coord: + coord = src_ureg(c, &tex->src[i].src); + break; + case nir_tex_src_bias: + bias_or_proj = src_ureg(c, &tex->src[i].src); + has_bias = true; + break; + case nir_tex_src_projector: + bias_or_proj = src_ureg(c, &tex->src[i].src); + has_proj = true; + break; + case nir_tex_src_comparator: + shadow = src_ureg(c, &tex->src[i].src); + has_shadow = true; + break; + default: + break; + } + } + + /* 1D textures: set Y = X so LOD works correctly when sampled as 2D */ + if (tex->sampler_dim == GLSL_SAMPLER_DIM_1D) + coord = swizzle(coord, X, X, Z, W); + + /* pack bias/projector/shadow into a single coord register if needed */ + if (has_bias || has_proj || has_shadow) { + uint32_t tmp = UREG(REG_TYPE_R, i915_get_temp(p)); + + i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_ALL, 0, coord, 0, 0); + + if (has_shadow) + i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_Z, 0, + swizzle(shadow, X, X, X, X), 0, 0); + + if (has_bias || has_proj) + i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_W, 0, + swizzle(bias_or_proj, X, X, X, X), 0, 0); + + coord = tmp; + } + + uint32_t opcode; + if (tex->op == nir_texop_txb) { + opcode = T0_TEXLDB; + } else if (has_proj) { + opcode = T0_TEXLDP; + } else if (tex->op == nir_texop_tex) { + opcode = T0_TEXLD; + } else { + i915_program_error(p, "unsupported tex op %d", tex->op); + return; + } + + i915_emit_texld(p, dest, 
A0_DEST_CHANNEL_ALL, sampler, coord, opcode, + tex_coord_mask(tex)); + + i915_release_utemps(p); +} + +static void +emit_intrinsic(struct nir_to_i915 *c, nir_intrinsic_instr *intr) +{ + struct i915_fp_compile *p = c->p; + struct i915_fragment_shader *ifs = c->ifs; + + switch (intr->intrinsic) { + case nir_intrinsic_load_input: { + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + unsigned comp = nir_intrinsic_component(intr); + uint32_t reg = emit_input(c, sem.location); + + if (comp > 0) { + reg = swizzle(reg, comp, MIN2(comp + 1, 3), + MIN2(comp + 2, 3), MIN2(comp + 3, 3)); + } + + set_ureg(c, &intr->def, reg); + break; + } + + case nir_intrinsic_store_output: { + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + unsigned comp = nir_intrinsic_component(intr); + uint32_t val = src_ureg(c, &intr->src[0]); + uint32_t wm = nir_intrinsic_write_mask(intr); + uint32_t dest; + + if (sem.location == FRAG_RESULT_DEPTH) { + dest = UREG(REG_TYPE_OD, 0); + } else { + dest = UREG(REG_TYPE_OC, 0); + } + + /* Vec direct-output already wrote to oC/oD */ + uint32_t val_type = GET_UREG_TYPE(val); + if (val_type == REG_TYPE_OC || val_type == REG_TYPE_OD) + break; + + nir_def *src_def = intr->src[0].ssa; + uint32_t *prev = c->def_csr[src_def->index]; + + /* Look through identity vec (same_reg case emits no instructions). + * Check that all uses of the underlying def come from this vec. 
+ */ + bool looked_through_vec = false; + if (!prev) { + nir_instr *def_instr = nir_def_instr_nonconst(src_def); + if (def_instr->type == nir_instr_type_alu) { + nir_alu_instr *vec = nir_instr_as_alu(def_instr); + if ((vec->op == nir_op_vec4 || vec->op == nir_op_vec3 || + vec->op == nir_op_vec2) && + list_is_singular(&src_def->uses)) { + nir_def *inner = vec->src[0].src.ssa; + bool all_from_vec = true; + nir_foreach_use(use, inner) { + if (nir_src_use_instr(use) != def_instr) { + all_from_vec = false; + break; + } + } + if (all_from_vec) { + src_def = inner; + prev = c->def_csr[src_def->index]; + looked_through_vec = true; + } + } + } + } + + if (prev && comp == 0 && + (looked_through_vec || list_is_singular(&src_def->uses))) { + prev[0] = (prev[0] & ~(A0_DEST_CHANNEL_ALL | + (0x1ff << A0_DEST_NR_SHIFT))) | + A0_DEST(dest) | writemask_to_mask(wm); + break; + } + + if (comp > 0) { + uint32_t s[4] = { X, Y, Z, W }; + for (int i = 3; i >= (int)comp; i--) + s[i] = s[i - comp]; + for (unsigned i = 0; i < comp; i++) + s[i] = ZERO; + val = swizzle(val, s[0], s[1], s[2], s[3]); + wm <<= comp; + } + + i915_emit_arith(p, A0_MOV, dest, writemask_to_mask(wm), 0, + val, 0, 0); + break; + } + + case nir_intrinsic_load_ubo: { + nir_src *offset_src = &intr->src[1]; + if (!nir_src_is_const(*offset_src)) { + i915_program_error(p, "non-constant UBO offset"); + set_ureg(c, &intr->def, + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO)); + break; + } + unsigned byte_offset = (unsigned)nir_src_as_float(*offset_src); + unsigned slot = byte_offset / 16; + unsigned comp = (byte_offset % 16) / 4; + + if (slot >= I915_MAX_CONSTANT) { + i915_program_error(p, "UBO offset %d exceeds max constants", slot); + set_ureg(c, &intr->def, + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO)); + break; + } + + for (unsigned i = 0; i < intr->def.num_components; i++) + ifs->constant_flags[slot] |= I915_CONSTFLAG_USER_CH(comp + i); + ifs->num_constants = MAX2(ifs->num_constants, slot + 1); + + 
uint32_t reg = UREG(REG_TYPE_CONST, slot); + if (comp > 0) { + uint32_t s[4]; + for (unsigned i = 0; i < 4; i++) + s[i] = MIN2(comp + i, 3); + reg = swizzle(reg, s[0], s[1], s[2], s[3]); + } + + set_ureg(c, &intr->def, reg); + break; + } + + case nir_intrinsic_load_ubo_vec4: { + nir_src *offset_src = &intr->src[1]; + if (!nir_src_is_const(*offset_src)) { + i915_program_error(p, "non-constant UBO offset"); + set_ureg(c, &intr->def, + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO)); + break; + } + unsigned slot = nir_intrinsic_base(intr) + + (unsigned)nir_src_as_float(*offset_src); + unsigned comp = nir_intrinsic_component(intr); + + if (slot >= I915_MAX_CONSTANT) { + i915_program_error(p, "UBO slot %d exceeds max constants", slot); + set_ureg(c, &intr->def, + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO)); + break; + } + + for (unsigned i = 0; i < intr->def.num_components; i++) + ifs->constant_flags[slot] |= I915_CONSTFLAG_USER_CH(comp + i); + ifs->num_constants = MAX2(ifs->num_constants, slot + 1); + + uint32_t reg = UREG(REG_TYPE_CONST, slot); + if (comp > 0) { + uint32_t s[4]; + for (unsigned i = 0; i < 4; i++) + s[i] = MIN2(comp + i, 3); + reg = swizzle(reg, s[0], s[1], s[2], s[3]); + } + + set_ureg(c, &intr->def, reg); + break; + } + + case nir_intrinsic_terminate: + case nir_intrinsic_demote: { + uint32_t tmp = i915_get_utemp(p); + i915_emit_texld(p, tmp, A0_DEST_CHANNEL_ALL, 0, + negate(swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE), + 1, 1, 1, 1), + T0_TEXKILL, TGSI_WRITEMASK_X); + i915_release_utemps(p); + break; + } + + case nir_intrinsic_terminate_if: + case nir_intrinsic_demote_if: { + uint32_t cond = src_ureg(c, &intr->src[0]); + uint32_t tmp = i915_get_utemp(p); + i915_emit_texld(p, tmp, A0_DEST_CHANNEL_ALL, 0, + negate(swizzle(cond, X, X, X, X), 1, 1, 1, 1), + T0_TEXKILL, TGSI_WRITEMASK_XYZW); + i915_release_utemps(p); + break; + } + + case nir_intrinsic_ddx: + case nir_intrinsic_ddy: + case nir_intrinsic_ddx_coarse: + case 
nir_intrinsic_ddy_coarse: + case nir_intrinsic_ddx_fine: + case nir_intrinsic_ddy_fine: + set_ureg(c, &intr->def, + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO)); + break; + + default: + i915_program_error(p, "unsupported intrinsic: %s", + nir_intrinsic_infos[intr->intrinsic].name); + break; + } +} + +static void +emit_instr(struct nir_to_i915 *c, nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_load_const: + emit_load_const(c, nir_instr_as_load_const(instr)); + break; + case nir_instr_type_alu: + emit_alu(c, nir_instr_as_alu(instr)); + break; + case nir_instr_type_tex: + emit_tex(c, nir_instr_as_tex(instr)); + break; + case nir_instr_type_intrinsic: + emit_intrinsic(c, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_undef: { + nir_undef_instr *undef = nir_instr_as_undef(instr); + set_ureg(c, &undef->def, + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO)); + break; + } + case nir_instr_type_jump: + case nir_instr_type_deref: + break; + default: + i915_program_error(c->p, "unsupported NIR instruction type %d", + instr->type); + break; + } +} + +static void +fixup_depth_write(struct nir_to_i915 *c, nir_shader *s) +{ + if (!(s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))) + return; + + /* NIR writes depth to OD.X (component 0); hardware reads from OD.W */ + i915_emit_arith(c->p, A0_MOV, + UREG(REG_TYPE_OD, 0), A0_DEST_CHANNEL_W, 0, + swizzle(UREG(REG_TYPE_OD, 0), X, Y, Z, X), + 0, 0); +} + +void +i915_translate_fragment_program_nir(struct i915_context *i915, + struct i915_fragment_shader *ifs, + nir_shader *s, + const struct corm_compile_opts *opts) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(s); + bool debug = I915_DBG_ON(DBG_FS) && + (!ifs->internal || NIR_DEBUG(PRINT_INTERNAL)); + + if (debug) { + mesa_logi("NIR fragment shader:"); + nir_log_shaderi(s); + } + + struct i915_fp_compile *p = CALLOC_STRUCT(i915_fp_compile); + p->shader = ifs; + p->error = ralloc_strdup(NULL, ""); + 
p->log_program_errors = !ifs->internal; + + ifs->num_constants = 0; + memset(ifs->constant_flags, 0, sizeof(ifs->constant_flags)); + memset(p->register_phases, 0, sizeof(p->register_phases)); + + for (int i = 0; i < I915_TEX_UNITS; i++) + ifs->texcoords[i].semantic = -1; + + p->nr_tex_indirect = 1; + p->nr_tex_insn = 0; + p->nr_alu_insn = 0; + p->nr_decl_insn = 0; + p->csr = p->program; + p->decl = p->declarations; + p->decl_s = 0; + p->decl_t = 0; + p->temp_flag = ~0x0U << I915_MAX_TEMPORARY; + p->utemp_flag = ~0x7; + + *(p->decl++) = _3DSTATE_PIXEL_SHADER_PROGRAM; + + struct nir_to_i915 c = { + .p = p, + .ifs = ifs, + .opts = *opts, + .ureg_map_size = impl->ssa_alloc, + .ureg_map = CALLOC(impl->ssa_alloc, sizeof(uint32_t)), + .def_csr = CALLOC(impl->ssa_alloc, sizeof(uint32_t *)), + .deferred_const = CALLOC(impl->ssa_alloc, sizeof(float)), + .last_use = CALLOC(impl->ssa_alloc, sizeof(int)), + }; + + memset(c.last_use, -1, impl->ssa_alloc * sizeof(int)); + compute_last_use(&c, impl); + + c.ip = 0; + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + emit_instr(&c, instr); + if (p->error[0]) + break; + release_dead_temps(&c, instr); + c.ip++; + } + if (p->error[0]) + break; + } + + if (!p->error[0]) + fixup_depth_write(&c, s); + + /* finalize */ + if (p->nr_tex_indirect > I915_MAX_TEX_INDIRECT) + i915_program_error(p, "exceeded max tex indirect (%d/%d)", + p->nr_tex_indirect, I915_MAX_TEX_INDIRECT); + if (p->nr_tex_insn > I915_MAX_TEX_INSN) + i915_program_error(p, "exceeded max tex insn (%d/%d)", + p->nr_tex_insn, I915_MAX_TEX_INSN); + if (p->nr_alu_insn > I915_MAX_ALU_INSN) + i915_program_error(p, "exceeded max ALU insn (%d/%d)", + p->nr_alu_insn, I915_MAX_ALU_INSN); + if (p->nr_decl_insn > I915_MAX_DECL_INSN) + i915_program_error(p, "exceeded max decl insn (%d/%d)", + p->nr_decl_insn, I915_MAX_DECL_INSN); + + if (p->nr_alu_insn == 0 && p->nr_tex_insn == 0) { + i915_use_passthrough_shader(ifs); + ifs->nr_alu_insn = 1; + goto cleanup; + } + + 
ifs->nr_alu_insn = p->nr_alu_insn; + ifs->nr_tex_insn = p->nr_tex_insn; + ifs->nr_tex_indirect = p->nr_tex_indirect; + ifs->nr_temps = util_bitcount(p->temp_flag); + + { + unsigned long program_size = (unsigned long)(p->csr - p->program); + unsigned long decl_size = (unsigned long)(p->decl - p->declarations); + + p->declarations[0] |= program_size + decl_size - 2; + + assert(!ifs->program); + ifs->program_len = decl_size + program_size; + ifs->program = (uint32_t *)MALLOC(ifs->program_len * sizeof(uint32_t)); + memcpy(ifs->program, p->declarations, decl_size * sizeof(uint32_t)); + memcpy(&ifs->program[decl_size], p->program, + program_size * sizeof(uint32_t)); + + if (p->error[0]) { + /* dump the program for debugging, then replace with passthrough */ + if (debug && ifs->program_len > 2) { + mesa_logi("FAILED program (%d ALU):", p->nr_alu_insn); + i915_disassemble_program(ifs->program, ifs->program_len); + } + FREE(ifs->program); + ifs->program = NULL; + ifs->program_len = 0; + i915_use_passthrough_shader(ifs); + } + } + +cleanup: + if (p->error[0]) + ifs->error = p->error; + else + ralloc_free(p->error); + + FREE(c.last_use); + FREE(c.deferred_const); + FREE(c.def_csr); + FREE(c.ureg_map); + FREE(p); + + if (debug) { + if (ifs->error) + mesa_loge("%s", ifs->error); + + mesa_logi("i915 fragment shader with %d constants%s", + ifs->num_constants, ifs->num_constants ? 
":" : ""); + + for (int i = 0; i < I915_MAX_CONSTANT; i++) { + if (ifs->constant_flags[i] & 0x0f) { + mesa_logi("\t\tC[%d] = { %f, %f, %f, %f }", i, + ifs->constants[i][0], ifs->constants[i][1], + ifs->constants[i][2], ifs->constants[i][3]); + } + } + i915_disassemble_program(ifs->program, ifs->program_len); + } +} diff --git a/src/gallium/drivers/i915/i915_fpc_optimize.c b/src/gallium/drivers/i915/i915_fpc_optimize.c index b4ae362dfef..731f2444fec 100644 --- a/src/gallium/drivers/i915/i915_fpc_optimize.c +++ b/src/gallium/drivers/i915/i915_fpc_optimize.c @@ -405,6 +405,8 @@ i915_fpc_optimize_mov_before_tex(struct i915_optimize_context *ctx, target_is_texture2d(next->FullInstruction.Texture.Texture) && same_src_dst_reg(&next->FullInstruction.Src[0], ¤t->FullInstruction.Dst[0]) && + (current->FullInstruction.Dst[0].Register.WriteMask & + i915_tex_mask(next)) == i915_tex_mask(next) && is_unswizzled(¤t->FullInstruction.Src[0], i915_tex_mask(next)) && unused_from(ctx, ¤t->FullInstruction.Dst[0], index)) { memcpy(&next->FullInstruction.Src[0], ¤t->FullInstruction.Src[0], diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c index b6cfb2a3dfb..9277e55e9e3 100644 --- a/src/gallium/drivers/i915/i915_fpc_translate.c +++ b/src/gallium/drivers/i915/i915_fpc_translate.c @@ -54,55 +54,9 @@ * Simple pass-through fragment shader to use when we don't have * a real shader (or it fails to compile for some reason). 
*/ -static unsigned passthrough_program[] = { - _3DSTATE_PIXEL_SHADER_PROGRAM | ((1 * 3) - 1), - /* move to output color: - */ - (A0_MOV | (REG_TYPE_OC << A0_DEST_TYPE_SHIFT) | A0_DEST_CHANNEL_ALL | - (REG_TYPE_R << A0_SRC0_TYPE_SHIFT) | (0 << A0_SRC0_NR_SHIFT)), - ((SRC_ONE << A1_SRC0_CHANNEL_X_SHIFT) | - (SRC_ZERO << A1_SRC0_CHANNEL_Y_SHIFT) | - (SRC_ZERO << A1_SRC0_CHANNEL_Z_SHIFT) | - (SRC_ONE << A1_SRC0_CHANNEL_W_SHIFT)), - 0}; - /** * component-wise negation of ureg */ -static inline int -negate(int reg, int x, int y, int z, int w) -{ - /* Another neat thing about the UREG representation */ - return reg ^ (((x & 1) << UREG_CHANNEL_X_NEGATE_SHIFT) | - ((y & 1) << UREG_CHANNEL_Y_NEGATE_SHIFT) | - ((z & 1) << UREG_CHANNEL_Z_NEGATE_SHIFT) | - ((w & 1) << UREG_CHANNEL_W_NEGATE_SHIFT)); -} - -/** - * In the event of a translation failure, we'll generate a simple color - * pass-through program. - */ -static void -i915_use_passthrough_shader(struct i915_fragment_shader *fs) -{ - fs->program = (uint32_t *)MALLOC(sizeof(passthrough_program)); - if (fs->program) { - memcpy(fs->program, passthrough_program, sizeof(passthrough_program)); - fs->program_len = ARRAY_SIZE(passthrough_program); - } - fs->num_constants = 0; -} - -void -i915_program_error(struct i915_fp_compile *p, const char *msg, ...) -{ - va_list args; - va_start(args, msg); - ralloc_vasprintf_append(&p->error, msg, args); - va_end(args); -} - static uint32_t get_mapping(struct i915_fragment_shader *fs, enum tgsi_semantic semantic, int index) @@ -1006,12 +960,11 @@ i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p) p->nr_decl_insn, I915_MAX_DECL_INSN); } - /* hw doesn't seem to like empty frag programs (num_instructions == 1 is just - * TGSI_END), even when the depth write fixup gets emitted below - maybe that - * one is fishy, too? 
- */ - if (ifs->info.num_instructions == 1) - i915_program_error(p, "Empty fragment shader"); + if (ifs->info.num_instructions == 1) { + i915_use_passthrough_shader(ifs); + ifs->nr_alu_insn = 1; + goto done; + } if (strlen(p->error) != 0) { i915_use_passthrough_shader(ifs); @@ -1024,6 +977,10 @@ i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p) assert(!ifs->program); ifs->program_len = decl_size + program_size; + ifs->nr_alu_insn = p->nr_alu_insn; + ifs->nr_tex_insn = p->nr_tex_insn; + ifs->nr_tex_indirect = p->nr_tex_indirect; + ifs->nr_temps = util_bitcount(p->temp_flag); ifs->program = (uint32_t *)MALLOC(ifs->program_len * sizeof(uint32_t)); memcpy(ifs->program, p->declarations, decl_size * sizeof(uint32_t)); memcpy(&ifs->program[decl_size], p->program, @@ -1032,14 +989,16 @@ i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p) if (i915) { util_debug_message( &i915->debug, SHADER_INFO, - "%s shader: %d inst, %d tex, %d tex_indirect, %d temps, %d const", + "%s shader: %d instructions, %d alu, %d tex, %d tex_indirect, " + "%d temps, %d const", _mesa_shader_stage_to_abbrev(MESA_SHADER_FRAGMENT), - (int)program_size, p->nr_tex_insn, p->nr_tex_indirect, - p->shader->info.file_max[TGSI_FILE_TEMPORARY] + 1, - ifs->num_constants); + ifs->nr_alu_insn + ifs->nr_tex_insn, + ifs->nr_alu_insn, ifs->nr_tex_insn, ifs->nr_tex_indirect, + ifs->nr_temps, ifs->num_constants); } } +done: if (strlen(p->error) != 0) ifs->error = p->error; else diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index 17db0d34034..df43fb05149 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -176,6 +176,8 @@ i915_optimize_nir(struct nir_shader *s) { bool progress; + NIR_PASS(_, s, nir_lower_int_to_float); + do { progress = false; @@ -212,6 +214,11 @@ i915_optimize_nir(struct nir_shader *s) } while (progress); + NIR_PASS(_, s, nir_lower_alu_to_scalar, NULL, NULL); + 
NIR_PASS(_, s, nir_lower_bool_to_float, false); + NIR_PASS(_, s, nir_opt_algebraic); + NIR_PASS(_, s, nir_opt_dce); + NIR_PASS(progress, s, nir_remove_dead_variables, nir_var_function_temp, NULL); diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c index 8d786c02e41..24adc396241 100644 --- a/src/gallium/drivers/i915/i915_state.c +++ b/src/gallium/drivers/i915/i915_state.c @@ -31,7 +31,9 @@ #include "compiler/nir/nir_builder.h" #include "draw/draw_context.h" #include "nir/nir_to_tgsi.h" +#include "tgsi/tgsi_from_mesa.h" #include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_scan.h" #include "util/u_helpers.h" #include "util/u_inlines.h" #include "util/u_math.h" @@ -542,6 +544,37 @@ static const struct nir_to_tgsi_options ntt_options = { .lower_fabs = true, }; +static int +type_size(const struct glsl_type *type, bool bindless) +{ + return glsl_count_attribute_slots(type, false); +} + +static bool +scalarize_vector_bools(const nir_instr *instr, const void *data) +{ + if (instr->type != nir_instr_type_alu) + return false; + nir_alu_instr *alu = nir_instr_as_alu(instr); + return alu->op == nir_op_bcsel || + alu->op == nir_op_fcsel_ge || + alu->op == nir_op_fcsel_gt; +} + +static bool +lower_fsqrt_filter(const nir_instr *instr, UNUSED const void *data) +{ + return instr->type == nir_instr_type_alu && + nir_instr_as_alu(instr)->op == nir_op_fsqrt; +} + +static nir_def * +lower_fsqrt_impl(nir_builder *b, nir_instr *instr, UNUSED void *data) +{ + nir_def *src = nir_instr_as_alu(instr)->src[0].src.ssa; + return nir_fmul(b, src, nir_frsq(b, src)); +} + static char * i915_check_control_flow(nir_shader *s) { @@ -565,6 +598,94 @@ i915_check_control_flow(nir_shader *s) return NULL; } +enum i915_fs_mode { + I915_FS_TGSI, + I915_FS_NIR, + I915_FS_BOTH, +}; + +static enum i915_fs_mode +i915_get_fs_mode(void) +{ + const char *env = debug_get_option("I915_FS", "both"); + if (!strcmp(env, "tgsi")) + return I915_FS_TGSI; + if (!strcmp(env, "nir")) + 
return I915_FS_NIR; + return I915_FS_BOTH; +} + +static void +i915_populate_fs_metadata(struct i915_fragment_shader *ifs, nir_shader *s) +{ + ifs->num_inputs = 0; + ifs->writes_z = s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH); + + nir_foreach_shader_in_variable(var, s) { + unsigned sem_name, sem_index; + tgsi_get_gl_varying_semantic((gl_varying_slot)var->data.location, true, + &sem_name, &sem_index); + unsigned idx = ifs->num_inputs++; + ifs->input_semantic_name[idx] = sem_name; + ifs->input_semantic_index[idx] = sem_index; + } +} + +static void +i915_compile_tgsi(struct i915_context *i915, + struct i915_fragment_shader *ifs, + struct pipe_screen *screen, + nir_shader *nir_clone) +{ + ifs->state.tokens = nir_to_tgsi_options(nir_clone, screen, &ntt_options); + ifs->state.type = PIPE_SHADER_IR_TGSI; + tgsi_scan_shader(ifs->state.tokens, &ifs->info); + i915_translate_fragment_program(i915, ifs); +} + +static bool +corm_fs_better(const struct i915_fragment_shader *a, + const struct i915_fragment_shader *b) +{ + if (a->nr_tex_indirect != b->nr_tex_indirect) + return a->nr_tex_indirect < b->nr_tex_indirect; + if (a->nr_alu_insn != b->nr_alu_insn) + return a->nr_alu_insn < b->nr_alu_insn; + if (a->nr_temps != b->nr_temps) + return a->nr_temps < b->nr_temps; + return a->num_constants < b->num_constants; +} + +static const char * +corm_win_reason(const struct i915_fragment_shader *winner, + const struct i915_fragment_shader *loser, + char *buf, size_t len) +{ + if (!loser) { + snprintf(buf, len, "only"); + return buf; + } + int da = (int)winner->nr_alu_insn - (int)loser->nr_alu_insn; + int dp = (int)winner->nr_tex_indirect - (int)loser->nr_tex_indirect; + int dt = (int)winner->nr_temps - (int)loser->nr_temps; + if (dp != 0) + snprintf(buf, len, "%+d phase", dp); + else if (da != 0) + snprintf(buf, len, "%+d alu", da); + else if (dt != 0) + snprintf(buf, len, "%+d temps", dt); + else if ((int)winner->num_constants != (int)loser->num_constants) + snprintf(buf, 
len, "%+d const", + (int)winner->num_constants - (int)loser->num_constants); + else if (winner->program_len == loser->program_len && + !memcmp(winner->program, loser->program, + winner->program_len * sizeof(uint32_t))) + snprintf(buf, len, "identical"); + else + snprintf(buf, len, "tied"); + return buf; +} + static void * i915_create_fs_state(struct pipe_context *pipe, const struct pipe_shader_state *templ) @@ -576,39 +697,222 @@ i915_create_fs_state(struct pipe_context *pipe, ifs->draw_data = draw_create_fragment_shader(i915->draw, templ); - if (templ->type == PIPE_SHADER_IR_NIR) { - nir_shader *s = templ->ir.nir; - ifs->internal = s->info.internal; - - char *msg = i915_check_control_flow(s); - if (msg) { - if (I915_DBG_ON(DBG_FS) && - (!s->info.internal || NIR_DEBUG(PRINT_INTERNAL))) { - mesa_logi("failing shader:"); - nir_log_shaderi(s); - } - if (templ->report_compile_error) { - ((struct pipe_shader_state *)templ)->error_message = strdup(msg); - ralloc_free(s); - i915_delete_fs_state(NULL, ifs); - return NULL; - } - } - - ifs->state.tokens = nir_to_tgsi_options(s, pipe->screen, &ntt_options); - } else { - assert(templ->type == PIPE_SHADER_IR_TGSI); - /* we need to keep a local copy of the tokens */ + if (templ->type == PIPE_SHADER_IR_TGSI) { ifs->state.tokens = tgsi_dup_tokens(templ->tokens); + ifs->state.type = PIPE_SHADER_IR_TGSI; ifs->internal = i915->no_log_program_errors; + tgsi_scan_shader(ifs->state.tokens, &ifs->info); + i915_translate_fragment_program(i915, ifs); + return ifs; } - ifs->state.type = PIPE_SHADER_IR_TGSI; + assert(templ->type == PIPE_SHADER_IR_NIR); + nir_shader *s = templ->ir.nir; + ifs->internal = s->info.internal; - tgsi_scan_shader(ifs->state.tokens, &ifs->info); + bool debug = I915_DBG_ON(DBG_FS) && + (!s->info.internal || NIR_DEBUG(PRINT_INTERNAL)); + + char *msg = i915_check_control_flow(s); + if (msg) { + if (debug) { + mesa_logi("failing shader:"); + nir_log_shaderi(s); + } + if (templ->report_compile_error) { + ((struct 
pipe_shader_state *)templ)->error_message = strdup(msg); + ralloc_free(s); + i915_delete_fs_state(NULL, ifs); + return NULL; + } + } + + static enum i915_fs_mode fs_mode = -1; + if (fs_mode == (enum i915_fs_mode)-1) + fs_mode = i915_get_fs_mode(); + + bool try_nir = (fs_mode == I915_FS_NIR || fs_mode == I915_FS_BOTH); + bool try_tgsi = (fs_mode == I915_FS_TGSI || fs_mode == I915_FS_BOTH); + + struct i915_fragment_shader tgsi_fs = {0}; + + static const struct corm_compile_opts corm_variants[] = { + { .deferred_const = false, .seq_sne_opt = false }, + { .deferred_const = false, .seq_sne_opt = true }, + { .deferred_const = true, .seq_sne_opt = false }, + { .deferred_const = true, .seq_sne_opt = true }, + { .deferred_const = false, .seq_sne_opt = false, .late_scalar = true }, + { .deferred_const = false, .seq_sne_opt = true, .late_scalar = true }, + { .deferred_const = true, .seq_sne_opt = false, .late_scalar = true }, + { .deferred_const = true, .seq_sne_opt = true, .late_scalar = true }, + }; + + struct i915_fragment_shader nir_results[ARRAY_SIZE(corm_variants)]; + int best_nir = -1; + + if (try_nir) { + nir_shader *nir_s = try_tgsi ? 
nir_shader_clone(NULL, s) : s; + NIR_PASS(_, nir_s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size, (nir_lower_io_options)0); + NIR_PASS(_, nir_s, nir_lower_alu_to_scalar, scalarize_vector_bools, NULL); + NIR_PASS(_, nir_s, nir_opt_vectorize, NULL, NULL); + NIR_PASS(_, nir_s, nir_lower_bool_to_float, false); + NIR_PASS(_, nir_s, nir_shader_lower_instructions, lower_fsqrt_filter, + lower_fsqrt_impl, NULL); + NIR_PASS(_, nir_s, nir_opt_copy_prop); + NIR_PASS(_, nir_s, nir_opt_cse); + NIR_PASS(_, nir_s, nir_opt_dce); + NIR_PASS(_, nir_s, nir_opt_algebraic); + NIR_PASS(_, nir_s, nir_opt_algebraic_late); + NIR_PASS(_, nir_s, nir_opt_dce); + NIR_PASS(_, nir_s, nir_opt_shrink_vectors, false); + NIR_PASS(_, nir_s, nir_opt_copy_prop); + NIR_PASS(_, nir_s, nir_opt_dce); + nir_index_ssa_defs(nir_shader_get_entrypoint(nir_s)); + + for (unsigned v = 0; v < ARRAY_SIZE(corm_variants); v++) { + nir_shader *variant_nir = nir_shader_clone(NULL, nir_s); + if (corm_variants[v].late_scalar) { + NIR_PASS(_, variant_nir, nir_lower_alu_to_scalar, NULL, NULL); + NIR_PASS(_, variant_nir, nir_opt_copy_prop); + NIR_PASS(_, variant_nir, nir_opt_algebraic); + NIR_PASS(_, variant_nir, nir_opt_dce); + nir_index_ssa_defs(nir_shader_get_entrypoint(variant_nir)); + } + memset(&nir_results[v], 0, sizeof(nir_results[v])); + i915_populate_fs_metadata(&nir_results[v], variant_nir); + i915_translate_fragment_program_nir(i915, &nir_results[v], + variant_nir, &corm_variants[v]); + ralloc_free(variant_nir); + + bool ok = !nir_results[v].error || !nir_results[v].error[0]; + if (ok && (best_nir < 0 || + corm_fs_better(&nir_results[v], &nir_results[best_nir]))) + best_nir = v; + } + + ralloc_free(nir_s); + } + + if (try_tgsi) { + i915_compile_tgsi(i915, &tgsi_fs, pipe->screen, s); + } else { + ralloc_free(s); + } + + bool nir_ok = best_nir >= 0; + bool tgsi_ok = try_tgsi && (!tgsi_fs.error || !tgsi_fs.error[0]); + struct i915_fragment_shader *best_nir_fs = nir_ok ? 
&nir_results[best_nir] : NULL; + + bool use_nir; + if (nir_ok && tgsi_ok) + use_nir = !corm_fs_better(&tgsi_fs, best_nir_fs); + else + use_nir = nir_ok; + + if (debug && try_nir && try_tgsi) { + for (unsigned v = 0; v < ARRAY_SIZE(corm_variants); v++) { + bool ok = !nir_results[v].error || !nir_results[v].error[0]; + mesa_logi(" NIR[dc=%d,ss=%d]: %s (%d ALU, %d phase, %d temps)%s", + corm_variants[v].deferred_const, + corm_variants[v].seq_sne_opt, + ok ? "ok" : "FAIL", + ok ? nir_results[v].nr_alu_insn : 0, + ok ? nir_results[v].nr_tex_indirect : 0, + ok ? nir_results[v].nr_temps : 0, + (int)v == best_nir ? " *" : ""); + } + mesa_logi(" TGSI: %s (%d ALU, %d phase, %d temps)", + tgsi_ok ? "ok" : "FAIL", + tgsi_ok ? tgsi_fs.nr_alu_insn : 0, + tgsi_ok ? tgsi_fs.nr_tex_indirect : 0, + tgsi_ok ? tgsi_fs.nr_temps : 0); + mesa_logi(" -> %s%s", use_nir ? "NIR" : "TGSI", + use_nir ? (corm_fs_better(best_nir_fs, &tgsi_fs) + ? " (better)" : " (tied)") : ""); + } + + /* Free non-winning NIR variants */ + if (try_nir) { + for (unsigned v = 0; v < ARRAY_SIZE(corm_variants); v++) { + if ((int)v != best_nir) { + FREE(nir_results[v].program); + ralloc_free(nir_results[v].error); + } + } + } + + struct i915_fragment_shader *winner, *loser = NULL; + struct i915_fragment_shader nir_loser_copy = {0}; + if (use_nir) { + winner = best_nir_fs; + loser = tgsi_ok ? 
&tgsi_fs : NULL; + } else { + winner = &tgsi_fs; + if (best_nir_fs) { + nir_loser_copy = *best_nir_fs; + nir_loser_copy.program = NULL; + loser = &nir_loser_copy; + FREE(best_nir_fs->program); + ralloc_free(best_nir_fs->error); + } + } + + if (i915 && !ifs->internal) { + bool neither = (winner->nr_alu_insn + winner->nr_tex_insn) == 0; + char reason[32]; + if (neither) + snprintf(reason, sizeof(reason), "neither"); + else + corm_win_reason(winner, loser, reason, sizeof(reason)); + util_debug_message( + &i915->debug, SHADER_INFO, + "%s shader [%s, %s]: %d instructions, %d alu, %d tex, " + "%d tex_indirect, %d temps, %d const", + _mesa_shader_stage_to_abbrev(MESA_SHADER_FRAGMENT), + neither ? "FAIL" : use_nir ? "NIR" : "TGSI", reason, + winner->nr_alu_insn + winner->nr_tex_insn, + winner->nr_alu_insn, winner->nr_tex_insn, winner->nr_tex_indirect, + winner->nr_temps, winner->num_constants); + } + + ifs->program = winner->program; + ifs->program_len = winner->program_len; + ifs->nr_alu_insn = winner->nr_alu_insn; + ifs->nr_tex_insn = winner->nr_tex_insn; + ifs->nr_tex_indirect = winner->nr_tex_indirect; + ifs->nr_temps = winner->nr_temps; + ifs->num_constants = winner->num_constants; + memcpy(ifs->constants, winner->constants, sizeof(ifs->constants)); + memcpy(ifs->constant_flags, winner->constant_flags, + sizeof(ifs->constant_flags)); + memcpy(ifs->texcoords, winner->texcoords, sizeof(ifs->texcoords)); + ifs->reads_pntc = winner->reads_pntc; + ifs->writes_z = winner->writes_z; + ifs->num_inputs = winner->num_inputs; + memcpy(ifs->input_semantic_name, winner->input_semantic_name, + sizeof(ifs->input_semantic_name)); + memcpy(ifs->input_semantic_index, winner->input_semantic_index, + sizeof(ifs->input_semantic_index)); + if (winner->error) + ifs->error = winner->error; + + /* The loser's info may be in use (TGSI path populates ifs->info) */ + if (try_tgsi) + ifs->info = tgsi_fs.info; + + if (loser) { + FREE(loser->program); + ralloc_free(loser->error); + } + if (!use_nir 
&& try_tgsi) { + /* TGSI won — tokens are in tgsi_fs via i915_compile_tgsi. + * We need them for ifs->state for draw's FS pipeline. */ + ifs->state = tgsi_fs.state; + } else if (try_tgsi) { + FREE((void *)tgsi_fs.state.tokens); + } - /* The shader's compiled to i915 instructions here */ - i915_translate_fragment_program(i915, ifs); if (ifs->error && templ->report_compile_error) { ((struct pipe_shader_state *)templ)->error_message = strdup(ifs->error); i915_delete_fs_state(NULL, ifs); @@ -667,28 +971,11 @@ i915_create_vs_state(struct pipe_context *pipe, const struct pipe_shader_state *templ) { struct i915_context *i915 = i915_context(pipe); - void *vertex_shader; - struct pipe_shader_state from_nir = {PIPE_SHADER_IR_TGSI}; - if (templ->type == PIPE_SHADER_IR_NIR) { - nir_shader *s = templ->ir.nir; + if (templ->type == PIPE_SHADER_IR_NIR) + NIR_PASS(_, templ->ir.nir, nir_lower_point_size, 1.0, 255.0); - NIR_PASS(_, s, nir_lower_point_size, 1.0, 255.0); - - /* The gallivm draw path doesn't support non-native-integers NIR shaders, - * st/mesa does native-integers for the screen as a whole rather than - * per-stage, and i915 FS can't do native integers. So, convert to TGSI, - * where the draw path *does* support non-native-integers. 
- */ - from_nir.tokens = nir_to_tgsi(s, pipe->screen); - templ = &from_nir; - } - - vertex_shader = draw_create_vertex_shader(i915->draw, templ); - - FREE((void *)from_nir.tokens); - - return vertex_shader; + return draw_create_vertex_shader(i915->draw, templ); } static void diff --git a/src/gallium/drivers/i915/i915_state_emit.c b/src/gallium/drivers/i915/i915_state_emit.c index f3561b143e8..8a92d6d0a7b 100644 --- a/src/gallium/drivers/i915/i915_state_emit.c +++ b/src/gallium/drivers/i915/i915_state_emit.c @@ -332,28 +332,33 @@ emit_constants(struct i915_context *i915) OUT_BATCH((1 << nr) - 1); for (i = 0; i < nr; i++) { - const uint32_t *c; - if (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER) { - /* grab user-defined constant */ - c = (uint32_t *)i915_buffer(i915->constants[MESA_SHADER_FRAGMENT]) - ->data; + uint8_t flags = i915->fs->constant_flags[i]; + uint8_t user_mask = flags >> 4; + + if (!user_mask) { + const uint32_t *c = (uint32_t *)i915->fs->constants[i]; + OUT_BATCH(c[0]); + OUT_BATCH(c[1]); + OUT_BATCH(c[2]); + OUT_BATCH(c[3]); + } else if (user_mask == 0xf) { + const uint32_t *c = + (uint32_t *)i915_buffer(i915->constants[MESA_SHADER_FRAGMENT]) + ->data; c += 4 * i; + OUT_BATCH(c[0]); + OUT_BATCH(c[1]); + OUT_BATCH(c[2]); + OUT_BATCH(c[3]); } else { - /* emit program constant */ - c = (uint32_t *)i915->fs->constants[i]; + const uint32_t *user = + (uint32_t *)i915_buffer(i915->constants[MESA_SHADER_FRAGMENT]) + ->data; + user += 4 * i; + const uint32_t *imm = (uint32_t *)i915->fs->constants[i]; + for (unsigned ch = 0; ch < 4; ch++) + OUT_BATCH((user_mask & (1 << ch)) ? user[ch] : imm[ch]); } -#if 0 /* debug */ - { - float *f = (float *) c; - printf("Const %2d: %f %f %f %f %s\n", i, f[0], f[1], f[2], f[3], - (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER - ? 
"user" : "immediate")); - } -#endif - OUT_BATCH(*c++); - OUT_BATCH(*c++); - OUT_BATCH(*c++); - OUT_BATCH(*c++); } } } diff --git a/src/gallium/drivers/i915/meson.build b/src/gallium/drivers/i915/meson.build index 80dc825fbc5..ef1d5f7ad34 100644 --- a/src/gallium/drivers/i915/meson.build +++ b/src/gallium/drivers/i915/meson.build @@ -16,6 +16,7 @@ files_i915 = files( 'i915_flush.c', 'i915_fpc_emit.c', 'i915_fpc.h', + 'i915_fpc_nir.c', 'i915_fpc_optimize.c', 'i915_fpc_translate.c', 'i915_prim_emit.c',