From 3d3b5577804919ed1cb4db2a7bba89edf4486a1b Mon Sep 17 00:00:00 2001 From: Adam Jackson Date: Wed, 6 May 2026 12:45:51 -0400 Subject: [PATCH] i915/corm: add NIR fragment shader backend Bare-minimum NIR-to-i915 fragment shader compiler with multi-variant framework, lexicographic cost metric (tex_indirect > ALU > temps > consts), and winner-tagged stats output. Stats are emitted once per shader with [NIR] or [TGSI] tag indicating which backend won. The corm_compile_opts struct is available for multi-variant compilation (its options are not yet consulted by the NIR backend). Assisted-by: Claude shader-db (I915_FS=nir): 48/403 compiled, 65 alu shader-db (I915_FS=both): nir won 48 (26 identical, 16 tied, 6 better), 236 TGSI, 119 neither --- src/gallium/drivers/i915/i915_fpc_nir.c | 821 ++++++++++++++++++++++++ src/gallium/drivers/i915/i915_screen.c | 7 + src/gallium/drivers/i915/i915_state.c | 328 +++++++++- src/gallium/drivers/i915/meson.build | 1 + 4 files changed, 1130 insertions(+), 27 deletions(-) create mode 100644 src/gallium/drivers/i915/i915_fpc_nir.c diff --git a/src/gallium/drivers/i915/i915_fpc_nir.c b/src/gallium/drivers/i915/i915_fpc_nir.c new file mode 100644 index 00000000000..346e06d0a34 --- /dev/null +++ b/src/gallium/drivers/i915/i915_fpc_nir.c @@ -0,0 +1,821 @@ +/* + * Copyright 2025 Red Hat, Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#include "compiler/nir/nir.h" +#include "tgsi/tgsi_from_mesa.h" +#include "util/log.h" +#include "util/ralloc.h" +#include "util/u_memory.h" + +#include "i915_context.h" +#include "i915_debug.h" +#include "i915_debug_private.h" +#include "i915_fpc.h" +#include "i915_reg.h" + +struct nir_to_i915 { + struct corm_compile_opts opts; + struct i915_fp_compile *p; + struct i915_fragment_shader *ifs; + + uint32_t *ureg_map; + unsigned ureg_map_size; +}; + +static void +set_ureg(struct nir_to_i915 *c, nir_def *def, uint32_t ureg) +{ + assert(def->index < c->ureg_map_size); + c->ureg_map[def->index] = ureg; +} + +static uint32_t +src_ureg(struct nir_to_i915 *c, nir_src *src) +{ + assert(src->ssa->index < c->ureg_map_size); + return c->ureg_map[src->ssa->index]; +} + +static uint32_t +alu_src_ureg(struct nir_to_i915 *c, nir_alu_src *src) +{ + uint32_t ureg = src_ureg(c, &src->src); + return swizzle(ureg, + src->swizzle[0], src->swizzle[1], + src->swizzle[2], src->swizzle[3]); +} + +static uint32_t +def_mask(nir_def *def) +{ + uint32_t mask = 0; + if (def->num_components >= 1) mask |= A0_DEST_CHANNEL_X; + if (def->num_components >= 2) mask |= A0_DEST_CHANNEL_Y; + if (def->num_components >= 3) mask |= A0_DEST_CHANNEL_Z; + if (def->num_components >= 4) mask |= A0_DEST_CHANNEL_W; + return mask; +} + +static uint32_t +writemask_to_mask(unsigned wm) +{ + uint32_t mask = 0; + if (wm & 1) mask |= A0_DEST_CHANNEL_X; + if (wm & 2) mask |= A0_DEST_CHANNEL_Y; + if (wm & 4) mask |= A0_DEST_CHANNEL_Z; + if (wm & 8) mask |= A0_DEST_CHANNEL_W; + return mask; +} + +static uint32_t +get_texcoord_mapping(struct i915_fragment_shader *fs, + unsigned semantic, int index) +{ + for (int i = 0; i < I915_TEX_UNITS; i++) { + if (fs->texcoords[i].semantic == -1) { + fs->texcoords[i].semantic = semantic; + fs->texcoords[i].index = index; + return i; + } + if (fs->texcoords[i].semantic == (int)semantic && + fs->texcoords[i].index == index) + return i; + } + return 
0; +} + +static uint32_t +emit_input(struct nir_to_i915 *c, unsigned location) +{ + struct i915_fp_compile *p = c->p; + struct i915_fragment_shader *ifs = c->ifs; + unsigned sem_name, sem_index; + + tgsi_get_gl_varying_semantic((gl_varying_slot)location, true, + &sem_name, &sem_index); + + switch (sem_name) { + case TGSI_SEMANTIC_GENERIC: + case TGSI_SEMANTIC_TEXCOORD: + case TGSI_SEMANTIC_PCOORD: + case TGSI_SEMANTIC_POSITION: { + if (sem_name == TGSI_SEMANTIC_PCOORD) + ifs->reads_pntc = true; + int tc = get_texcoord_mapping(ifs, sem_name, sem_index); + return i915_emit_decl(p, REG_TYPE_T, T_TEX0 + tc, D0_CHANNEL_ALL); + } + case TGSI_SEMANTIC_COLOR: + if (sem_index == 0) { + return i915_emit_decl(p, REG_TYPE_T, T_DIFFUSE, D0_CHANNEL_ALL); + } else { + return swizzle( + i915_emit_decl(p, REG_TYPE_T, T_SPECULAR, D0_CHANNEL_XYZ), + X, Y, Z, ONE); + } + case TGSI_SEMANTIC_FOG: + return swizzle( + i915_emit_decl(p, REG_TYPE_T, T_FOG_W, D0_CHANNEL_W), + W, W, W, W); + case TGSI_SEMANTIC_FACE: { + int tc = get_texcoord_mapping(ifs, sem_name, sem_index); + return i915_emit_decl(p, REG_TYPE_T, T_TEX0 + tc, D0_CHANNEL_X); + } + default: + i915_program_error(p, "Bad input location %d (semantic %d)", + location, sem_name); + return 0; + } +} + +static void +emit_load_const(struct nir_to_i915 *c, nir_load_const_instr *load) +{ + struct i915_fp_compile *p = c->p; + + switch (load->def.num_components) { + case 1: + set_ureg(c, &load->def, i915_emit_const1f(p, load->value[0].f32)); + break; + case 2: + set_ureg(c, &load->def, + i915_emit_const2f(p, load->value[0].f32, + load->value[1].f32)); + break; + case 3: + case 4: { + float v[4] = { + load->value[0].f32, + load->def.num_components > 1 ? load->value[1].f32 : 0.0f, + load->def.num_components > 2 ? load->value[2].f32 : 0.0f, + load->def.num_components > 3 ? 
load->value[3].f32 : 0.0f, + }; + set_ureg(c, &load->def, i915_emit_const4fv(p, v)); + break; + } + default: + i915_program_error(p, "load_const with %d components", + load->def.num_components); + break; + } +} + +static void +emit_alu(struct nir_to_i915 *c, nir_alu_instr *alu) +{ + struct i915_fp_compile *p = c->p; + nir_def *def = &alu->def; + uint32_t mask = def_mask(def); + uint32_t dest = UREG(REG_TYPE_R, i915_get_temp(p)); + set_ureg(c, def, dest); + + uint32_t src0 = 0, src1 = 0, src2 = 0; + if (nir_op_infos[alu->op].num_inputs >= 1) + src0 = alu_src_ureg(c, &alu->src[0]); + if (nir_op_infos[alu->op].num_inputs >= 2) + src1 = alu_src_ureg(c, &alu->src[1]); + if (nir_op_infos[alu->op].num_inputs >= 3) + src2 = alu_src_ureg(c, &alu->src[2]); + + switch (alu->op) { + case nir_op_mov: + case nir_op_fcanonicalize: + case nir_op_fneg: { + i915_release_temp(p, GET_UREG_NR(dest)); + set_ureg(c, def, alu->op == nir_op_fneg ? negate(src0, 1, 1, 1, 1) + : src0); + return; + } + case nir_op_fabs: + i915_emit_arith(p, A0_MAX, dest, mask, 0, + src0, negate(src0, 1, 1, 1, 1), 0); + break; + case nir_op_fsat: + i915_emit_arith(p, A0_MOV, dest, mask, A0_DEST_SATURATE, src0, 0, 0); + break; + case nir_op_fadd: + i915_emit_arith(p, A0_ADD, dest, mask, 0, src0, src1, 0); + break; + case nir_op_fmul: + i915_emit_arith(p, A0_MUL, dest, mask, 0, src0, src1, 0); + break; + case nir_op_ffma: + i915_emit_arith(p, A0_MAD, dest, mask, 0, src0, src1, src2); + break; + case nir_op_fmin: + case nir_op_imin: + case nir_op_umin: + i915_emit_arith(p, A0_MIN, dest, mask, 0, src0, src1, 0); + break; + case nir_op_fmax: + case nir_op_imax: + case nir_op_umax: + i915_emit_arith(p, A0_MAX, dest, mask, 0, src0, src1, 0); + break; + case nir_op_ffloor: + i915_emit_arith(p, A0_FLR, dest, mask, 0, src0, 0, 0); + break; + case nir_op_ffract: + i915_emit_arith(p, A0_FRC, dest, mask, 0, src0, 0, 0); + break; + case nir_op_ftrunc: + i915_emit_arith(p, A0_TRC, dest, mask, 0, src0, 0, 0); + break; + case 
nir_op_fceil: { + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_FLR, tmp, mask, 0, + negate(src0, 1, 1, 1, 1), 0, 0); + i915_emit_arith(p, A0_MOV, dest, mask, 0, + negate(tmp, 1, 1, 1, 1), 0, 0); + break; + } + case nir_op_frcp: + i915_emit_arith(p, A0_RCP, dest, mask, 0, + swizzle(src0, X, X, X, X), 0, 0); + break; + case nir_op_frsq: + i915_emit_arith(p, A0_RSQ, dest, mask, 0, + swizzle(src0, X, X, X, X), 0, 0); + break; + case nir_op_fsqrt: { + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_RSQ, tmp, A0_DEST_CHANNEL_X, 0, + swizzle(src0, X, X, X, X), 0, 0); + i915_emit_arith(p, A0_MUL, dest, mask, 0, + src0, swizzle(tmp, X, X, X, X), 0); + break; + } + case nir_op_fexp2: + i915_emit_arith(p, A0_EXP, dest, mask, 0, + swizzle(src0, X, X, X, X), 0, 0); + break; + case nir_op_flog2: + i915_emit_arith(p, A0_LOG, dest, mask, 0, + swizzle(src0, X, X, X, X), 0, 0); + break; + case nir_op_fdot2: + case nir_op_fdot2_replicated: + i915_emit_arith(p, A0_DP3, dest, mask, 0, + swizzle(src0, X, Y, ZERO, ZERO), src1, 0); + break; + case nir_op_fdot3: + case nir_op_fdot3_replicated: + i915_emit_arith(p, A0_DP3, dest, mask, 0, src0, src1, 0); + break; + case nir_op_fdot4: + case nir_op_fdot4_replicated: + i915_emit_arith(p, A0_DP4, dest, mask, 0, src0, src1, 0); + break; + case nir_op_slt: + i915_emit_arith(p, A0_SLT, dest, mask, 0, src0, src1, 0); + break; + case nir_op_sge: + i915_emit_arith(p, A0_SGE, dest, mask, 0, src0, src1, 0); + break; + case nir_op_seq: { + /* seq(a,b) = sge(a,b) * sge(b,a) */ + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_SGE, tmp, A0_DEST_CHANNEL_ALL, 0, + src0, src1, 0); + i915_emit_arith(p, A0_SGE, dest, mask, 0, src1, src0, 0); + i915_emit_arith(p, A0_MUL, dest, mask, 0, dest, tmp, 0); + break; + } + case nir_op_sne: { + /* sne(a,b) = slt(a,b) + slt(b,a) */ + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0, + src0, src1, 0); + i915_emit_arith(p, A0_SLT, dest, mask, 0, 
src1, src0, 0); + i915_emit_arith(p, A0_ADD, dest, mask, 0, dest, tmp, 0); + break; + } + case nir_op_fpow: { + uint32_t tmp = i915_get_utemp(p); + i915_emit_arith(p, A0_LOG, tmp, A0_DEST_CHANNEL_X, 0, + swizzle(src0, X, X, X, X), 0, 0); + i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_X, 0, tmp, src1, 0); + i915_emit_arith(p, A0_EXP, dest, mask, 0, + swizzle(tmp, X, X, X, X), 0, 0); + break; + } + case nir_op_bcsel: + i915_emit_arith(p, A0_CMP, dest, mask, 0, + negate(src0, 1, 1, 1, 1), src2, src1); + break; + case nir_op_fcsel_ge: + i915_emit_arith(p, A0_CMP, dest, mask, 0, src0, src1, src2); + break; + case nir_op_fcsel_gt: + i915_emit_arith(p, A0_CMP, dest, mask, 0, + negate(src0, 1, 1, 1, 1), src2, src1); + break; + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: { + unsigned n = nir_op_infos[alu->op].num_inputs; + static const uint32_t chan_mask[] = { + A0_DEST_CHANNEL_X, A0_DEST_CHANNEL_Y, + A0_DEST_CHANNEL_Z, A0_DEST_CHANNEL_W, + }; + for (unsigned i = 0; i < n; i++) { + uint32_t s = alu_src_ureg(c, &alu->src[i]); + i915_emit_arith(p, A0_MOV, dest, chan_mask[i] & mask, 0, + swizzle(s, X, X, X, X), 0, 0); + } + break; + } + case nir_op_fsign: { + uint32_t tmp = i915_get_utemp(p); + const uint32_t zero = swizzle(UREG(REG_TYPE_R, 0), + ZERO, ZERO, ZERO, ZERO); + i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0, + src0, zero, 0); + i915_emit_arith(p, A0_SLT, dest, mask, 0, zero, src0, 0); + i915_emit_arith(p, A0_ADD, dest, mask, 0, + dest, negate(tmp, 1, 1, 1, 1), 0); + break; + } + default: + i915_program_error(p, "unsupported NIR ALU op: %s", + nir_op_infos[alu->op].name); + break; + } + + i915_release_utemps(p); +} + +static uint32_t +translate_tex_type(struct i915_fp_compile *p, enum glsl_sampler_dim dim) +{ + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_EXTERNAL: + return D0_SAMPLE_TYPE_2D; + case GLSL_SAMPLER_DIM_3D: + return D0_SAMPLE_TYPE_VOLUME; + case 
GLSL_SAMPLER_DIM_CUBE: + return D0_SAMPLE_TYPE_CUBE; + default: + i915_program_error(p, "unsupported sampler dim %d", dim); + return D0_SAMPLE_TYPE_2D; + } +} + +static uint32_t +tex_coord_mask(nir_tex_instr *tex) +{ + uint32_t mask = TGSI_WRITEMASK_X; + + switch (tex->sampler_dim) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_EXTERNAL: + mask = TGSI_WRITEMASK_XY; + break; + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + mask = TGSI_WRITEMASK_XYZ; + break; + default: + break; + } + + if (tex->is_shadow) + mask |= TGSI_WRITEMASK_Z; + + if (tex->op == nir_texop_txb) + mask |= TGSI_WRITEMASK_W; + + for (unsigned i = 0; i < tex->num_srcs; i++) { + if (tex->src[i].src_type == nir_tex_src_projector) { + mask |= TGSI_WRITEMASK_W; + break; + } + } + + return mask; +} + +static void +emit_tex(struct nir_to_i915 *c, nir_tex_instr *tex) +{ + struct i915_fp_compile *p = c->p; + nir_def *def = &tex->def; + uint32_t dest = UREG(REG_TYPE_R, i915_get_temp(p)); + set_ureg(c, def, dest); + + uint32_t hw_tex = translate_tex_type(p, tex->sampler_dim); + uint32_t sampler = i915_emit_decl(p, REG_TYPE_S, tex->sampler_index, hw_tex); + + uint32_t coord = 0; + uint32_t bias_or_proj = 0; + uint32_t shadow = 0; + bool has_bias = false, has_proj = false, has_shadow = false; + + for (unsigned i = 0; i < tex->num_srcs; i++) { + switch (tex->src[i].src_type) { + case nir_tex_src_coord: + coord = src_ureg(c, &tex->src[i].src); + break; + case nir_tex_src_bias: + bias_or_proj = src_ureg(c, &tex->src[i].src); + has_bias = true; + break; + case nir_tex_src_projector: + bias_or_proj = src_ureg(c, &tex->src[i].src); + has_proj = true; + break; + case nir_tex_src_comparator: + shadow = src_ureg(c, &tex->src[i].src); + has_shadow = true; + break; + default: + break; + } + } + + /* 1D textures: set Y = X so LOD works correctly when sampled as 2D */ + if (tex->sampler_dim == GLSL_SAMPLER_DIM_1D) + coord = swizzle(coord, X, X, Z, 
W); + + /* pack bias/projector/shadow into a single coord register if needed */ + if (has_bias || has_proj || has_shadow) { + uint32_t tmp = UREG(REG_TYPE_R, i915_get_temp(p)); + + i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_ALL, 0, coord, 0, 0); + + if (has_shadow) + i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_Z, 0, + swizzle(shadow, X, X, X, X), 0, 0); + + if (has_bias || has_proj) + i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_W, 0, + swizzle(bias_or_proj, X, X, X, X), 0, 0); + + coord = tmp; + } + + uint32_t opcode; + if (tex->op == nir_texop_txb) { + opcode = T0_TEXLDB; + } else if (has_proj) { + opcode = T0_TEXLDP; + } else if (tex->op == nir_texop_tex) { + opcode = T0_TEXLD; + } else { + i915_program_error(p, "unsupported tex op %d", tex->op); + return; + } + + i915_emit_texld(p, dest, A0_DEST_CHANNEL_ALL, sampler, coord, opcode, + tex_coord_mask(tex)); + + i915_release_utemps(p); +} + +static void +emit_intrinsic(struct nir_to_i915 *c, nir_intrinsic_instr *intr) +{ + struct i915_fp_compile *p = c->p; + struct i915_fragment_shader *ifs = c->ifs; + + switch (intr->intrinsic) { + case nir_intrinsic_load_input: { + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + unsigned comp = nir_intrinsic_component(intr); + uint32_t reg = emit_input(c, sem.location); + + if (comp > 0) { + reg = swizzle(reg, comp, MIN2(comp + 1, 3), + MIN2(comp + 2, 3), MIN2(comp + 3, 3)); + } + + set_ureg(c, &intr->def, reg); + break; + } + + case nir_intrinsic_store_output: { + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + unsigned comp = nir_intrinsic_component(intr); + uint32_t val = src_ureg(c, &intr->src[0]); + uint32_t wm = nir_intrinsic_write_mask(intr); + uint32_t dest; + + if (sem.location == FRAG_RESULT_DEPTH) { + dest = UREG(REG_TYPE_OD, 0); + } else { + dest = UREG(REG_TYPE_OC, 0); + } + + if (comp > 0) { + uint32_t s[4] = { X, Y, Z, W }; + for (int i = 3; i >= (int)comp; i--) + s[i] = s[i - comp]; + for (unsigned i = 0; i < comp; i++) + s[i] = 
ZERO; + val = swizzle(val, s[0], s[1], s[2], s[3]); + wm <<= comp; + } + + i915_emit_arith(p, A0_MOV, dest, writemask_to_mask(wm), 0, + val, 0, 0); + break; + } + + case nir_intrinsic_load_ubo: { + nir_src *offset_src = &intr->src[1]; + if (!nir_src_is_const(*offset_src)) { + i915_program_error(p, "non-constant UBO offset"); + set_ureg(c, &intr->def, + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO)); + break; + } + unsigned byte_offset = (unsigned)nir_src_as_float(*offset_src); + unsigned slot = byte_offset / 16; + unsigned comp = (byte_offset % 16) / 4; + + if (slot >= I915_MAX_CONSTANT) { + i915_program_error(p, "UBO offset %d exceeds max constants", slot); + set_ureg(c, &intr->def, + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO)); + break; + } + + for (unsigned i = 0; i < intr->def.num_components; i++) + ifs->constant_flags[slot] |= I915_CONSTFLAG_USER; + ifs->num_constants = MAX2(ifs->num_constants, slot + 1); + + uint32_t reg = UREG(REG_TYPE_CONST, slot); + if (comp > 0) { + uint32_t s[4]; + for (unsigned i = 0; i < 4; i++) + s[i] = MIN2(comp + i, 3); + reg = swizzle(reg, s[0], s[1], s[2], s[3]); + } + + set_ureg(c, &intr->def, reg); + break; + } + + case nir_intrinsic_load_ubo_vec4: { + nir_src *offset_src = &intr->src[1]; + if (!nir_src_is_const(*offset_src)) { + i915_program_error(p, "non-constant UBO offset"); + set_ureg(c, &intr->def, + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO)); + break; + } + unsigned slot = nir_intrinsic_base(intr) + + (unsigned)nir_src_as_float(*offset_src); + unsigned comp = nir_intrinsic_component(intr); + + if (slot >= I915_MAX_CONSTANT) { + i915_program_error(p, "UBO slot %d exceeds max constants", slot); + set_ureg(c, &intr->def, + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO)); + break; + } + + for (unsigned i = 0; i < intr->def.num_components; i++) + ifs->constant_flags[slot] |= I915_CONSTFLAG_USER; + ifs->num_constants = MAX2(ifs->num_constants, slot + 1); + + uint32_t reg = UREG(REG_TYPE_CONST, 
slot); + if (comp > 0) { + uint32_t s[4]; + for (unsigned i = 0; i < 4; i++) + s[i] = MIN2(comp + i, 3); + reg = swizzle(reg, s[0], s[1], s[2], s[3]); + } + + set_ureg(c, &intr->def, reg); + break; + } + + case nir_intrinsic_terminate: + case nir_intrinsic_demote: { + uint32_t tmp = i915_get_utemp(p); + i915_emit_texld(p, tmp, A0_DEST_CHANNEL_ALL, 0, + negate(swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE), + 1, 1, 1, 1), + T0_TEXKILL, TGSI_WRITEMASK_X); + i915_release_utemps(p); + break; + } + + case nir_intrinsic_terminate_if: + case nir_intrinsic_demote_if: { + uint32_t cond = src_ureg(c, &intr->src[0]); + uint32_t tmp = i915_get_utemp(p); + i915_emit_texld(p, tmp, A0_DEST_CHANNEL_ALL, 0, + negate(swizzle(cond, X, X, X, X), 1, 1, 1, 1), + T0_TEXKILL, TGSI_WRITEMASK_XYZW); + i915_release_utemps(p); + break; + } + + case nir_intrinsic_ddx: + case nir_intrinsic_ddy: + case nir_intrinsic_ddx_coarse: + case nir_intrinsic_ddy_coarse: + case nir_intrinsic_ddx_fine: + case nir_intrinsic_ddy_fine: + set_ureg(c, &intr->def, + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO)); + break; + + default: + i915_program_error(p, "unsupported intrinsic: %s", + nir_intrinsic_infos[intr->intrinsic].name); + break; + } +} + +static void +emit_instr(struct nir_to_i915 *c, nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_load_const: + emit_load_const(c, nir_instr_as_load_const(instr)); + break; + case nir_instr_type_alu: + emit_alu(c, nir_instr_as_alu(instr)); + break; + case nir_instr_type_tex: + emit_tex(c, nir_instr_as_tex(instr)); + break; + case nir_instr_type_intrinsic: + emit_intrinsic(c, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_undef: { + nir_undef_instr *undef = nir_instr_as_undef(instr); + set_ureg(c, &undef->def, + swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO)); + break; + } + case nir_instr_type_jump: + case nir_instr_type_deref: + break; + default: + i915_program_error(c->p, "unsupported NIR instruction type %d", + 
instr->type); + break; + } +} + +static void +fixup_depth_write(struct nir_to_i915 *c, nir_shader *s) +{ + if (!(s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))) + return; + + /* NIR writes depth to OD.X (component 0); hardware reads from OD.W */ + i915_emit_arith(c->p, A0_MOV, + UREG(REG_TYPE_OD, 0), A0_DEST_CHANNEL_W, 0, + swizzle(UREG(REG_TYPE_OD, 0), X, Y, Z, X), + 0, 0); +} + +void +i915_translate_fragment_program_nir(struct i915_context *i915, + struct i915_fragment_shader *ifs, + nir_shader *s, + const struct corm_compile_opts *opts) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(s); + bool debug = I915_DBG_ON(DBG_FS) && + (!ifs->internal || NIR_DEBUG(PRINT_INTERNAL)); + + if (debug) { + mesa_logi("NIR fragment shader:"); + nir_log_shaderi(s); + } + + struct i915_fp_compile *p = CALLOC_STRUCT(i915_fp_compile); + p->shader = ifs; + p->error = ralloc_strdup(NULL, ""); + p->log_program_errors = !ifs->internal; + + ifs->num_constants = 0; + memset(ifs->constant_flags, 0, sizeof(ifs->constant_flags)); + memset(p->register_phases, 0, sizeof(p->register_phases)); + + for (int i = 0; i < I915_TEX_UNITS; i++) + ifs->texcoords[i].semantic = -1; + + p->nr_tex_indirect = 1; + p->nr_tex_insn = 0; + p->nr_alu_insn = 0; + p->nr_decl_insn = 0; + p->csr = p->program; + p->decl = p->declarations; + p->decl_s = 0; + p->decl_t = 0; + p->temp_flag = ~0x0U << I915_MAX_TEMPORARY; + p->utemp_flag = ~0x7; + + *(p->decl++) = _3DSTATE_PIXEL_SHADER_PROGRAM; + + struct nir_to_i915 c = { + .p = p, + .ifs = ifs, + .opts = *opts, + .ureg_map_size = impl->ssa_alloc, + .ureg_map = CALLOC(impl->ssa_alloc, sizeof(uint32_t)), + }; + + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + emit_instr(&c, instr); + if (p->error[0]) + break; + } + if (p->error[0]) + break; + } + + if (!p->error[0]) + fixup_depth_write(&c, s); + + /* finalize */ + if (p->nr_tex_indirect > I915_MAX_TEX_INDIRECT) + i915_program_error(p, "exceeded max tex indirect (%d/%d)", + 
p->nr_tex_indirect, I915_MAX_TEX_INDIRECT); + if (p->nr_tex_insn > I915_MAX_TEX_INSN) + i915_program_error(p, "exceeded max tex insn (%d/%d)", + p->nr_tex_insn, I915_MAX_TEX_INSN); + if (p->nr_alu_insn > I915_MAX_ALU_INSN) + i915_program_error(p, "exceeded max ALU insn (%d/%d)", + p->nr_alu_insn, I915_MAX_ALU_INSN); + if (p->nr_decl_insn > I915_MAX_DECL_INSN) + i915_program_error(p, "exceeded max decl insn (%d/%d)", + p->nr_decl_insn, I915_MAX_DECL_INSN); + + if (p->nr_alu_insn == 0 && p->nr_tex_insn == 0) { + i915_use_passthrough_shader(ifs); + ifs->nr_alu_insn = 1; + goto cleanup; + } + + ifs->nr_alu_insn = p->nr_alu_insn; + ifs->nr_tex_insn = p->nr_tex_insn; + ifs->nr_tex_indirect = p->nr_tex_indirect; + ifs->nr_temps = util_bitcount(p->temp_flag); + + { + unsigned long program_size = (unsigned long)(p->csr - p->program); + unsigned long decl_size = (unsigned long)(p->decl - p->declarations); + + p->declarations[0] |= program_size + decl_size - 2; + + assert(!ifs->program); + ifs->program_len = decl_size + program_size; + ifs->program = (uint32_t *)MALLOC(ifs->program_len * sizeof(uint32_t)); + memcpy(ifs->program, p->declarations, decl_size * sizeof(uint32_t)); + memcpy(&ifs->program[decl_size], p->program, + program_size * sizeof(uint32_t)); + + if (p->error[0]) { + /* dump the program for debugging, then replace with passthrough */ + if (debug && ifs->program_len > 2) { + mesa_logi("FAILED program (%d ALU):", p->nr_alu_insn); + i915_disassemble_program(ifs->program, ifs->program_len); + } + FREE(ifs->program); + ifs->program = NULL; + ifs->program_len = 0; + i915_use_passthrough_shader(ifs); + } + } + +cleanup: + if (p->error[0]) + ifs->error = p->error; + else + ralloc_free(p->error); + + FREE(c.ureg_map); + FREE(p); + + if (debug) { + if (ifs->error) + mesa_loge("%s", ifs->error); + + mesa_logi("i915 fragment shader with %d constants%s", + ifs->num_constants, ifs->num_constants ? 
":" : ""); + + for (int i = 0; i < I915_MAX_CONSTANT; i++) { + if (ifs->constant_flags[i] & 0x0f) { + mesa_logi("\t\tC[%d] = { %f, %f, %f, %f }", i, + ifs->constants[i][0], ifs->constants[i][1], + ifs->constants[i][2], ifs->constants[i][3]); + } + } + i915_disassemble_program(ifs->program, ifs->program_len); + } +} diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index 17db0d34034..df43fb05149 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -176,6 +176,8 @@ i915_optimize_nir(struct nir_shader *s) { bool progress; + NIR_PASS(_, s, nir_lower_int_to_float); + do { progress = false; @@ -212,6 +214,11 @@ i915_optimize_nir(struct nir_shader *s) } while (progress); + NIR_PASS(_, s, nir_lower_alu_to_scalar, NULL, NULL); + NIR_PASS(_, s, nir_lower_bool_to_float, false); + NIR_PASS(_, s, nir_opt_algebraic); + NIR_PASS(_, s, nir_opt_dce); + NIR_PASS(progress, s, nir_remove_dead_variables, nir_var_function_temp, NULL); diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c index af45247355d..12da6b72266 100644 --- a/src/gallium/drivers/i915/i915_state.c +++ b/src/gallium/drivers/i915/i915_state.c @@ -31,7 +31,9 @@ #include "compiler/nir/nir_builder.h" #include "draw/draw_context.h" #include "nir/nir_to_tgsi.h" +#include "tgsi/tgsi_from_mesa.h" #include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_scan.h" #include "util/u_helpers.h" #include "util/u_inlines.h" #include "util/u_math.h" @@ -542,6 +544,23 @@ static const struct nir_to_tgsi_options ntt_options = { .lower_fabs = true, }; +static int +type_size(const struct glsl_type *type, bool bindless) +{ + return glsl_count_attribute_slots(type, false); +} + +static bool +scalarize_vector_bools(const nir_instr *instr, const void *data) +{ + if (instr->type != nir_instr_type_alu) + return false; + nir_alu_instr *alu = nir_instr_as_alu(instr); + return alu->op == nir_op_bcsel || + alu->op == 
nir_op_fcsel_ge || + alu->op == nir_op_fcsel_gt; +} + static char * i915_check_control_flow(nir_shader *s) { @@ -565,6 +584,94 @@ i915_check_control_flow(nir_shader *s) return NULL; } +enum i915_fs_mode { + I915_FS_TGSI, + I915_FS_NIR, + I915_FS_BOTH, +}; + +static enum i915_fs_mode +i915_get_fs_mode(void) +{ + const char *env = debug_get_option("I915_FS", "both"); + if (!strcmp(env, "tgsi")) + return I915_FS_TGSI; + if (!strcmp(env, "nir")) + return I915_FS_NIR; + return I915_FS_BOTH; +} + +static void +i915_populate_fs_metadata(struct i915_fragment_shader *ifs, nir_shader *s) +{ + ifs->num_inputs = 0; + ifs->writes_z = s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH); + + nir_foreach_shader_in_variable(var, s) { + unsigned sem_name, sem_index; + tgsi_get_gl_varying_semantic((gl_varying_slot)var->data.location, true, + &sem_name, &sem_index); + unsigned idx = ifs->num_inputs++; + ifs->input_semantic_name[idx] = sem_name; + ifs->input_semantic_index[idx] = sem_index; + } +} + +static void +i915_compile_tgsi(struct i915_context *i915, + struct i915_fragment_shader *ifs, + struct pipe_screen *screen, + nir_shader *nir_clone) +{ + ifs->state.tokens = nir_to_tgsi_options(nir_clone, screen, &ntt_options); + ifs->state.type = PIPE_SHADER_IR_TGSI; + tgsi_scan_shader(ifs->state.tokens, &ifs->info); + i915_translate_fragment_program(i915, ifs); +} + +static bool +corm_fs_better(const struct i915_fragment_shader *a, + const struct i915_fragment_shader *b) +{ + if (a->nr_tex_indirect != b->nr_tex_indirect) + return a->nr_tex_indirect < b->nr_tex_indirect; + if (a->nr_alu_insn != b->nr_alu_insn) + return a->nr_alu_insn < b->nr_alu_insn; + if (a->nr_temps != b->nr_temps) + return a->nr_temps < b->nr_temps; + return a->num_constants < b->num_constants; +} + +static const char * +corm_win_reason(const struct i915_fragment_shader *winner, + const struct i915_fragment_shader *loser, + char *buf, size_t len) +{ + if (!loser) { + snprintf(buf, len, "only"); + return buf; + 
} + int da = (int)winner->nr_alu_insn - (int)loser->nr_alu_insn; + int dp = (int)winner->nr_tex_indirect - (int)loser->nr_tex_indirect; + int dt = (int)winner->nr_temps - (int)loser->nr_temps; + if (dp != 0) + snprintf(buf, len, "%+d phase", dp); + else if (da != 0) + snprintf(buf, len, "%+d alu", da); + else if (dt != 0) + snprintf(buf, len, "%+d temps", dt); + else if ((int)winner->num_constants != (int)loser->num_constants) + snprintf(buf, len, "%+d const", + (int)winner->num_constants - (int)loser->num_constants); + else if (winner->program_len == loser->program_len && + !memcmp(winner->program, loser->program, + winner->program_len * sizeof(uint32_t))) + snprintf(buf, len, "identical"); + else + snprintf(buf, len, "tied"); + return buf; +} + static void * i915_create_fs_state(struct pipe_context *pipe, const struct pipe_shader_state *templ) @@ -576,39 +683,206 @@ i915_create_fs_state(struct pipe_context *pipe, ifs->draw_data = draw_create_fragment_shader(i915->draw, templ); - if (templ->type == PIPE_SHADER_IR_NIR) { - nir_shader *s = templ->ir.nir; - ifs->internal = s->info.internal; - - char *msg = i915_check_control_flow(s); - if (msg) { - if (I915_DBG_ON(DBG_FS) && - (!s->info.internal || NIR_DEBUG(PRINT_INTERNAL))) { - mesa_logi("failing shader:"); - nir_log_shaderi(s); - } - if (templ->report_compile_error) { - ((struct pipe_shader_state *)templ)->error_message = strdup(msg); - ralloc_free(s); - i915_delete_fs_state(NULL, ifs); - return NULL; - } - } - - ifs->state.tokens = nir_to_tgsi_options(s, pipe->screen, &ntt_options); - } else { - assert(templ->type == PIPE_SHADER_IR_TGSI); - /* we need to keep a local copy of the tokens */ + if (templ->type == PIPE_SHADER_IR_TGSI) { ifs->state.tokens = tgsi_dup_tokens(templ->tokens); + ifs->state.type = PIPE_SHADER_IR_TGSI; ifs->internal = i915->no_log_program_errors; + tgsi_scan_shader(ifs->state.tokens, &ifs->info); + i915_translate_fragment_program(i915, ifs); + return ifs; } - ifs->state.type = 
PIPE_SHADER_IR_TGSI; + assert(templ->type == PIPE_SHADER_IR_NIR); + nir_shader *s = templ->ir.nir; + ifs->internal = s->info.internal; - tgsi_scan_shader(ifs->state.tokens, &ifs->info); + bool debug = I915_DBG_ON(DBG_FS) && + (!s->info.internal || NIR_DEBUG(PRINT_INTERNAL)); + + char *msg = i915_check_control_flow(s); + if (msg) { + if (debug) { + mesa_logi("failing shader:"); + nir_log_shaderi(s); + } + if (templ->report_compile_error) { + ((struct pipe_shader_state *)templ)->error_message = strdup(msg); + ralloc_free(s); + i915_delete_fs_state(NULL, ifs); + return NULL; + } + } + + static enum i915_fs_mode fs_mode = -1; + if (fs_mode == (enum i915_fs_mode)-1) + fs_mode = i915_get_fs_mode(); + + bool try_nir = (fs_mode == I915_FS_NIR || fs_mode == I915_FS_BOTH); + bool try_tgsi = (fs_mode == I915_FS_TGSI || fs_mode == I915_FS_BOTH); + + struct i915_fragment_shader tgsi_fs = {0}; + + static const struct corm_compile_opts corm_variants[] = { + { .deferred_const = false, .seq_sne_opt = false }, + { .deferred_const = false, .seq_sne_opt = true }, + { .deferred_const = true, .seq_sne_opt = false }, + { .deferred_const = true, .seq_sne_opt = true }, + }; + + struct i915_fragment_shader nir_results[ARRAY_SIZE(corm_variants)]; + int best_nir = -1; + + if (try_nir) { + nir_shader *nir_s = try_tgsi ? nir_shader_clone(NULL, s) : s; + NIR_PASS(_, nir_s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size, (nir_lower_io_options)0); + NIR_PASS(_, nir_s, nir_lower_alu_to_scalar, scalarize_vector_bools, NULL); + NIR_PASS(_, nir_s, nir_opt_vectorize, NULL, NULL); + NIR_PASS(_, nir_s, nir_lower_bool_to_float, false); + NIR_PASS(_, nir_s, nir_opt_algebraic); + NIR_PASS(_, nir_s, nir_opt_algebraic_late); + NIR_PASS(_, nir_s, nir_opt_dce); + nir_index_ssa_defs(nir_shader_get_entrypoint(nir_s)); + + for (unsigned v = 0; v < ARRAY_SIZE(corm_variants); v++) { + nir_shader *variant_nir = (v == ARRAY_SIZE(corm_variants) - 1) + ? 
nir_s : nir_shader_clone(NULL, nir_s); + memset(&nir_results[v], 0, sizeof(nir_results[v])); + i915_populate_fs_metadata(&nir_results[v], variant_nir); + i915_translate_fragment_program_nir(i915, &nir_results[v], + variant_nir, &corm_variants[v]); + if (v < ARRAY_SIZE(corm_variants) - 1) + ralloc_free(variant_nir); + + bool ok = !nir_results[v].error || !nir_results[v].error[0]; + if (ok && (best_nir < 0 || + corm_fs_better(&nir_results[v], &nir_results[best_nir]))) + best_nir = v; + } + + if (try_tgsi) + ralloc_free(nir_s); + } + + if (try_tgsi) { + i915_compile_tgsi(i915, &tgsi_fs, pipe->screen, s); + } else { + ralloc_free(s); + } + + bool nir_ok = best_nir >= 0; + bool tgsi_ok = try_tgsi && (!tgsi_fs.error || !tgsi_fs.error[0]); + struct i915_fragment_shader *best_nir_fs = nir_ok ? &nir_results[best_nir] : NULL; + + bool use_nir; + if (nir_ok && tgsi_ok) + use_nir = !corm_fs_better(&tgsi_fs, best_nir_fs); + else + use_nir = nir_ok; + + if (debug && try_nir && try_tgsi) { + for (unsigned v = 0; v < ARRAY_SIZE(corm_variants); v++) { + bool ok = !nir_results[v].error || !nir_results[v].error[0]; + mesa_logi(" NIR[dc=%d,ss=%d]: %s (%d ALU, %d phase, %d temps)%s", + corm_variants[v].deferred_const, + corm_variants[v].seq_sne_opt, + ok ? "ok" : "FAIL", + ok ? nir_results[v].nr_alu_insn : 0, + ok ? nir_results[v].nr_tex_indirect : 0, + ok ? nir_results[v].nr_temps : 0, + (int)v == best_nir ? " *" : ""); + } + mesa_logi(" TGSI: %s (%d ALU, %d phase, %d temps)", + tgsi_ok ? "ok" : "FAIL", + tgsi_ok ? tgsi_fs.nr_alu_insn : 0, + tgsi_ok ? tgsi_fs.nr_tex_indirect : 0, + tgsi_ok ? tgsi_fs.nr_temps : 0); + mesa_logi(" -> %s%s", use_nir ? "NIR" : "TGSI", + use_nir ? (corm_fs_better(best_nir_fs, &tgsi_fs) + ? 
" (better)" : " (tied)") : ""); + } + + /* Free non-winning NIR variants */ + if (try_nir) { + for (unsigned v = 0; v < ARRAY_SIZE(corm_variants); v++) { + if ((int)v != best_nir) { + FREE(nir_results[v].program); + ralloc_free(nir_results[v].error); + } + } + } + + struct i915_fragment_shader *winner, *loser = NULL; + struct i915_fragment_shader nir_loser_copy = {0}; + if (use_nir) { + winner = best_nir_fs; + loser = tgsi_ok ? &tgsi_fs : NULL; + } else { + winner = &tgsi_fs; + if (best_nir_fs) { + nir_loser_copy = *best_nir_fs; + nir_loser_copy.program = NULL; + loser = &nir_loser_copy; + FREE(best_nir_fs->program); + ralloc_free(best_nir_fs->error); + } + } + + if (i915 && !ifs->internal) { + bool neither = (winner->nr_alu_insn + winner->nr_tex_insn) == 0; + char reason[32]; + if (neither) + snprintf(reason, sizeof(reason), "neither"); + else + corm_win_reason(winner, loser, reason, sizeof(reason)); + util_debug_message( + &i915->debug, SHADER_INFO, + "%s shader [%s, %s]: %d instructions, %d alu, %d tex, " + "%d tex_indirect, %d temps, %d const", + _mesa_shader_stage_to_abbrev(MESA_SHADER_FRAGMENT), + neither ? "FAIL" : use_nir ? 
"NIR" : "TGSI", reason, + winner->nr_alu_insn + winner->nr_tex_insn, + winner->nr_alu_insn, winner->nr_tex_insn, winner->nr_tex_indirect, + winner->nr_temps, winner->num_constants); + } + + ifs->program = winner->program; + ifs->program_len = winner->program_len; + ifs->nr_alu_insn = winner->nr_alu_insn; + ifs->nr_tex_insn = winner->nr_tex_insn; + ifs->nr_tex_indirect = winner->nr_tex_indirect; + ifs->nr_temps = winner->nr_temps; + ifs->num_constants = winner->num_constants; + memcpy(ifs->constants, winner->constants, sizeof(ifs->constants)); + memcpy(ifs->constant_flags, winner->constant_flags, + sizeof(ifs->constant_flags)); + memcpy(ifs->texcoords, winner->texcoords, sizeof(ifs->texcoords)); + ifs->reads_pntc = winner->reads_pntc; + ifs->writes_z = winner->writes_z; + ifs->num_inputs = winner->num_inputs; + memcpy(ifs->input_semantic_name, winner->input_semantic_name, + sizeof(ifs->input_semantic_name)); + memcpy(ifs->input_semantic_index, winner->input_semantic_index, + sizeof(ifs->input_semantic_index)); + if (winner->error) + ifs->error = winner->error; + + /* The loser's info may be in use (TGSI path populates ifs->info) */ + if (try_tgsi) + ifs->info = tgsi_fs.info; + + if (loser) { + FREE(loser->program); + ralloc_free(loser->error); + } + if (!use_nir && try_tgsi) { + /* TGSI won — tokens are in tgsi_fs via i915_compile_tgsi. + * We need them for ifs->state for draw's FS pipeline. 
*/ + ifs->state = tgsi_fs.state; + } else if (try_tgsi) { + FREE((void *)tgsi_fs.state.tokens); + } - /* The shader's compiled to i915 instructions here */ - i915_translate_fragment_program(i915, ifs); if (ifs->error && templ->report_compile_error) { ((struct pipe_shader_state *)templ)->error_message = strdup(ifs->error); i915_delete_fs_state(NULL, ifs); diff --git a/src/gallium/drivers/i915/meson.build b/src/gallium/drivers/i915/meson.build index 80dc825fbc5..ef1d5f7ad34 100644 --- a/src/gallium/drivers/i915/meson.build +++ b/src/gallium/drivers/i915/meson.build @@ -16,6 +16,7 @@ files_i915 = files( 'i915_flush.c', 'i915_fpc_emit.c', 'i915_fpc.h', + 'i915_fpc_nir.c', 'i915_fpc_optimize.c', 'i915_fpc_translate.c', 'i915_prim_emit.c',