diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources index 5325f974cda..6f50f714c3f 100644 --- a/src/gallium/auxiliary/Makefile.sources +++ b/src/gallium/auxiliary/Makefile.sources @@ -271,6 +271,7 @@ C_SOURCES := \ util/u_prim_restart.h \ util/u_pstipple.c \ util/u_pstipple.h \ + util/u_pwr8.h \ util/u_range.h \ util/u_rect.h \ util/u_resource.c \ diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index 142d78ae49d..b48bdcc779e 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -1618,6 +1618,12 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, context_ptr = LLVMGetParam(variant_func, 0); io_ptr = LLVMGetParam(variant_func, 1); vbuffers_ptr = LLVMGetParam(variant_func, 2); + /* + * XXX: stride is actually unused. The stride we use is strictly calculated + * from the number of outputs (including the draw_extra outputs). + * Should probably fix some day (we need a new vs just because of extra + * outputs which the generated vs won't touch). + */ stride = LLVMGetParam(variant_func, 5 + (elts ? 1 : 0)); vb_ptr = LLVMGetParam(variant_func, 6 + (elts ? 1 : 0)); system_values.instance_id = LLVMGetParam(variant_func, 7 + (elts ? 1 : 0)); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c index cdf6d80c261..0b0f7f0147c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c @@ -461,50 +461,49 @@ lp_build_pack2(struct gallivm_state *gallivm, assert(src_type.length * 2 == dst_type.length); /* Check for special cases first */ - if((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) && - src_type.width * src_type.length >= 128) { + if ((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) && + src_type.width * src_type.length >= 128) { const char *intrinsic = NULL; boolean swap_intrinsic_operands = FALSE; switch(src_type.width) { case 32: if (util_cpu_caps.has_sse2) { - if(dst_type.sign) { + if (dst_type.sign) { intrinsic = "llvm.x86.sse2.packssdw.128"; - } - else { + } else { if (util_cpu_caps.has_sse4_1) { intrinsic = "llvm.x86.sse41.packusdw"; } } } else if (util_cpu_caps.has_altivec) { if (dst_type.sign) { - intrinsic = "llvm.ppc.altivec.vpkswus"; - } else { - intrinsic = "llvm.ppc.altivec.vpkuwus"; - } + intrinsic = "llvm.ppc.altivec.vpkswss"; + } else { + intrinsic = "llvm.ppc.altivec.vpkuwus"; + } #ifdef PIPE_ARCH_LITTLE_ENDIAN - swap_intrinsic_operands = TRUE; + swap_intrinsic_operands = TRUE; #endif } break; case 16: if (dst_type.sign) { if (util_cpu_caps.has_sse2) { - intrinsic = "llvm.x86.sse2.packsswb.128"; + intrinsic = "llvm.x86.sse2.packsswb.128"; } else if (util_cpu_caps.has_altivec) { - intrinsic = "llvm.ppc.altivec.vpkshss"; + intrinsic = "llvm.ppc.altivec.vpkshss"; #ifdef PIPE_ARCH_LITTLE_ENDIAN - swap_intrinsic_operands = TRUE; + swap_intrinsic_operands = TRUE; #endif } } else { if (util_cpu_caps.has_sse2) { - intrinsic = "llvm.x86.sse2.packuswb.128"; + intrinsic = "llvm.x86.sse2.packuswb.128"; } else if (util_cpu_caps.has_altivec) { - intrinsic = "llvm.ppc.altivec.vpkshus"; + intrinsic = "llvm.ppc.altivec.vpkshus"; #ifdef PIPE_ARCH_LITTLE_ENDIAN - swap_intrinsic_operands = TRUE; + swap_intrinsic_operands = TRUE; #endif } } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c index 3d5e2cb316b..6f75bec5005 100644 --- 
a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c @@ -1536,8 +1536,22 @@ mod_emit_cpu( struct lp_build_tgsi_context * bld_base, struct lp_build_emit_data * emit_data) { - emit_data->output[emit_data->chan] = lp_build_mod(&bld_base->int_bld, - emit_data->args[0], emit_data->args[1]); + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMValueRef div_mask = lp_build_cmp(&bld_base->uint_bld, + PIPE_FUNC_EQUAL, emit_data->args[1], + bld_base->uint_bld.zero); + /* We want to make sure that we never divide/mod by zero so we don't + * generate SIGFPE. We don't want to crash just because the + * shader is doing something weird. */ + LLVMValueRef divisor = LLVMBuildOr(builder, + div_mask, + emit_data->args[1], ""); + LLVMValueRef result = lp_build_mod(&bld_base->int_bld, + emit_data->args[0], divisor); + /* umod by zero doesn't have a guaranteed return value; chose -1 for now. */ + emit_data->output[emit_data->chan] = LLVMBuildOr(builder, + div_mask, + result, ""); } /* TGSI_OPCODE_NOT */ diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c index 7c577592f70..dfda80f228f 100644 --- a/src/gallium/auxiliary/nir/tgsi_to_nir.c +++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c @@ -673,10 +673,6 @@ ttn_get_dest(struct ttn_compile *c, struct tgsi_full_dst_register *tgsi_fdst) if (tgsi_dst->File == TGSI_FILE_TEMPORARY) { if (c->temp_regs[index].var) { - nir_builder *b = &c->build; - nir_intrinsic_instr *load; - struct tgsi_ind_register *indirect = - tgsi_dst->Indirect ? &tgsi_fdst->Indirect : NULL; nir_register *reg; /* this works, because TGSI will give us a base offset @@ -690,26 +686,6 @@ ttn_get_dest(struct ttn_compile *c, struct tgsi_full_dst_register *tgsi_fdst) reg->num_components = 4; dest.dest.reg.reg = reg; dest.dest.reg.base_offset = 0; - - /* since the alu op might not write to all components - * of the temporary, we must first do a load_var to - * get the previous array elements into the register.
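The mod_emit_cpu() change above avoids a divide-by-zero trap by masking the divisor before the mod and then forcing the affected lanes of the result. A minimal scalar sketch of the same trick (assuming lp_build_cmp()'s all-ones/all-zeros mask semantics; safe_umod() is a hypothetical name, not part of the patch):

#include <stdint.h>

/* Scalar sketch of the guard used in mod_emit_cpu() above: force the
 * divisor to all-ones where it is zero so the mod cannot trap, then
 * force the result to all-ones (-1) in those lanes. */
static uint32_t safe_umod(uint32_t a, uint32_t b)
{
	uint32_t div_mask = (b == 0) ? ~0u : 0u; /* lp_build_cmp(EQUAL, b, 0) */
	uint32_t divisor  = div_mask | b;        /* never zero */
	uint32_t result   = a % divisor;         /* cannot raise SIGFPE */
	return div_mask | result;                /* 0xffffffff when b == 0 */
}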
- * This is one area that NIR could use a bit of - * improvement (or opt pass to clean up the mess - * once things are scalarized) - */ - - load = nir_intrinsic_instr_create(c->build.shader, - nir_intrinsic_load_var); - load->num_components = 4; - load->variables[0] = - ttn_array_deref(c, load, c->temp_regs[index].var, - c->temp_regs[index].offset, - indirect); - - load->dest = nir_dest_for_reg(reg); - - nir_builder_instr_insert(b, &load->instr); } else { assert(!tgsi_dst->Indirect); dest.dest.reg.reg = c->temp_regs[index].reg; @@ -1886,7 +1862,7 @@ ttn_emit_instruction(struct ttn_compile *c) ttn_move_dest(b, dest, nir_fsat(b, ttn_src_for_dest(b, &dest))); } - /* if the dst has a matching var, append store_global to move + /* if the dst has a matching var, append store_var to move * output from reg to var */ nir_variable *var = ttn_get_var(c, tgsi_dst); @@ -1899,7 +1875,7 @@ ttn_emit_instruction(struct ttn_compile *c) &tgsi_dst->Indirect : NULL; store->num_components = 4; - store->const_index[0] = 0xf; + store->const_index[0] = dest.write_mask; store->variables[0] = ttn_array_deref(c, store, var, offset, indirect); store->src[0] = nir_src_for_reg(dest.dest.reg.reg); @@ -1932,6 +1908,7 @@ ttn_add_output_stores(struct ttn_compile *c) store->src[0].reg.reg = c->output_regs[loc].reg; store->src[0].reg.base_offset = c->output_regs[loc].offset; store->const_index[0] = loc; + store->const_index[1] = 0xf; /* writemask */ store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0)); nir_builder_instr_insert(b, &store->instr); } diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c index ea207461d27..83f50628b40 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_build.c +++ b/src/gallium/auxiliary/tgsi/tgsi_build.c @@ -110,6 +110,7 @@ tgsi_default_declaration( void ) declaration.Invariant = 0; declaration.Local = 0; declaration.Array = 0; + declaration.Atomic = 0; declaration.Padding = 0; return declaration; diff --git a/src/gallium/auxiliary/util/u_pstipple.c b/src/gallium/auxiliary/util/u_pstipple.c index 08dec13846d..3428172203b 100644 --- a/src/gallium/auxiliary/util/u_pstipple.c +++ b/src/gallium/auxiliary/util/u_pstipple.c @@ -230,6 +230,7 @@ pstip_transform_immed(struct tgsi_transform_context *ctx, struct pstip_transform_context *pctx = (struct pstip_transform_context *) ctx; pctx->numImmed++; + ctx->emit_immediate(ctx, immed); } diff --git a/src/gallium/auxiliary/util/u_pwr8.h b/src/gallium/auxiliary/util/u_pwr8.h index 1eca6d6df2c..ffd9f923142 100644 --- a/src/gallium/auxiliary/util/u_pwr8.h +++ b/src/gallium/auxiliary/util/u_pwr8.h @@ -153,6 +153,12 @@ vec_mullo_epi32 (__m128i a, __m128i b) return v; } +static inline __m128i +vec_andnot_si128 (__m128i a, __m128i b) +{ + return vec_andc (b, a); +} + static inline void transpose4_epi32(const __m128i * restrict a, const __m128i * restrict b, diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst index d7ea123b0e9..b461810644a 100644 --- a/src/gallium/docs/source/screen.rst +++ b/src/gallium/docs/source/screen.rst @@ -305,6 +305,7 @@ The integer capabilities: for buffers is supported. * ``PIPE_CAP_GENERATE_MIPMAP``: Indicates whether pipe_context::generate_mipmap is supported. +* ``PIPE_CAP_STRING_MARKER``: Whether pipe->emit_string_marker() is supported. .. 
_pipe_capf: diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_context.c b/src/gallium/drivers/freedreno/a2xx/fd2_context.c index 3bed73573a6..058f8219ed5 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_context.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_context.c @@ -109,6 +109,7 @@ fd2_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) fd2_gmem_init(pctx); fd2_texture_init(pctx); fd2_prog_init(pctx); + fd2_emit_init(pctx); pctx = fd_context_init(&fd2_ctx->base, pscreen, (screen->gpu_id >= 220) ? a22x_primtypes : a20x_primtypes, diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c index cc0ed59f300..4f667ab7d57 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c @@ -446,3 +446,17 @@ fd2_emit_setup(struct fd_context *ctx) fd_ringbuffer_flush(ring); fd_ringmarker_mark(ctx->draw_start); } + +static void +fd2_emit_ib(struct fd_ringbuffer *ring, struct fd_ringmarker *start, + struct fd_ringmarker *end) +{ + __OUT_IB(ring, false, start, end); +} + +void +fd2_emit_init(struct pipe_context *pctx) +{ + struct fd_context *ctx = fd_context(pctx); + ctx->emit_ib = fd2_emit_ib; +} diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h index 8ee04632091..3c146c17151 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h @@ -45,4 +45,6 @@ void fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t val, void fd2_emit_state(struct fd_context *ctx, uint32_t dirty); void fd2_emit_setup(struct fd_context *ctx); +void fd2_emit_init(struct pipe_context *pctx); + #endif /* FD2_EMIT_H */ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index e65a352e7f6..811f58bbba2 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -891,10 +891,18 @@ fd3_emit_restore(struct fd_context *ctx) ctx->needs_rb_fbd = true; } +static void +fd3_emit_ib(struct fd_ringbuffer *ring, struct fd_ringmarker *start, + struct fd_ringmarker *end) +{ + __OUT_IB(ring, true, start, end); +} + void fd3_emit_init(struct pipe_context *pctx) { struct fd_context *ctx = fd_context(pctx); ctx->emit_const = fd3_emit_const; ctx->emit_const_bo = fd3_emit_const_bo; + ctx->emit_ib = fd3_emit_ib; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c index 21fb59e450d..2ce393a41ae 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c @@ -853,7 +853,7 @@ emit_binning_pass(struct fd_context *ctx) A3XX_PC_VSTREAM_CONTROL_N(0)); /* emit IB to binning drawcmds: */ - OUT_IB(ring, ctx->binning_start, ctx->binning_end); + ctx->emit_ib(ring, ctx->binning_start, ctx->binning_end); fd_reset_wfi(ctx); fd_wfi(ctx, ring); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c index bc62a5d9a4b..4a3f1da30ed 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c @@ -885,10 +885,18 @@ fd4_emit_restore(struct fd_context *ctx) ctx->needs_rb_fbd = true; } +static void +fd4_emit_ib(struct fd_ringbuffer *ring, struct fd_ringmarker *start, + struct fd_ringmarker *end) +{ + __OUT_IB(ring, true, start, end); +} + void fd4_emit_init(struct pipe_context *pctx) { struct fd_context *ctx = fd_context(pctx); 
ctx->emit_const = fd4_emit_const; ctx->emit_const_bo = fd4_emit_const_bo; + ctx->emit_ib = fd4_emit_ib; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c index 32b8fce1613..74716fb733f 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c @@ -217,6 +217,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, struct stage s[MAX_STAGES]; uint32_t pos_regid, posz_regid, psize_regid, color_regid[8]; uint32_t face_regid, coord_regid, zwcoord_regid; + enum a3xx_threadsize fssz; int constmode; int i, j, k; @@ -224,6 +225,8 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, setup_stages(emit, s); + fssz = (s[FS].i->max_reg >= 24) ? TWO_QUADS : FOUR_QUADS; + /* blob seems to always use constmode currently: */ constmode = 1; @@ -258,7 +261,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, OUT_RING(ring, 0x00000003); OUT_PKT0(ring, REG_A4XX_HLSQ_CONTROL_0_REG, 5); - OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) | + OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(fssz) | A4XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) | A4XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE | /* NOTE: I guess SHADERRESTART and CONSTFULLUPDATE maybe @@ -385,7 +388,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) | A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) | A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | - A4XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | + A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | COND(s[FS].v->has_samp, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE)); OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c index 0b6b9fbbe7a..c5ea86f9368 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.c +++ b/src/gallium/drivers/freedreno/freedreno_context.c @@ -141,6 +141,32 @@ fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, } } +/** + * emit marker string as payload of a no-op packet, which can be + * decoded by cffdump. 
+ */ +static void +fd_emit_string_marker(struct pipe_context *pctx, const char *string, int len) +{ + struct fd_context *ctx = fd_context(pctx); + struct fd_ringbuffer *ring = ctx->ring; + const uint32_t *buf = (const void *)string; + + OUT_PKT3(ring, CP_NOP, align(len, 4) / 4); + while (len >= 4) { + OUT_RING(ring, *buf); + buf++; + len -= 4; + } + + /* copy remainder bytes without reading past end of input string: */ + if (len > 0) { + uint32_t w = 0; + memcpy(&w, buf, len); + OUT_RING(ring, w); + } +} + void fd_context_destroy(struct pipe_context *pctx) { @@ -207,6 +233,7 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen, pctx->screen = pscreen; pctx->priv = priv; pctx->flush = fd_context_flush; + pctx->emit_string_marker = fd_emit_string_marker; for (i = 0; i < ARRAY_SIZE(ctx->rings); i++) { ctx->rings[i] = fd_ringbuffer_new(screen->pipe, 0x100000); diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h index 418b71b95de..9e7130ab915 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.h +++ b/src/gallium/drivers/freedreno/freedreno_context.h @@ -386,6 +386,10 @@ struct fd_context { const uint32_t *dwords, struct pipe_resource *prsc); void (*emit_const_bo)(struct fd_ringbuffer *ring, enum shader_t type, boolean write, uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets); + + /* indirect-branch emit: */ + void (*emit_ib)(struct fd_ringbuffer *ring, struct fd_ringmarker *start, + struct fd_ringmarker *end); }; static inline struct fd_context * diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c index 648db9baee5..0d73349057c 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.c +++ b/src/gallium/drivers/freedreno/freedreno_gmem.c @@ -331,7 +331,7 @@ render_tiles(struct fd_context *ctx) fd_hw_query_prepare_tile(ctx, i, ctx->ring); /* emit IB to drawcmds: */ - OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end); + ctx->emit_ib(ctx->ring, ctx->draw_start, ctx->draw_end); fd_reset_wfi(ctx); /* emit gmem2mem to transfer tile back to system memory: */ @@ -349,7 +349,7 @@ render_sysmem(struct fd_context *ctx) fd_hw_query_prepare_tile(ctx, 0, ctx->ring); /* emit IB to drawcmds: */ - OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end); + ctx->emit_ib(ctx->ring, ctx->draw_start, ctx->draw_end); fd_reset_wfi(ctx); } diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index a75b04b327a..640f50f5dcb 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -155,6 +155,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_USER_CONSTANT_BUFFERS: case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: case PIPE_CAP_VERTEXID_NOBASE: + case PIPE_CAP_STRING_MARKER: return 1; case PIPE_CAP_SHADER_STENCIL_EXPORT: @@ -400,9 +401,16 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return 1; case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: + /* Technically this should be the same as for TEMP/CONST, since + * everything is just normal registers. This is just temporary + * hack until load_input/store_output handle arrays in a similar + * way as load_var/store_var.. 
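The fd_emit_string_marker() hunk above packs the marker string into the payload of a CP_NOP packet, one dword per four bytes, and copies the tail with memcpy so it never reads past the end of the string. A standalone sketch of just the packing step, under the assumption that only the byte layout matters (pack_marker() is a hypothetical helper, no ring or driver types involved):

#include <stdint.h>
#include <string.h>

/* Pack 'len' bytes of 's' into 32-bit words, zero-padding the tail;
 * returns the dword count, which matches align(len, 4) / 4 above. */
static unsigned pack_marker(const char *s, int len, uint32_t *out)
{
	unsigned n = 0;
	while (len >= 4) {
		memcpy(&out[n++], s, 4);
		s += 4;
		len -= 4;
	}
	if (len > 0) {
		uint32_t w = 0;
		memcpy(&w, s, len); /* remaining 1-3 bytes */
		out[n++] = w;
	}
	return n;
}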
+ */ + return 0; case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: - return 1; + /* a2xx compiler doesn't handle indirect: */ + return is_ir3(screen) ? 1 : 0; case PIPE_SHADER_CAP_SUBROUTINES: case PIPE_SHADER_CAP_DOUBLES: case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: @@ -566,6 +574,7 @@ fd_screen_create(struct fd_device *dev) fd3_screen_init(pscreen); break; case 420: + case 430: fd4_screen_init(pscreen); break; default: diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h index 0d2418e1e00..47dd467f498 100644 --- a/src/gallium/drivers/freedreno/freedreno_util.h +++ b/src/gallium/drivers/freedreno/freedreno_util.h @@ -265,8 +265,8 @@ OUT_WFI(struct fd_ringbuffer *ring) } static inline void -OUT_IB(struct fd_ringbuffer *ring, struct fd_ringmarker *start, - struct fd_ringmarker *end) +__OUT_IB(struct fd_ringbuffer *ring, bool prefetch, + struct fd_ringmarker *start, struct fd_ringmarker *end) { uint32_t dwords = fd_ringmarker_dwords(start, end); @@ -280,7 +280,7 @@ OUT_IB(struct fd_ringbuffer *ring, struct fd_ringmarker *start, */ emit_marker(ring, 6); - OUT_PKT3(ring, CP_INDIRECT_BUFFER_PFD, 2); + OUT_PKT3(ring, prefetch ? CP_INDIRECT_BUFFER_PFE : CP_INDIRECT_BUFFER_PFD, 2); fd_ringbuffer_emit_reloc_ring(ring, start, end); OUT_RING(ring, dwords); diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c index 83ed5ffdca0..599872470fc 100644 --- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c +++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c @@ -220,7 +220,7 @@ static void print_instr_cat1(instr_t *instr) else if (cat1->off > 0) printf("%c<a0.x + %d>", type, cat1->off); else - printf("c<a0.x>"); + printf("%c<a0.x>", type); } else { print_reg_src((reg_t)(cat1->src), type_size(cat1->src_type) == 32, cat1->src_r, cat1->src_c, cat1->src_im, false, false, false); @@ -650,7 +650,7 @@ static void print_instr_cat6(instr_t *instr) /* size of largest OPC field of all the instruction categories: */ #define NOPC_BITS 6 -struct opc_info { +static const struct opc_info { uint16_t cat; uint16_t opc; const char *name; diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h index c3fb68d511c..1b1f1f0a797 100644 --- a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h +++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h @@ -261,6 +261,7 @@ typedef union PACKED { /* to make compiler happy: */ uint32_t dummy32; uint32_t dummy10 : 10; + int32_t idummy10 : 10; uint32_t dummy11 : 11; uint32_t dummy12 : 12; uint32_t dummy13 : 13; diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c index b24825cff85..7d89142d7a1 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.c +++ b/src/gallium/drivers/freedreno/ir3/ir3.c @@ -81,6 +81,7 @@ struct ir3 * ir3_create(struct ir3_compiler *compiler, shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout); list_inithead(&shader->block_list); + list_inithead(&shader->array_list); return shader; } @@ -121,18 +122,19 @@ static uint32_t reg(struct ir3_register *reg, struct ir3_info *info, val.iim_val = reg->iim_val; } else { unsigned components; + int16_t max; if (reg->flags & IR3_REG_RELATIV) { components = reg->size; - val.dummy10 = reg->offset; + val.idummy10 = reg->array.offset; + max = (reg->array.offset + repeat + components - 1) >> 2; } else { components = util_last_bit(reg->wrmask); val.comp = reg->num & 0x3; val.num = reg->num >> 2; + max = (reg->num +
repeat + components - 1) >> 2; } - int16_t max = (reg->num + repeat + components - 1) >> 2; - if (reg->flags & IR3_REG_CONST) { info->max_const = MAX2(info->max_const, max); } else if (val.num == 63) { @@ -233,7 +235,7 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr, iassert((instr->regs_count == 2) || (instr->regs_count == 3)); if (src1->flags & IR3_REG_RELATIV) { - iassert(src1->num < (1 << 10)); + iassert(src1->array.offset < (1 << 10)); cat2->rel1.src1 = reg(src1, info, instr->repeat, IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg); @@ -260,7 +262,7 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr, !((src1->flags ^ src2->flags) & IR3_REG_HALF)); if (src2->flags & IR3_REG_RELATIV) { - iassert(src2->num < (1 << 10)); + iassert(src2->array.offset < (1 << 10)); cat2->rel2.src2 = reg(src2, info, instr->repeat, IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg); @@ -333,7 +335,7 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr, iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF)); if (src1->flags & IR3_REG_RELATIV) { - iassert(src1->num < (1 << 10)); + iassert(src1->array.offset < (1 << 10)); cat3->rel1.src1 = reg(src1, info, instr->repeat, IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg); @@ -361,7 +363,7 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr, if (src3->flags & IR3_REG_RELATIV) { - iassert(src3->num < (1 << 10)); + iassert(src3->array.offset < (1 << 10)); cat3->rel2.src3 = reg(src3, info, instr->repeat, IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg); @@ -404,7 +406,7 @@ static int emit_cat4(struct ir3_instruction *instr, void *ptr, iassert(instr->regs_count == 2); if (src->flags & IR3_REG_RELATIV) { - iassert(src->num < (1 << 10)); + iassert(src->array.offset < (1 << 10)); cat4->rel.src = reg(src, info, instr->repeat, IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG | IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF); @@ -737,6 +739,14 @@ struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, return reg; } +struct ir3_register * ir3_reg_clone(struct ir3 *shader, + struct ir3_register *reg) +{ + struct ir3_register *new_reg = reg_create(shader, 0, 0); + *new_reg = *reg; + return new_reg; +} + void ir3_instr_set_address(struct ir3_instruction *instr, struct ir3_instruction *addr) @@ -777,3 +787,12 @@ ir3_count_instructions(struct ir3 *ir) } return cnt; } + +struct ir3_array * +ir3_lookup_array(struct ir3 *ir, unsigned id) +{ + list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) + if (arr->id == id) + return arr; + return NULL; +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h index 62d14a0ae37..1a109d880e6 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.h +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -83,7 +83,8 @@ struct ir3_register { * before register assignment is done: */ IR3_REG_SSA = 0x2000, /* 'instr' is ptr to assigning instr */ - IR3_REG_PHI_SRC= 0x4000, /* phi src, regs[0]->instr points to phi */ + IR3_REG_ARRAY = 0x4000, + IR3_REG_PHI_SRC= 0x8000, /* phi src, regs[0]->instr points to phi */ } flags; union { @@ -97,11 +98,18 @@ struct ir3_register { uint32_t uim_val; float fim_val; /* relative: */ - int offset; + struct { + uint16_t id; + int16_t offset; + } array; }; - /* for IR3_REG_SSA, src registers contain ptr back to - * assigning instruction. + /* For IR3_REG_SSA, src registers contain ptr back to assigning + * instruction. 
+ * + * For IR3_REG_ARRAY, the pointer is back to the last dependent + * array access (although the net effect is the same, it points + * back to a previous instruction that we depend on). */ struct ir3_instruction *instr; @@ -221,9 +229,6 @@ struct ir3_instruction { struct { int off; /* component/offset */ } fo; - struct { - int aid; - } fi; struct { /* used to temporarily hold reference to nir_phi_instr * until we resolve the phi srcs @@ -293,19 +298,6 @@ struct ir3_instruction { */ struct ir3_instruction *address; - /* in case of a instruction with relative dst instruction, we need to - * capture the dependency on the fanin for the previous values of - * the array elements. Since we don't know at compile time actually - * which array elements are written, this serves to preserve the - * unconditional write to array elements prior to the conditional - * write. - * - * TODO only cat1 can do indirect write.. we could maybe move this - * into instr->cat1.fanin (but would require the frontend to insert - * the extra mov) - */ - struct ir3_instruction *fanin; - /* Entry in ir3_block's instruction list: */ struct list_head node; @@ -379,10 +371,41 @@ struct ir3 { /* List of blocks: */ struct list_head block_list; + /* List of ir3_array's: */ + struct list_head array_list; + unsigned heap_idx; struct ir3_heap_chunk *chunk; }; +typedef struct nir_variable nir_variable; + +struct ir3_array { + struct list_head node; + unsigned length; + unsigned id; + + nir_variable *var; + + /* We track the last write and last access (read or write) to + * setup dependencies on instructions that read or write the + * array. Reads can be re-ordered wrt. other reads, but should + * not be re-ordered wrt. to writes. Writes cannot be reordered + * wrt. any other access to the array. + * + * So array reads depend on last write, and array writes depend + * on the last access. 
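The last_write/last_access fields declared just below encode exactly the ordering rules this comment describes, and they are what create_var_load()/create_var_store() update later in this series. A minimal, self-contained sketch of that bookkeeping (note_array_read()/note_array_write() and struct instr are hypothetical stand-ins, not names from the patch):

struct instr; /* stand-in for the IR instruction type */

struct array_deps {
	struct instr *last_write;  /* most recent store to the array */
	struct instr *last_access; /* most recent load or store */
};

/* reads depend only on the last write, so loads may still be
 * reordered with respect to other loads: */
static void note_array_read(struct array_deps *a, struct instr *load,
		struct instr **dep_out)
{
	*dep_out = a->last_write;
	a->last_access = load;
}

/* writes depend on the last access of either kind, so a store can
 * never move above an earlier read or write: */
static void note_array_write(struct array_deps *a, struct instr *store,
		struct instr **dep_out)
{
	*dep_out = a->last_access;
	a->last_write = a->last_access = store;
}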
+ */ + struct ir3_instruction *last_write, *last_access; + + /* extra stuff used in RA pass: */ + unsigned base; /* base vreg name */ + unsigned reg; /* base physical reg */ + uint16_t start_ip, end_ip; +}; + +struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id); + typedef struct nir_block nir_block; struct ir3_block { @@ -430,6 +453,8 @@ const char *ir3_instr_name(struct ir3_instruction *instr); struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, int num, int flags); +struct ir3_register * ir3_reg_clone(struct ir3 *shader, + struct ir3_register *reg); void ir3_instr_set_address(struct ir3_instruction *instr, struct ir3_instruction *addr); @@ -510,6 +535,9 @@ static inline bool is_same_type_mov(struct ir3_instruction *instr) if (dst->num == regid(REG_A0, 0)) return false; + if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY)) + return false; + if ((instr->category == 1) && (instr->cat1.src_type == instr->cat1.dst_type)) return true; @@ -623,8 +651,10 @@ static inline bool writes_pred(struct ir3_instruction *instr) /* TODO better name */ static inline struct ir3_instruction *ssa(struct ir3_register *reg) { - if (reg->flags & IR3_REG_SSA) + if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) { + debug_assert(!(reg->instr && (reg->instr->flags & IR3_INSTR_UNUSED))); return reg->instr; + } return NULL; } @@ -813,8 +843,6 @@ static inline unsigned ir3_cat3_absneg(opc_t opc) static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr) { - if (instr->fanin) - return instr->regs_count + 2; if (instr->address) return instr->regs_count + 1; return instr->regs_count; @@ -822,8 +850,6 @@ static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr) static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n) { - if (n == (instr->regs_count + 1)) - return instr->fanin; if (n == (instr->regs_count + 0)) return instr->address; return ssa(instr->regs[n]); @@ -834,8 +860,8 @@ static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr /* iterator for an instruction's SSA sources (instr), also returns src #: */ #define foreach_ssa_src_n(__srcinst, __n, __instr) \ if ((__instr)->regs_count) \ - for (unsigned __cnt = __ssa_src_cnt(__instr) - 1, __n = 0; __n < __cnt; __n++) \ - if ((__srcinst = __ssa_src_n(__instr, __n + 1))) + for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \ + if ((__srcinst = __ssa_src_n(__instr, __n))) /* iterator for an instruction's SSA sources (instr): */ #define foreach_ssa_src(__srcinst, __instr) \ @@ -878,7 +904,15 @@ ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type) struct ir3_instruction *instr = ir3_instr_create(block, 1, 0); ir3_reg_create(instr, 0, 0); /* dst */ - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + if (src->regs[0]->flags & IR3_REG_ARRAY) { + struct ir3_register *src_reg = + ir3_reg_create(instr, 0, IR3_REG_ARRAY); + src_reg->array = src->regs[0]->array; + src_reg->instr = src; + } else { + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + } + debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV)); instr->cat1.src_type = type; instr->cat1.dst_type = type; return instr; @@ -894,6 +928,7 @@ ir3_COV(struct ir3_block *block, struct ir3_instruction *src, ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; instr->cat1.src_type = src_type; instr->cat1.dst_type = dst_type; + debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY)); return instr; } @@ -1083,7 +1118,7 @@ typedef uint8_t regmask_t[2 * MAX_REG / 8]; static inline 
unsigned regmask_idx(struct ir3_register *reg) { - unsigned num = reg->num; + unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num; debug_assert(num < MAX_REG); if (reg->flags & IR3_REG_HALF) num += MAX_REG; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 86afda4ba08..1ea2dd9cbf7 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -46,7 +46,6 @@ struct ir3_compile { struct ir3_compiler *compiler; - const struct tgsi_token *tokens; struct nir_shader *s; struct ir3 *ir; @@ -75,8 +74,6 @@ struct ir3_compile { /* mapping from nir_register to defining instruction: */ struct hash_table *def_ht; - /* mapping from nir_variable to ir3_array: */ - struct hash_table *var_ht; unsigned num_arrays; /* a common pattern for indirect addressing is to request the @@ -143,8 +140,6 @@ compile_init(struct ir3_compiler *compiler, ctx->so = so; ctx->def_ht = _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); - ctx->var_ht = _mesa_hash_table_create(ctx, - _mesa_hash_pointer, _mesa_key_pointer_equal); ctx->block_ht = _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); @@ -221,206 +216,26 @@ compile_free(struct ir3_compile *ctx) ralloc_free(ctx); } -/* global per-array information: */ -struct ir3_array { - unsigned length, aid; -}; - -/* per-block array state: */ -struct ir3_array_value { - /* TODO drop length/aid, and just have ptr back to ir3_array */ - unsigned length, aid; - /* initial array element values are phi's, other than for the - * entry block. The phi src's get added later in a resolve step - * after we have visited all the blocks, to account for back - * edges in the cfg. - */ - struct ir3_instruction **phis; - /* current array element values (as block is processed). When - * the array phi's are resolved, it will contain the array state - * at exit of block, so successor blocks can use it to add their - * phi srcs. - */ - struct ir3_instruction *arr[]; -}; - -/* track array assignments per basic block. When an array is read - * outside of the same basic block, we can use NIR's dominance-frontier - * information to figure out where phi nodes are needed. 
- */ -struct ir3_nir_block_data { - unsigned foo; - /* indexed by array-id (aid): */ - struct ir3_array_value *arrs[]; -}; - -static struct ir3_nir_block_data * -get_block_data(struct ir3_compile *ctx, struct ir3_block *block) -{ - if (!block->data) { - struct ir3_nir_block_data *bd = ralloc_size(ctx, sizeof(*bd) + - ((ctx->num_arrays + 1) * sizeof(bd->arrs[0]))); - block->data = bd; - } - return block->data; -} - static void declare_var(struct ir3_compile *ctx, nir_variable *var) { unsigned length = glsl_get_length(var->type) * 4; /* always vec4, at least with ttn */ struct ir3_array *arr = ralloc(ctx, struct ir3_array); + arr->id = ++ctx->num_arrays; arr->length = length; - arr->aid = ++ctx->num_arrays; - _mesa_hash_table_insert(ctx->var_ht, var, arr); + arr->var = var; + list_addtail(&arr->node, &ctx->ir->array_list); } -static nir_block * -nir_block_pred(nir_block *block) -{ - assert(block->predecessors->entries < 2); - if (block->predecessors->entries == 0) - return NULL; - return (nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key; -} - -static struct ir3_array_value * +static struct ir3_array * get_var(struct ir3_compile *ctx, nir_variable *var) { - struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var); - struct ir3_block *block = ctx->block; - struct ir3_nir_block_data *bd = get_block_data(ctx, block); - struct ir3_array *arr = entry->data; - - if (!bd->arrs[arr->aid]) { - struct ir3_array_value *av = ralloc_size(bd, sizeof(*av) + - (arr->length * sizeof(av->arr[0]))); - struct ir3_array_value *defn = NULL; - nir_block *pred_block; - - av->length = arr->length; - av->aid = arr->aid; - - /* For loops, we have to consider that we have not visited some - * of the blocks who should feed into the phi (ie. back-edges in - * the cfg).. for example: - * - * loop { - * block { load_var; ... } - * if then block {} else block {} - * block { store_var; ... } - * if then block {} else block {} - * block {...} - * } - * - * We can skip the phi if we can chase the block predecessors - * until finding the block previously defining the array without - * crossing a block that has more than one predecessor. - * - * Otherwise create phi's and resolve them as a post-pass after - * all the blocks have been visited (to handle back-edges). - */ - - for (pred_block = block->nblock; - pred_block && (pred_block->predecessors->entries < 2) && !defn; - pred_block = nir_block_pred(pred_block)) { - struct ir3_block *pblock = get_block(ctx, pred_block); - struct ir3_nir_block_data *pbd = pblock->data; - if (!pbd) - continue; - defn = pbd->arrs[arr->aid]; - } - - if (defn) { - /* only one possible definer: */ - for (unsigned i = 0; i < arr->length; i++) - av->arr[i] = defn->arr[i]; - } else if (pred_block) { - /* not the first block, and multiple potential definers: */ - av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0])); - - for (unsigned i = 0; i < arr->length; i++) { - struct ir3_instruction *phi; - - phi = ir3_instr_create2(block, -1, OPC_META_PHI, - 1 + ctx->impl->num_blocks); - ir3_reg_create(phi, 0, 0); /* dst */ - - /* phi's should go at head of block: */ - list_delinit(&phi->node); - list_add(&phi->node, &block->instr_list); - - av->phis[i] = av->arr[i] = phi; - } - } else { - /* Some shaders end up reading array elements without - * first writing.. 
so initialize things to prevent null - * instr ptrs later: - */ - for (unsigned i = 0; i < arr->length; i++) - av->arr[i] = create_immed(block, 0); - } - - bd->arrs[arr->aid] = av; - } - - return bd->arrs[arr->aid]; -} - -static void -add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock, - struct ir3_array_value *av, BITSET_WORD *visited) -{ - struct ir3_block *block; - struct ir3_nir_block_data *bd; - - if (BITSET_TEST(visited, nblock->index)) - return; - - BITSET_SET(visited, nblock->index); - - block = get_block(ctx, nblock); - bd = block->data; - - if (bd && bd->arrs[av->aid]) { - struct ir3_array_value *dav = bd->arrs[av->aid]; - for (unsigned i = 0; i < av->length; i++) { - ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr = - dav->arr[i]; - } - } else { - /* didn't find defn, recurse predecessors: */ - struct set_entry *entry; - set_foreach(nblock->predecessors, entry) { - add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited); - } - } -} - -static void -resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block) -{ - struct ir3_nir_block_data *bd = block->data; - unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks); - - if (!bd) - return; - - /* TODO use nir dom_frontier to help us with this? */ - - for (unsigned i = 1; i <= ctx->num_arrays; i++) { - struct ir3_array_value *av = bd->arrs[i]; - BITSET_WORD visited[bitset_words]; - struct set_entry *entry; - - if (!(av && av->phis)) - continue; - - memset(visited, 0, sizeof(visited)); - set_foreach(block->nblock->predecessors, entry) { - add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited); - } + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + if (arr->var == var) + return arr; } + compile_error(ctx, "bogus var: %s\n", var->name); + return NULL; } /* allocate a n element value array (to be populated by caller) and @@ -438,6 +253,7 @@ __get_dst(struct ir3_compile *ctx, void *key, unsigned n) static struct ir3_instruction ** get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n) { + compile_assert(ctx, dst->is_ssa); if (dst->is_ssa) { return __get_dst(ctx, &dst->ssa, n); } else { @@ -455,6 +271,7 @@ static struct ir3_instruction ** get_src(struct ir3_compile *ctx, nir_src *src) { struct hash_entry *entry; + compile_assert(ctx, src->is_ssa); if (src->is_ssa) { entry = _mesa_hash_table_search(ctx->def_ht, src->ssa); } else { @@ -560,7 +377,7 @@ create_uniform(struct ir3_compile *ctx, unsigned n) } static struct ir3_instruction * -create_uniform_indirect(struct ir3_compile *ctx, unsigned n, +create_uniform_indirect(struct ir3_compile *ctx, int n, struct ir3_instruction *address) { struct ir3_instruction *mov; @@ -569,7 +386,7 @@ create_uniform_indirect(struct ir3_compile *ctx, unsigned n, mov->cat1.src_type = TYPE_U32; mov->cat1.dst_type = TYPE_U32; ir3_reg_create(mov, 0, 0); - ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV); + ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n; ir3_instr_set_address(mov, address); @@ -594,7 +411,7 @@ create_collect(struct ir3_block *block, struct ir3_instruction **arr, } static struct ir3_instruction * -create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n, +create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, int n, struct ir3_instruction *address, struct ir3_instruction *collect) { struct ir3_block *block = ctx->block; @@ -608,17 +425,45 @@ create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n, src = ir3_reg_create(mov, 0, IR3_REG_SSA | 
IR3_REG_RELATIV); src->instr = collect; src->size = arrsz; - src->offset = n; + src->array.offset = n; ir3_instr_set_address(mov, address); return mov; } +/* relative (indirect) if address!=NULL */ static struct ir3_instruction * -create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n, - struct ir3_instruction *src, struct ir3_instruction *address, - struct ir3_instruction *collect) +create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, int n, + struct ir3_instruction *address) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *mov; + struct ir3_register *src; + + mov = ir3_instr_create(block, 1, 0); + mov->cat1.src_type = TYPE_U32; + mov->cat1.dst_type = TYPE_U32; + ir3_reg_create(mov, 0, 0); + src = ir3_reg_create(mov, 0, IR3_REG_ARRAY | + COND(address, IR3_REG_RELATIV)); + src->instr = arr->last_write; + src->size = arr->length; + src->array.id = arr->id; + src->array.offset = n; + + if (address) + ir3_instr_set_address(mov, address); + + arr->last_access = mov; + + return mov; +} + +/* relative (indirect) if address!=NULL */ +static struct ir3_instruction * +create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, int n, + struct ir3_instruction *src, struct ir3_instruction *address) { struct ir3_block *block = ctx->block; struct ir3_instruction *mov; @@ -627,14 +472,18 @@ create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n, mov = ir3_instr_create(block, 1, 0); mov->cat1.src_type = TYPE_U32; mov->cat1.dst_type = TYPE_U32; - dst = ir3_reg_create(mov, 0, IR3_REG_RELATIV); - dst->size = arrsz; - dst->offset = n; + dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY | + COND(address, IR3_REG_RELATIV)); + dst->instr = arr->last_access; + dst->size = arr->length; + dst->array.id = arr->id; + dst->array.offset = n; ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src; - mov->fanin = collect; ir3_instr_set_address(mov, address); + arr->last_write = arr->last_access = mov; + return mov; } @@ -1151,7 +1000,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr, nir_const_value *const_offset; /* UBO addresses are the first driver params: */ unsigned ubo = regid(ctx->so->first_driver_param + IR3_UBOS_OFF, 0); - unsigned off = intr->const_index[0]; + int off = intr->const_index[0]; /* First src is ubo index, which could either be an immed or not: */ src0 = get_src(ctx, &intr->src[0])[0]; @@ -1199,7 +1048,7 @@ emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, { nir_deref_var *dvar = intr->variables[0]; nir_deref_array *darr = nir_deref_as_array(dvar->deref.child); - struct ir3_array_value *arr = get_var(ctx, dvar->var); + struct ir3_array *arr = get_var(ctx, dvar->var); compile_assert(ctx, dvar->deref.child && (dvar->deref.child->deref_type == nir_deref_type_array)); @@ -1210,19 +1059,17 @@ emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, for (int i = 0; i < intr->num_components; i++) { unsigned n = darr->base_offset * 4 + i; compile_assert(ctx, n < arr->length); - dst[i] = arr->arr[n]; + dst[i] = create_var_load(ctx, arr, n, NULL); } break; case nir_deref_array_type_indirect: { /* for indirect, we need to collect all the array elements: */ - struct ir3_instruction *collect = - create_collect(ctx->block, arr->arr, arr->length); struct ir3_instruction *addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]); for (int i = 0; i < intr->num_components; i++) { unsigned n = darr->base_offset * 4 + i; compile_assert(ctx, n < arr->length); - dst[i] = 
create_indirect_load(ctx, arr->length, n, addr, collect); + dst[i] = create_var_load(ctx, arr, n, addr); } break; } @@ -1239,8 +1086,9 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) { nir_deref_var *dvar = intr->variables[0]; nir_deref_array *darr = nir_deref_as_array(dvar->deref.child); - struct ir3_array_value *arr = get_var(ctx, dvar->var); - struct ir3_instruction **src; + struct ir3_array *arr = get_var(ctx, dvar->var); + struct ir3_instruction *addr, **src; + unsigned wrmask = intr->const_index[0]; compile_assert(ctx, dvar->deref.child && (dvar->deref.child->deref_type == nir_deref_type_array)); @@ -1249,66 +1097,24 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) switch (darr->deref_array_type) { case nir_deref_array_type_direct: - /* direct access does not require anything special: */ - for (int i = 0; i < intr->num_components; i++) { - /* ttn doesn't generate partial writemasks */ - assert(intr->const_index[0] == - (1 << intr->num_components) - 1); - - unsigned n = darr->base_offset * 4 + i; - compile_assert(ctx, n < arr->length); - arr->arr[n] = src[i]; - } + addr = NULL; break; - case nir_deref_array_type_indirect: { - /* for indirect, create indirect-store and fan that out: */ - struct ir3_instruction *collect = - create_collect(ctx->block, arr->arr, arr->length); - struct ir3_instruction *addr = - get_addr(ctx, get_src(ctx, &darr->indirect)[0]); - for (int i = 0; i < intr->num_components; i++) { - /* ttn doesn't generate partial writemasks */ - assert(intr->const_index[0] == - (1 << intr->num_components) - 1); - - struct ir3_instruction *store; - unsigned n = darr->base_offset * 4 + i; - compile_assert(ctx, n < arr->length); - - store = create_indirect_store(ctx, arr->length, - n, src[i], addr, collect); - - store->fanin->fi.aid = arr->aid; - - /* TODO: probably split this out to be used for - * store_output_indirect? or move this into - * create_indirect_store()? - */ - for (int j = i; j < arr->length; j += intr->num_components) { - struct ir3_instruction *split; - - split = ir3_instr_create(ctx->block, -1, OPC_META_FO); - split->fo.off = j; - ir3_reg_create(split, 0, 0); - ir3_reg_create(split, 0, IR3_REG_SSA)->instr = store; - - arr->arr[j] = split; - } - } - /* fixup fanout/split neighbors: */ - for (int i = 0; i < arr->length; i++) { - arr->arr[i]->cp.right = (i < (arr->length - 1)) ? - arr->arr[i+1] : NULL; - arr->arr[i]->cp.left = (i > 0) ? 
- arr->arr[i-1] : NULL; - } + case nir_deref_array_type_indirect: + addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]); break; - } default: compile_error(ctx, "Unhandled store deref type: %u\n", darr->deref_array_type); break; } + + for (int i = 0; i < intr->num_components; i++) { + if (!(wrmask & (1 << i))) + continue; + unsigned n = darr->base_offset * 4 + i; + compile_assert(ctx, n < arr->length); + create_var_store(ctx, arr, n, src[i], addr); + } } static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot, @@ -1335,7 +1141,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic]; struct ir3_instruction **dst, **src; struct ir3_block *b = ctx->block; - unsigned idx = intr->const_index[0]; + int idx = intr->const_index[0]; nir_const_value *const_offset; if (info->has_dest) { @@ -1356,7 +1162,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) } else { src = get_src(ctx, &intr->src[0]); for (int i = 0; i < intr->num_components; i++) { - unsigned n = idx * 4 + i; + int n = idx * 4 + i; dst[i] = create_uniform_indirect(ctx, n, get_addr(ctx, src[0])); } @@ -1836,8 +1642,6 @@ resolve_phis(struct ir3_compile *ctx, struct ir3_block *block) ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; } } - - resolve_array_phis(ctx, block); } static void diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c index be4e4e81109..1cc211a7663 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -41,16 +41,22 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags) struct ir3_register *dst = instr->regs[0]; struct ir3_register *src = instr->regs[1]; struct ir3_instruction *src_instr = ssa(src); + + /* only if mov src is SSA (not const/immed): */ + if (!src_instr) + return false; + + /* no indirect: */ if (dst->flags & IR3_REG_RELATIV) return false; if (src->flags & IR3_REG_RELATIV) return false; + if (!allow_flags) if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG | IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT)) return false; - if (!src_instr) - return false; + /* TODO: remove this hack: */ if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO)) return false; @@ -82,10 +88,17 @@ static bool valid_flags(struct ir3_instruction *instr, unsigned n, unsigned valid_flags; flags = cp_flags(flags); + /* If destination is indirect, then source cannot be.. at least + * I don't think so.. + */ + if ((instr->regs[0]->flags & IR3_REG_RELATIV) && + (flags & IR3_REG_RELATIV)) + return false; + /* clear flags that are 'ok' */ switch (instr->category) { case 1: - valid_flags = IR3_REG_IMMED | IR3_REG_RELATIV; + valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV; if (flags & ~valid_flags) return false; break; @@ -183,9 +196,14 @@ static void combine_flags(unsigned *dstflags, unsigned srcflags) *dstflags ^= IR3_REG_SNEG; if (srcflags & IR3_REG_BNOT) *dstflags ^= IR3_REG_BNOT; -} -static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, unsigned *flags); + *dstflags &= ~IR3_REG_SSA; + *dstflags |= srcflags & IR3_REG_SSA; + *dstflags |= srcflags & IR3_REG_CONST; + *dstflags |= srcflags & IR3_REG_IMMED; + *dstflags |= srcflags & IR3_REG_RELATIV; + *dstflags |= srcflags & IR3_REG_ARRAY; +} /* the "plain" MAD's (ie. 
the ones that don't shift first src prior to * multiply) can swap their first two srcs if src[0] is !CONST and @@ -206,52 +224,35 @@ static bool is_valid_mad(struct ir3_instruction *instr) static void reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) { - unsigned src_flags = 0, new_flags; - struct ir3_instruction *src_instr; + struct ir3_instruction *src = ssa(reg); - if (is_meta(instr)) { - /* meta instructions cannot fold up register - * flags.. they are usually src for texture - * fetch, etc, where we cannot specify abs/neg - */ - reg->instr = instr_cp(reg->instr, NULL); - return; - } + if (is_eligible_mov(src, true)) { + /* simple case, no immed/const/relativ, only mov's w/ ssa src: */ + struct ir3_register *src_reg = src->regs[1]; + unsigned new_flags = reg->flags; - src_instr = instr_cp(reg->instr, &src_flags); + combine_flags(&new_flags, src_reg->flags); - new_flags = reg->flags; - combine_flags(&new_flags, src_flags); - - reg->flags = new_flags; - reg->instr = src_instr; - - if (!valid_flags(instr, n, reg->flags)) { - /* insert an absneg.f */ - if (reg->flags & (IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT)) { - debug_assert(!(reg->flags & (IR3_REG_FNEG | IR3_REG_FABS))); - reg->instr = ir3_ABSNEG_S(instr->block, - reg->instr, cp_flags(src_flags)); - } else { - debug_assert(!(reg->flags & (IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))); - reg->instr = ir3_ABSNEG_F(instr->block, - reg->instr, cp_flags(src_flags)); + if (valid_flags(instr, n, new_flags)) { + if (new_flags & IR3_REG_ARRAY) { + debug_assert(!(reg->flags & IR3_REG_ARRAY)); + reg->array = src_reg->array; + } + reg->flags = new_flags; + reg->instr = ssa(src_reg); } - reg->flags &= ~cp_flags(src_flags); - debug_assert(valid_flags(instr, n, reg->flags)); - /* send it through instr_cp() again since - * the absneg src might be a mov from const - * that could be cleaned up: - */ - reg->instr = instr_cp(reg->instr, NULL); - return; - } - if (is_same_type_mov(reg->instr)) { - struct ir3_register *src_reg = reg->instr->regs[1]; - unsigned new_flags = src_reg->flags; + src = ssa(reg); /* could be null for IR3_REG_ARRAY case */ + if (!src) + return; + } else if (is_same_type_mov(src) && + /* cannot collapse const/immed/etc into meta instrs: */ + !is_meta(instr)) { + /* immed/const/etc cases, which require some special handling: */ + struct ir3_register *src_reg = src->regs[1]; + unsigned new_flags = reg->flags; - combine_flags(&new_flags, reg->flags); + combine_flags(&new_flags, src_reg->flags); if (!valid_flags(instr, n, new_flags)) { /* special case for "normal" mad instructions, we can @@ -287,6 +288,16 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) conflicts(instr->address, reg->instr->address)) return; + /* This seems to be a hw bug, or something where the timings + * just somehow don't work out. This restriction may only + * apply if the first src is also CONST. 
+ */ + if ((instr->category == 3) && (n == 2) && + (src_reg->flags & IR3_REG_RELATIV) && + (src_reg->array.offset == 0)) + return; + + src_reg = ir3_reg_clone(instr->block->shader, src_reg); src_reg->flags = new_flags; instr->regs[n+1] = src_reg; @@ -298,6 +309,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) if ((src_reg->flags & IR3_REG_RELATIV) && !conflicts(instr->address, reg->instr->address)) { + src_reg = ir3_reg_clone(instr->block->shader, src_reg); src_reg->flags = new_flags; instr->regs[n+1] = src_reg; ir3_instr_set_address(instr, reg->instr->address); @@ -330,8 +342,10 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) if (new_flags & IR3_REG_BNOT) iim_val = ~iim_val; - if (!(iim_val & ~0x3ff)) { + /* other than category 1 (mov) we can only encode up to 10 bits: */ + if ((instr->category == 1) || !(iim_val & ~0x3ff)) { new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT); + src_reg = ir3_reg_clone(instr->block->shader, src_reg); src_reg->flags = new_flags; src_reg->iim_val = iim_val; instr->regs[n+1] = src_reg; @@ -342,56 +356,68 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) } } -/** - * Given an SSA src (instruction), return the one with extraneous - * mov's removed, ie, for (to copy NIR syntax): - * - * vec1 ssa1 = fadd , - * vec1 ssa2 = fabs ssa1 - * vec1 ssa3 = fneg ssa1 - * - * then calling instr_cp(ssa3, &flags) would return ssa1 with - * (IR3_REG_ABS | IR3_REG_NEGATE) in flags. If flags is NULL, - * then disallow eliminating copies which would require flag - * propagation (for example, we cannot propagate abs/neg into - * an output). +/* Handle special case of eliminating output mov, and similar cases where + * there isn't a normal "consuming" instruction. In this case we cannot + * collapse flags (ie. output mov from const, or w/ abs/neg flags, cannot + * be eliminated) */ static struct ir3_instruction * -instr_cp(struct ir3_instruction *instr, unsigned *flags) +eliminate_output_mov(struct ir3_instruction *instr) +{ + if (is_eligible_mov(instr, false)) { + struct ir3_register *reg = instr->regs[1]; + if (!(reg->flags & IR3_REG_ARRAY)) { + struct ir3_instruction *src_instr = ssa(reg); + debug_assert(src_instr); + return src_instr; + } + } + return instr; +} + +/** + * Find instruction src's which are mov's that can be collapsed, replacing + * the mov dst with the mov src + */ +static void +instr_cp(struct ir3_instruction *instr) { struct ir3_register *reg; - if (is_eligible_mov(instr, !!flags)) { - struct ir3_register *reg = instr->regs[1]; - struct ir3_instruction *src_instr = ssa(reg); - if (flags) - combine_flags(flags, reg->flags); - return instr_cp(src_instr, flags); - } + if (instr->regs_count == 0) + return; - /* Check termination condition before walking children (rather - * than before checking eligible-mov). A mov instruction may - * appear as ssa-src for multiple other instructions, and we - * want to consider it for removal for each, rather than just - * the first one. (But regardless of how many places it shows - * up as a src, we only need to recursively walk the children - * once.) - */ if (ir3_instr_check_mark(instr)) - return instr; + return; /* walk down the graph from each src: */ foreach_src_n(reg, n, instr) { - if (!(reg->flags & IR3_REG_SSA)) + struct ir3_instruction *src = ssa(reg); + + if (!src) + continue; + + instr_cp(src); + + /* TODO non-indirect access we could figure out which register + * we actually want and allow cp.. 
+ */ + if (reg->flags & IR3_REG_ARRAY) continue; reg_cp(instr, reg, n); } - if (instr->address) - ir3_instr_set_address(instr, instr_cp(instr->address, NULL)); + if (instr->regs[0]->flags & IR3_REG_ARRAY) { + struct ir3_instruction *src = ssa(instr->regs[0]); + if (src) + instr_cp(src); + } - return instr; + if (instr->address) { + instr_cp(instr->address); + ir3_instr_set_address(instr, eliminate_output_mov(instr->address)); + } } void @@ -401,19 +427,20 @@ ir3_cp(struct ir3 *ir) for (unsigned i = 0; i < ir->noutputs; i++) { if (ir->outputs[i]) { - struct ir3_instruction *out = - instr_cp(ir->outputs[i], NULL); - - ir->outputs[i] = out; + instr_cp(ir->outputs[i]); + ir->outputs[i] = eliminate_output_mov(ir->outputs[i]); } } for (unsigned i = 0; i < ir->keeps_count; i++) { - ir->keeps[i] = instr_cp(ir->keeps[i], NULL); + instr_cp(ir->keeps[i]); + ir->keeps[i] = eliminate_output_mov(ir->keeps[i]); } list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - if (block->condition) - block->condition = instr_cp(block->condition, NULL); + if (block->condition) { + instr_cp(block->condition); + block->condition = eliminate_output_mov(block->condition); + } } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c index 4bbc0458790..6d294f1a48c 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c @@ -76,7 +76,7 @@ int ir3_delayslots(struct ir3_instruction *assigner, return 6; } else if ((consumer->category == 3) && (is_mad(consumer->opc) || is_madsh(consumer->opc)) && - (n == 2)) { + (n == 3)) { /* special case, 3rd src to cat3 not required on first cycle */ return 1; } else { @@ -118,6 +118,10 @@ ir3_instr_depth(struct ir3_instruction *instr) /* visit child to compute it's depth: */ ir3_instr_depth(src); + /* for array writes, no need to delay on previous write: */ + if (i == 0) + continue; + sd = ir3_delayslots(src, instr, i) + src->depth; instr->depth = MAX2(instr->depth, sd); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c index a84e7989cf8..ba0c4a57aa3 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_print.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c @@ -94,7 +94,7 @@ static void print_instr_name(struct ir3_instruction *instr) } } -static void print_reg_name(struct ir3_register *reg, bool followssa) +static void print_reg_name(struct ir3_register *reg) { if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) && (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))) @@ -106,20 +106,29 @@ static void print_reg_name(struct ir3_register *reg, bool followssa) if (reg->flags & IR3_REG_IMMED) { printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); - } else if (reg->flags & IR3_REG_SSA) { - printf("_"); - if (followssa) { - printf("["); + } else if (reg->flags & IR3_REG_ARRAY) { + printf("arr[id=%u, offset=%d, size=%u", reg->array.id, + reg->array.offset, reg->size); + /* for ARRAY we could have null src, for example first write + * instruction.. 
+ */ + if (reg->instr) { + printf(", _["); print_instr_name(reg->instr); printf("]"); } + printf("]"); + } else if (reg->flags & IR3_REG_SSA) { + printf("_["); + print_instr_name(reg->instr); + printf("]"); } else if (reg->flags & IR3_REG_RELATIV) { if (reg->flags & IR3_REG_HALF) printf("h"); if (reg->flags & IR3_REG_CONST) - printf("c", reg->num); + printf("c", reg->array.offset); else - printf("\x1b[0;31mr\x1b[0m (%u)", reg->num, reg->size); + printf("\x1b[0;31mr\x1b[0m (%u)", reg->array.offset, reg->size); } else { if (reg->flags & IR3_REG_HALF) printf("h"); @@ -158,7 +167,7 @@ print_instr(struct ir3_instruction *instr, int lvl) for (i = 0; i < instr->regs_count; i++) { struct ir3_register *reg = instr->regs[i]; printf(i ? ", " : " "); - print_reg_name(reg, !!i); + print_reg_name(reg); } if (instr->address) { @@ -168,13 +177,6 @@ print_instr(struct ir3_instruction *instr, int lvl) printf("]"); } - if (instr->fanin) { - printf(", fanin=_"); - printf("["); - print_instr_name(instr->fanin); - printf("]"); - } - if (instr->cp.left) { printf(", left=_"); printf("["); @@ -192,8 +194,6 @@ print_instr(struct ir3_instruction *instr, int lvl) if (is_meta(instr)) { if (instr->opc == OPC_META_FO) { printf(", off=%d", instr->fo.off); - } else if ((instr->opc == OPC_META_FI) && instr->fi.aid) { - printf(", aid=%d", instr->fi.aid); } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c index 74755eb3bc0..2ed78818e61 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c @@ -68,25 +68,24 @@ * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after * register assignment. But for us that is horrible from a scheduling * standpoint. Instead what we do is use idea of 'definer' instruction. - * Ie. the first instruction (lowest ip) to write to the array is the + * Ie. the first instruction (lowest ip) to write to the variable is the * one we consider from use/def perspective when building interference - * graph. (Other instructions which write other array elements just - * define the variable some more.) + * graph. (Other instructions which write other variable components + * just define the variable some more.) + * + * Arrays of arbitrary size are handled via pre-coloring a consecutive + * sequence of registers. Additional scalar (single component) reg + * names are allocated starting at ctx->class_base[total_class_count] + * (see arr->base), which are pre-colored. In the use/def graph direct + * access is treated as a single element use/def, and indirect access + * is treated as use or def of all array elements. (Only the first + * def is tracked, in case of multiple indirect writes, etc.) */ static const unsigned class_sizes[] = { 1, 2, 3, 4, 4 + 4, /* txd + 1d/2d */ 4 + 6, /* txd + 3d */ - /* temporary: until we can assign arrays, create classes so we - * can round up array to fit. 
NOTE with tgsi arrays should - * really all be multiples of four: - */ - 4 * 4, - 4 * 8, - 4 * 16, - 4 * 32, - }; #define class_count ARRAY_SIZE(class_sizes) @@ -265,13 +264,21 @@ struct ir3_ra_ctx { struct ir3_ra_reg_set *set; struct ra_graph *g; unsigned alloc_count; - unsigned class_alloc_count[total_class_count]; - unsigned class_base[total_class_count]; + /* one per class, plus one slot for arrays: */ + unsigned class_alloc_count[total_class_count + 1]; + unsigned class_base[total_class_count + 1]; unsigned instr_cnt; unsigned *def, *use; /* def/use table */ struct ir3_ra_instr_data *instrd; }; +/* does it conflict? */ +static inline bool +intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end) +{ + return !((a_start >= b_end) || (b_start >= a_end)); +} + static bool is_half(struct ir3_instruction *instr) { @@ -329,9 +336,6 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; struct ir3_instruction *d = NULL; - if (instr->fanin) - return get_definer(ctx, instr->fanin, sz, off); - if (id->defn) { *sz = id->sz; *off = id->off; @@ -485,10 +489,13 @@ ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block) /* couple special cases: */ if (writes_addr(instr) || writes_pred(instr)) { id->cls = -1; - continue; + } else if (instr->regs[0]->flags & IR3_REG_ARRAY) { + id->cls = total_class_count; + id->defn = instr; + } else { + id->defn = get_definer(ctx, instr, &id->sz, &id->off); + id->cls = size_to_class(id->sz, is_half(id->defn)); } - id->defn = get_definer(ctx, instr, &id->sz, &id->off); - id->cls = size_to_class(id->sz, is_half(id->defn)); } } @@ -518,8 +525,6 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) /* arrays which don't fit in one of the pre-defined class * sizes are pre-colored: - * - * TODO but we still need to allocate names for them, don't we?? */ if (id->cls >= 0) { instr->name = ctx->class_alloc_count[id->cls]++; @@ -531,7 +536,7 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) static void ra_init(struct ir3_ra_ctx *ctx) { - unsigned n; + unsigned n, base; ir3_clear_mark(ctx->ir); n = ir3_count_instructions(ctx->ir); @@ -550,11 +555,20 @@ ra_init(struct ir3_ra_ctx *ctx) * actual ra name is class_base[cls] + instr->name; */ ctx->class_base[0] = 0; - for (unsigned i = 1; i < total_class_count; i++) { + for (unsigned i = 1; i <= total_class_count; i++) { ctx->class_base[i] = ctx->class_base[i-1] + ctx->class_alloc_count[i-1]; } + /* and vreg names for array elements: */ + base = ctx->class_base[total_class_count]; + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + arr->base = base; + ctx->class_alloc_count[total_class_count] += arr->length; + base += arr->length; + } + ctx->alloc_count += ctx->class_alloc_count[total_class_count]; + ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count); ralloc_steal(ctx->g, ctx->instrd); ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); @@ -562,15 +576,23 @@ ra_init(struct ir3_ra_ctx *ctx) } static unsigned -ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn) +__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn) { unsigned name; debug_assert(cls >= 0); + debug_assert(cls < total_class_count); /* we shouldn't get arrays here.. 
*/ name = ctx->class_base[cls] + defn->name; debug_assert(name < ctx->alloc_count); return name; } +static int +ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id) +{ + /* TODO handle name mapping for arrays */ + return __ra_name(ctx, id->cls, id->defn); +} + static void ra_destroy(struct ir3_ra_ctx *ctx) { @@ -583,6 +605,22 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) struct ir3_ra_block_data *bd; unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + void def(unsigned name, struct ir3_instruction *instr) + { + /* defined on first write: */ + if (!ctx->def[name]) + ctx->def[name] = instr->ip; + ctx->use[name] = instr->ip; + BITSET_SET(bd->def, name); + } + + void use(unsigned name, struct ir3_instruction *instr) + { + ctx->use[name] = MAX2(ctx->use[name], instr->ip); + if (!BITSET_TEST(bd->def, name)) + BITSET_SET(bd->use, name); + } + bd = rzalloc(ctx->g, struct ir3_ra_block_data); bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words); @@ -594,6 +632,7 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { struct ir3_instruction *src; + struct ir3_register *reg; if (instr->regs_count == 0) continue; @@ -625,61 +664,101 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) if (writes_gpr(instr)) { struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + struct ir3_register *dst = instr->regs[0]; - if (id->defn == instr) { - /* arrays which don't fit in one of the pre-defined class - * sizes are pre-colored: + if (dst->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, dst->array.id); + unsigned i; + + debug_assert(!(dst->flags & IR3_REG_PHI_SRC)); + + arr->start_ip = MIN2(arr->start_ip, instr->ip); + arr->end_ip = MAX2(arr->end_ip, instr->ip); + + /* set the node class now.. in case we don't encounter + * this array dst again. 
From register_alloc algo's + * perspective, these are all single/scalar regs: */ - if (id->cls >= 0) { - unsigned name = ra_name(ctx, id->cls, id->defn); + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + ra_set_node_class(ctx->g, name, ctx->set->classes[0]); + } - ctx->def[name] = id->defn->ip; - ctx->use[name] = id->defn->ip; - - /* since we are in SSA at this point: */ - debug_assert(!BITSET_TEST(bd->use, name)); - - BITSET_SET(bd->def, name); - - if (is_half(id->defn)) { - ra_set_node_class(ctx->g, name, - ctx->set->half_classes[id->cls - class_count]); - } else { - ra_set_node_class(ctx->g, name, - ctx->set->classes[id->cls]); + /* indirect write is treated like a write to all array + * elements, since we don't know which one is actually + * written: + */ + if (dst->flags & IR3_REG_RELATIV) { + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + def(name, instr); } + } else { + unsigned name = arr->base + dst->array.offset; + def(name, instr); + } - /* extend the live range for phi srcs, which may come - * from the bottom of the loop - */ - if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) { - struct ir3_instruction *phi = id->defn->regs[0]->instr; - foreach_ssa_src(src, phi) { - /* if src is after phi, then we need to extend - * the liverange to the end of src's block: - */ - if (src->ip > phi->ip) { - struct ir3_instruction *last = + } else if (id->defn == instr) { + unsigned name = ra_name(ctx, id); + + /* since we are in SSA at this point: */ + debug_assert(!BITSET_TEST(bd->use, name)); + + def(name, id->defn); + + if (is_half(id->defn)) { + ra_set_node_class(ctx->g, name, + ctx->set->half_classes[id->cls - class_count]); + } else { + ra_set_node_class(ctx->g, name, + ctx->set->classes[id->cls]); + } + + /* extend the live range for phi srcs, which may come + * from the bottom of the loop + */ + if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) { + struct ir3_instruction *phi = id->defn->regs[0]->instr; + foreach_ssa_src(src, phi) { + /* if src is after phi, then we need to extend + * the liverange to the end of src's block: + */ + if (src->ip > phi->ip) { + struct ir3_instruction *last = list_last_entry(&src->block->instr_list, - struct ir3_instruction, node); - ctx->use[name] = MAX2(ctx->use[name], last->ip); - } + struct ir3_instruction, node); + ctx->use[name] = MAX2(ctx->use[name], last->ip); } } } } } - foreach_ssa_src(src, instr) { - if (writes_gpr(src)) { - struct ir3_ra_instr_data *id = &ctx->instrd[src->ip]; - - if (id->cls >= 0) { - unsigned name = ra_name(ctx, id->cls, id->defn); - ctx->use[name] = MAX2(ctx->use[name], instr->ip); - if (!BITSET_TEST(bd->def, name)) - BITSET_SET(bd->use, name); + foreach_src(reg, instr) { + if (reg->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, reg->array.id); + arr->start_ip = MIN2(arr->start_ip, instr->ip); + arr->end_ip = MAX2(arr->end_ip, instr->ip); + /* indirect read is treated like a read from all array + * elements, since we don't know which one is actually + * read: + */ + if (reg->flags & IR3_REG_RELATIV) { + unsigned i; + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + use(name, instr); + } + } else { + unsigned name = arr->base + reg->array.offset; + use(name, instr); + debug_assert(reg->array.offset < arr->length); } + } else if ((src = ssa(reg)) && writes_gpr(src)) { + unsigned name = ra_name(ctx, &ctx->instrd[src->ip]); + use(name, instr); } } } @@ -735,6 +814,12 @@ ra_add_interference(struct ir3_ra_ctx *ctx) { struct ir3 *ir = 
ctx->ir; + /* initialize array live ranges: */ + list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) { + arr->start_ip = ~0; + arr->end_ip = 0; + } + /* compute live ranges (use/def) on a block level, also updating * block's def/use bitmasks (used below to calculate per-block * livein/liveout): @@ -767,18 +852,14 @@ ra_add_interference(struct ir3_ra_ctx *ctx) /* need to fix things up to keep outputs live: */ for (unsigned i = 0; i < ir->noutputs; i++) { struct ir3_instruction *instr = ir->outputs[i]; - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - - if (id->cls >= 0) { - unsigned name = ra_name(ctx, id->cls, id->defn); - ctx->use[name] = ctx->instr_cnt; - } + unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]); + ctx->use[name] = ctx->instr_cnt; } for (unsigned i = 0; i < ctx->alloc_count; i++) { for (unsigned j = 0; j < ctx->alloc_count; j++) { - if (!((ctx->def[i] >= ctx->use[j]) || - (ctx->def[j] >= ctx->use[i]))) { + if (intersects(ctx->def[i], ctx->use[i], + ctx->def[j], ctx->use[j])) { ra_add_node_interference(ctx->g, i, j); } } @@ -836,19 +917,36 @@ static void fixup_half_instr_src(struct ir3_instruction *instr) } } +/* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first + * array access(es) which do not have any previous access to depend + * on from scheduling point of view + */ static void reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, struct ir3_instruction *instr) { - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + struct ir3_ra_instr_data *id; - if (id->cls >= 0) { - unsigned name = ra_name(ctx, id->cls, id->defn); + if (reg->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, reg->array.id); + unsigned name = arr->base + reg->array.offset; + unsigned r = ra_get_node_reg(ctx->g, name); + unsigned num = ctx->set->ra_reg_to_gpr[r]; + + if (reg->flags & IR3_REG_RELATIV) { + reg->array.offset = num; + } else { + reg->num = num; + } + + reg->flags &= ~IR3_REG_ARRAY; + } else if ((id = &ctx->instrd[instr->ip]) && id->defn) { + unsigned name = ra_name(ctx, id); unsigned r = ra_get_node_reg(ctx->g, name); unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off; - if (reg->flags & IR3_REG_RELATIV) - num += reg->offset; + debug_assert(!(reg->flags & IR3_REG_RELATIV)); reg->num = num; reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC); @@ -875,9 +973,9 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) foreach_src_n(reg, n, instr) { struct ir3_instruction *src = reg->instr; - if (!src) + /* Note: reg->instr could be null for IR3_REG_ARRAY */ + if (!(src || (reg->flags & IR3_REG_ARRAY))) continue; - reg_assign(ctx, instr->regs[n+1], src); if (instr->regs[n+1]->flags & IR3_REG_HALF) fixup_half_instr_src(instr); @@ -888,6 +986,8 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) static int ra_alloc(struct ir3_ra_ctx *ctx) { + unsigned n = 0; + /* frag shader inputs get pre-assigned, since we have some * constraints/unknowns about setup for some of these regs: */ @@ -897,7 +997,7 @@ ra_alloc(struct ir3_ra_ctx *ctx) if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) { struct ir3_instruction *instr = ir->inputs[i]; int cls = size_to_class(1, true); - unsigned name = ra_name(ctx, cls, instr); + unsigned name = __ra_name(ctx, cls, instr); unsigned reg = ctx->set->gpr_to_ra_reg[cls][0]; /* if we have frag_face, it gets hr0.x */ @@ -905,7 +1005,8 @@ ra_alloc(struct ir3_ra_ctx *ctx) i += 4; } - for (j = 0; i < ir->ninputs; i++) { + j = 0; + for (; i < ir->ninputs; i++) { struct 
ir3_instruction *instr = ir->inputs[i]; if (instr) { struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; @@ -913,7 +1014,7 @@ ra_alloc(struct ir3_ra_ctx *ctx) if (id->defn == instr) { unsigned name, reg; - name = ra_name(ctx, id->cls, id->defn); + name = ra_name(ctx, id); reg = ctx->set->gpr_to_ra_reg[id->cls][j]; ra_set_node_reg(ctx->g, name, reg); @@ -921,6 +1022,46 @@ ra_alloc(struct ir3_ra_ctx *ctx) } } } + n = j; + } + + /* pre-assign array elements: + */ + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + unsigned base = n; + + if (arr->end_ip == 0) + continue; + + /* figure out what else we conflict with which has already + * been assigned: + */ +retry: + list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) { + if (arr2 == arr) + break; + if (arr2->end_ip == 0) + continue; + /* if it intersects with liverange AND register range.. */ + if (intersects(arr->start_ip, arr->end_ip, + arr2->start_ip, arr2->end_ip) && + intersects(base, base + arr->length, + arr2->reg, arr2->reg + arr2->length)) { + base = MAX2(base, arr2->reg + arr2->length); + goto retry; + } + } + + arr->reg = base; + + for (unsigned i = 0; i < arr->length; i++) { + unsigned name, reg; + + name = arr->base + i; + reg = ctx->set->gpr_to_ra_reg[0][base++]; + + ra_set_node_reg(ctx->g, name, reg); + } } if (!ra_allocate(ctx->g)) diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c index 6aaa16edbfe..8f640febc5d 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -187,6 +187,9 @@ delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) foreach_ssa_src_n(src, i, instr) { unsigned d; + /* for array writes, no need to delay on previous write: */ + if (i == 0) + continue; if (src->block != instr->block) continue; d = delay_calc_srcn(ctx, src, instr, i); diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index ede5558a11e..6b0ab587001 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -261,6 +261,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: diff --git a/src/gallium/drivers/i915/i915_state_derived.c b/src/gallium/drivers/i915/i915_state_derived.c index bd0f448f645..177b8545985 100644 --- a/src/gallium/drivers/i915/i915_state_derived.c +++ b/src/gallium/drivers/i915/i915_state_derived.c @@ -184,7 +184,7 @@ static void calculate_vertex_layout(struct i915_context *i915) struct i915_tracked_state i915_update_vertex_layout = { "vertex_layout", calculate_vertex_layout, - I915_NEW_FS | I915_NEW_VS + I915_NEW_RASTERIZER | I915_NEW_FS | I915_NEW_VS }; diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c index fa327571b9b..5171cca9ea6 100644 --- a/src/gallium/drivers/ilo/ilo_screen.c +++ b/src/gallium/drivers/ilo/ilo_screen.c @@ -485,6 +485,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h index 62d99bbaac8..d4bd02d0225 100644 --- 
a/src/gallium/drivers/llvmpipe/lp_context.h +++ b/src/gallium/drivers/llvmpipe/lp_context.h @@ -82,8 +82,6 @@ struct llvmpipe_context { struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS]; struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; struct pipe_index_buffer index_buffer; - struct pipe_resource *mapped_vs_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS]; - struct pipe_resource *mapped_gs_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS]; unsigned num_samplers[PIPE_SHADER_TYPES]; unsigned num_sampler_views[PIPE_SHADER_TYPES]; diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c index edfb2040969..22ef5fc17f9 100644 --- a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c +++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c @@ -149,9 +149,6 @@ llvmpipe_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) draw_vs_reset_so(lp->vs); } } - - llvmpipe_cleanup_vertex_sampling(lp); - llvmpipe_cleanup_geometry_sampling(lp); /* * TODO: Flush only when a user vertex/index buffer is present diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index fb52f5dc063..879a2e7d2f0 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -310,6 +310,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index bd850519468..34d3c812b60 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -476,27 +476,30 @@ lp_setup_try_clear_zs(struct lp_setup_context *setup, uint64_t zsvalue = 0; uint32_t zmask32; uint8_t smask8; + enum pipe_format format = setup->fb.zsbuf->format; LP_DBG(DEBUG_SETUP, "%s state %d\n", __FUNCTION__, setup->state); zmask32 = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0; smask8 = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0; - zsvalue = util_pack64_z_stencil(setup->fb.zsbuf->format, - depth, - stencil); + zsvalue = util_pack64_z_stencil(format, depth, stencil); - /* - * XXX: should make a full mask here for things like D24X8, - * otherwise we'll do a read-modify-write clear later which - * should be unnecessary. - */ - zsmask = util_pack64_mask_z_stencil(setup->fb.zsbuf->format, - zmask32, - smask8); + zsmask = util_pack64_mask_z_stencil(format, zmask32, smask8); zsvalue &= zsmask; + if (format == PIPE_FORMAT_Z24X8_UNORM || + format == PIPE_FORMAT_X8Z24_UNORM) { + /* + * Make full mask if there's "X" bits so we can do full + * clear (without rmw). + */ + uint32_t zsmask_full = 0; + zsmask_full = util_pack_mask_z_stencil(format, ~0, ~0); + zsmask |= ~zsmask_full; + } + if (setup->state == SETUP_ACTIVE) { struct lp_scene *scene = setup->scene; @@ -796,13 +799,15 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup, unsigned num, struct pipe_sampler_view **views) { - unsigned i; + unsigned i, max_tex_num; LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS); - for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) { + max_tex_num = MAX2(num, setup->fs.current_tex_num); + + for (i = 0; i < max_tex_num; i++) { struct pipe_sampler_view *view = i < num ? 
views[i] : NULL; if (view) { @@ -922,7 +927,11 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup, assert(jit_tex->base); } } + else { + pipe_resource_reference(&setup->fs.current_tex[i], NULL); + } } + setup->fs.current_tex_num = num; setup->dirty |= LP_SETUP_NEW_FS; } diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h index 80acd74bddd..03bb8ce2b6f 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_context.h +++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h @@ -133,6 +133,7 @@ struct lp_setup_context const struct lp_rast_state *stored; /**< what's in the scene */ struct lp_rast_state current; /**< currently set state */ struct pipe_resource *current_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + unsigned current_tex_num; } fs; /** fragment shader constants */ diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index aa241761586..907129dbd1b 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -556,7 +556,7 @@ do_triangle_ccw(struct lp_setup_context *setup, /* Calculate trivial reject values: */ - eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy), + eo = vec_sub_epi32(vec_andnot_si128(dcdy_neg_mask, dcdy), vec_and(dcdx_neg_mask, dcdx)); /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */ diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h index 2da6caaef16..78918cf984d 100644 --- a/src/gallium/drivers/llvmpipe/lp_state.h +++ b/src/gallium/drivers/llvmpipe/lp_state.h @@ -130,16 +130,10 @@ void llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *ctx, unsigned num, struct pipe_sampler_view **views); -void -llvmpipe_cleanup_vertex_sampling(struct llvmpipe_context *ctx); - void llvmpipe_prepare_geometry_sampling(struct llvmpipe_context *ctx, unsigned num, struct pipe_sampler_view **views); -void -llvmpipe_cleanup_geometry_sampling(struct llvmpipe_context *ctx); - #endif diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c index 34961cbbac5..c90f2f270fe 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_derived.c +++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c @@ -190,8 +190,10 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe ) llvmpipe->tex_timestamp = lp_screen->timestamp; llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW; } - - if (llvmpipe->dirty & (LP_NEW_FS | + + /* This needs LP_NEW_RASTERIZER because of draw_prepare_shader_outputs(). */ + if (llvmpipe->dirty & (LP_NEW_RASTERIZER | + LP_NEW_FS | LP_NEW_VS)) compute_vertex_info(llvmpipe); diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c index 1e055878f7c..32bf9fdd25d 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c +++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c @@ -98,8 +98,9 @@ llvmpipe_bind_sampler_states(struct pipe_context *pipe, llvmpipe->samplers[shader], llvmpipe->num_samplers[shader]); } - - llvmpipe->dirty |= LP_NEW_SAMPLER; + else { + llvmpipe->dirty |= LP_NEW_SAMPLER; + } } @@ -128,6 +129,15 @@ llvmpipe_set_sampler_views(struct pipe_context *pipe, */ pipe_sampler_view_release(pipe, &llvmpipe->sampler_views[shader][start + i]); + /* + * Warn if someone tries to set a view created in a different context + * (which is why we need the hack above in the first place). + * An assert would be better but st/mesa relies on it... 
+ */ + if (views[i] && views[i]->context != pipe) { + debug_printf("Illegal setting of sampler_view %d created in another " + "context\n", i); + } pipe_sampler_view_reference(&llvmpipe->sampler_views[shader][start + i], views[i]); } @@ -146,8 +156,9 @@ llvmpipe_set_sampler_views(struct pipe_context *pipe, llvmpipe->sampler_views[shader], llvmpipe->num_sampler_views[shader]); } - - llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW; + else { + llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW; + } } @@ -228,8 +239,7 @@ prepare_shader_sampling( struct llvmpipe_context *lp, unsigned num, struct pipe_sampler_view **views, - unsigned shader_type, - struct pipe_resource *mapped_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS]) + unsigned shader_type) { unsigned i; @@ -242,7 +252,7 @@ prepare_shader_sampling( if (!num) return; - for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) { + for (i = 0; i < num; i++) { struct pipe_sampler_view *view = i < num ? views[i] : NULL; if (view) { @@ -253,11 +263,6 @@ prepare_shader_sampling( unsigned first_level = 0; unsigned last_level = 0; - /* We're referencing the texture's internal data, so save a - * reference to it. - */ - pipe_resource_reference(&mapped_tex[i], tex); - if (!lp_tex->dt) { /* regular texture - setup array of mipmap level offsets */ struct pipe_resource *res = view->texture; @@ -335,47 +340,28 @@ prepare_shader_sampling( /** - * Called during state validation when LP_NEW_SAMPLER_VIEW is set. + * Called whenever we're about to draw (no dirty flag, FIXME?). */ void llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *lp, unsigned num, struct pipe_sampler_view **views) { - prepare_shader_sampling(lp, num, views, PIPE_SHADER_VERTEX, - lp->mapped_vs_tex); -} - -void -llvmpipe_cleanup_vertex_sampling(struct llvmpipe_context *ctx) -{ - unsigned i; - for (i = 0; i < Elements(ctx->mapped_vs_tex); i++) { - pipe_resource_reference(&ctx->mapped_vs_tex[i], NULL); - } + prepare_shader_sampling(lp, num, views, PIPE_SHADER_VERTEX); } /** - * Called during state validation when LP_NEW_SAMPLER_VIEW is set. + * Called whenever we're about to draw (no dirty flag, FIXME?). */ void llvmpipe_prepare_geometry_sampling(struct llvmpipe_context *lp, unsigned num, struct pipe_sampler_view **views) { - prepare_shader_sampling(lp, num, views, PIPE_SHADER_GEOMETRY, - lp->mapped_gs_tex); + prepare_shader_sampling(lp, num, views, PIPE_SHADER_GEOMETRY); } -void -llvmpipe_cleanup_geometry_sampling(struct llvmpipe_context *ctx) -{ - unsigned i; - for (i = 0; i < Elements(ctx->mapped_gs_tex); i++) { - pipe_resource_reference(&ctx->mapped_gs_tex[i], NULL); - } -} void llvmpipe_init_sampler_funcs(struct llvmpipe_context *llvmpipe) diff --git a/src/gallium/drivers/llvmpipe/lp_state_so.c b/src/gallium/drivers/llvmpipe/lp_state_so.c index 2af04cdf1c3..b2afd6fbf70 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_so.c +++ b/src/gallium/drivers/llvmpipe/lp_state_so.c @@ -70,6 +70,15 @@ llvmpipe_set_so_targets(struct pipe_context *pipe, int i; for (i = 0; i < num_targets; i++) { const boolean append = (offsets[i] == (unsigned)-1); + /* + * Warn if the so target was created in another context. + * XXX Not entirely sure if mesa/st may rely on this? + * Otherwise should just assert. 
+ */ + if (targets[i] && targets[i]->context != pipe) { + debug_printf("Illegal setting of so target with target %d created in " + "another context\n", i); + } pipe_so_target_reference((struct pipe_stream_output_target **)&llvmpipe->so_targets[i], targets[i]); /* If we're not appending then lets set the internal offset to what was requested */ diff --git a/src/gallium/drivers/llvmpipe/lp_state_surface.c b/src/gallium/drivers/llvmpipe/lp_state_surface.c index c879ba9751d..b20b9c5cdd5 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_surface.c +++ b/src/gallium/drivers/llvmpipe/lp_state_surface.c @@ -52,6 +52,7 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe, struct llvmpipe_context *lp = llvmpipe_context(pipe); boolean changed = !util_framebuffer_state_equal(&lp->framebuffer, fb); + unsigned i; assert(fb->width <= LP_MAX_WIDTH); assert(fb->height <= LP_MAX_HEIGHT); @@ -66,10 +67,22 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe, const struct util_format_description *depth_desc = util_format_description(depth_format); + if (lp->framebuffer.zsbuf && lp->framebuffer.zsbuf->context != pipe) { + debug_printf("Illegal setting of fb state with zsbuf created in " + "another context\n"); + } + for (i = 0; i < fb->nr_cbufs; i++) { + if (lp->framebuffer.cbufs[i] && + lp->framebuffer.cbufs[i]->context != pipe) { + debug_printf("Illegal setting of fb state with cbuf %d created in " + "another context\n", i); + } + } + util_copy_framebuffer_state(&lp->framebuffer, fb); if (LP_PERF & PERF_NO_DEPTH) { - pipe_surface_reference(&lp->framebuffer.zsbuf, NULL); + pipe_surface_reference(&lp->framebuffer.zsbuf, NULL); } /* diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp index 1bf7240e131..f58cf97646e 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp @@ -615,6 +615,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, case FILE_MEMORY_CONST: case FILE_MEMORY_SHARED: case FILE_SHADER_INPUT: + case FILE_SHADER_OUTPUT: hi->getSrc(s)->reg.data.offset += 4; break; default: @@ -625,7 +626,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, } } if (srcNr == 2) { - lo->setDef(1, carry); + lo->setFlagsDef(1, carry); hi->setFlagsSrc(hi->srcCount(), carry); } return hi; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index b1064bf0a92..17cb484d2ba 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -75,7 +75,8 @@ private: void emitLOAD(const Instruction *); void emitSTORE(const Instruction *); void emitMOV(const Instruction *); - void emitMEMBAR(const Instruction *); + void emitATOM(const Instruction *); + void emitCCTL(const Instruction *); void emitINTERP(const Instruction *); void emitAFETCH(const Instruction *); @@ -123,6 +124,7 @@ private: void emitPIXLD(const Instruction *); void emitBAR(const Instruction *); + void emitMEMBAR(const Instruction *); void emitFlow(const Instruction *); @@ -698,6 +700,10 @@ CodeEmitterGK110::emitIMAD(const Instruction *i) if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) code[1] |= 1 << 25; + + if (i->flagsDef >= 0) code[1] |= 1 << 18; + if (i->flagsSrc >= 0) code[1] |= 1 << 20; + SAT_(35); } @@ -1252,8 +1258,32 @@ CodeEmitterGK110::emitPIXLD(const Instruction *i) void CodeEmitterGK110::emitBAR(const 
Instruction *i) { - /* TODO */ - emitNOP(i); + code[0] = 0x00000002; + code[1] = 0x85400000; + + switch (i->subOp) { + case NV50_IR_SUBOP_BAR_ARRIVE: code[1] |= 0x08; break; + case NV50_IR_SUBOP_BAR_RED_AND: code[1] |= 0x50; break; + case NV50_IR_SUBOP_BAR_RED_OR: code[1] |= 0x90; break; + case NV50_IR_SUBOP_BAR_RED_POPC: code[1] |= 0x10; break; + default: + code[1] |= 0x20; + assert(i->subOp == NV50_IR_SUBOP_BAR_SYNC); + break; + } + + emitPredicate(i); + + srcId(i->src(0), 10); + srcId(i->src(1), 23); +} + +void CodeEmitterGK110::emitMEMBAR(const Instruction *i) +{ + code[0] = 0x00000002 | NV50_IR_SUBOP_MEMBAR_SCOPE(i->subOp) << 8; + code[1] = 0x7cc00000; + + emitPredicate(i); } void @@ -1587,6 +1617,10 @@ CodeEmitterGK110::emitSTORE(const Instruction *i) srcId(i->src(1), 2); srcId(i->src(0).getIndirect(0), 10); + if (i->src(0).getFile() == FILE_MEMORY_GLOBAL && + i->src(0).isIndirect(0) && + i->getIndirect(0, 0)->reg.size == 8) + code[1] |= 1 << 23; } void @@ -1597,7 +1631,7 @@ CodeEmitterGK110::emitLOAD(const Instruction *i) switch (i->src(0).getFile()) { case FILE_MEMORY_GLOBAL: code[1] = 0xc0000000; code[0] = 0x00000000; break; case FILE_MEMORY_LOCAL: code[1] = 0x7a000000; code[0] = 0x00000002; break; - case FILE_MEMORY_SHARED: code[1] = 0x7ac00000; code[0] = 0x00000002; break; + case FILE_MEMORY_SHARED: code[1] = 0x7a400000; code[0] = 0x00000002; break; case FILE_MEMORY_CONST: if (!i->src(0).isIndirect(0) && typeSizeof(i->dType) == 4) { emitMOV(i); @@ -1628,7 +1662,13 @@ CodeEmitterGK110::emitLOAD(const Instruction *i) emitPredicate(i); defId(i->def(0), 2); - srcId(i->src(0).getIndirect(0), 10); + if (i->getIndirect(0, 0)) { + srcId(i->src(0).getIndirect(0), 10); + if (i->getIndirect(0, 0)->reg.size == 8) + code[1] |= 1 << 23; + } else { + code[0] |= 255 << 10; + } } uint8_t @@ -1683,10 +1723,83 @@ CodeEmitterGK110::emitMOV(const Instruction *i) } } -void CodeEmitterGK110::emitMEMBAR(const Instruction *i) +static inline bool +uses64bitAddress(const Instruction *ldst) { - code[0] = 0x00000002 | NV50_IR_SUBOP_MEMBAR_SCOPE(i->subOp) << 8; - code[1] = 0x7cc00000; + return ldst->src(0).getFile() == FILE_MEMORY_GLOBAL && + ldst->src(0).isIndirect(0) && + ldst->getIndirect(0, 0)->reg.size == 8; +} + +void +CodeEmitterGK110::emitATOM(const Instruction *i) +{ + code[0] = 0x00000002; + if (i->subOp == NV50_IR_SUBOP_ATOM_CAS) + code[1] = 0x77800000; + else + code[1] = 0x68000000; + + switch (i->subOp) { + case NV50_IR_SUBOP_ATOM_CAS: break; + case NV50_IR_SUBOP_ATOM_EXCH: code[1] |= 0x04000000; break; + default: code[1] |= i->subOp << 23; break; + } + + switch (i->dType) { + case TYPE_U32: break; + case TYPE_S32: code[1] |= 0x00100000; break; + case TYPE_U64: code[1] |= 0x00200000; break; + case TYPE_F32: code[1] |= 0x00300000; break; + case TYPE_B128: code[1] |= 0x00400000; break; /* TODO: U128 */ + case TYPE_S64: code[1] |= 0x00500000; break; + default: assert(!"unsupported type"); break; + } + + emitPredicate(i); + + /* TODO: cas: check that src regs line up */ + /* TODO: cas: flip bits if $r255 is used */ + srcId(i->src(1), 23); + + if (i->defExists(0)) + defId(i->def(0), 2); + else + code[0] |= 255 << 2; + + const int32_t offset = SDATA(i->src(0)).offset; + assert(offset < 0x80000 && offset >= -0x80000); + code[0] |= (offset & 1) << 31; + code[1] |= (offset & 0xffffe) >> 1; + + if (i->getIndirect(0, 0)) { + srcId(i->getIndirect(0, 0), 10); + if (i->getIndirect(0, 0)->reg.size == 8) + code[1] |= 1 << 19; + } else { + code[0] |= 255 << 10; + } +} + +void +CodeEmitterGK110::emitCCTL(const 
Instruction *i) +{ + int32_t offset = SDATA(i->src(0)).offset; + + code[0] = 0x00000002 | (i->subOp << 2); + + if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) { + code[1] = 0x7b000000; + } else { + code[1] = 0x7c000000; + offset &= 0xffffff; + } + code[0] |= offset << 23; + code[1] |= offset >> 9; + + if (uses64bitAddress(i)) + code[1] |= 1 << 23; + srcId(i->src(0).getIndirect(0), 10); emitPredicate(i); } @@ -1925,6 +2038,12 @@ CodeEmitterGK110::emitInstruction(Instruction *insn) case OP_MEMBAR: emitMEMBAR(insn); break; + case OP_ATOM: + emitATOM(insn); + break; + case OP_CCTL: + emitCCTL(insn); + break; case OP_PHI: case OP_UNION: case OP_CONSTRAINT: diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index ec74e7ac811..1fa0eb6da6d 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -176,6 +176,8 @@ private: void emitISBERD(); void emitAL2P(); void emitIPA(); + void emitATOM(); + void emitCCTL(); void emitPIXLD(); @@ -1552,11 +1554,13 @@ CodeEmitterGM107::emitLOP() break; } emitPRED (0x30); + emitX (0x2b); emitField(0x29, 2, lop); emitINV (0x28, insn->src(1)); emitINV (0x27, insn->src(0)); } else { emitInsn (0x04000000); + emitX (0x39); emitINV (0x38, insn->src(1)); emitINV (0x37, insn->src(0)); emitField(0x35, 2, lop); @@ -1624,9 +1628,11 @@ CodeEmitterGM107::emitIADD() emitNEG(0x31, insn->src(0)); emitNEG(0x30, insn->src(1)); emitCC (0x2f); + emitX (0x2b); } else { emitInsn(0x1c000000); emitSAT (0x36); + emitX (0x35); emitCC (0x34); emitIMMD(0x14, 32, insn->src(1)); } @@ -2146,6 +2152,7 @@ CodeEmitterGM107::emitLD() emitPRED (0x3a); emitLDSTc(0x38); emitLDSTs(0x35, insn->dType); + emitField(0x34, 1, insn->src(0).getIndirect(0)->getSize() == 8); emitADDR (0x08, 0x14, 32, 0, insn->src(0)); emitGPR (0x00, insn->def(0)); } @@ -2176,6 +2183,7 @@ CodeEmitterGM107::emitST() emitPRED (0x3a); emitLDSTc(0x38); emitLDSTs(0x35, insn->dType); + emitField(0x34, 1, insn->src(0).getIndirect(0)->getSize() == 8); emitADDR (0x08, 0x14, 32, 0, insn->src(0)); emitGPR (0x00, insn->src(1)); } @@ -2296,6 +2304,50 @@ CodeEmitterGM107::emitIPA() emitGPR(0x27); } +void +CodeEmitterGM107::emitATOM() +{ + unsigned dType, subOp; + switch (insn->dType) { + case TYPE_U32: dType = 0; break; + case TYPE_S32: dType = 1; break; + case TYPE_U64: dType = 2; break; + case TYPE_F32: dType = 3; break; + case TYPE_B128: dType = 4; break; + case TYPE_S64: dType = 5; break; + default: assert(!"unexpected dType"); dType = 0; break; + } + if (insn->subOp == NV50_IR_SUBOP_ATOM_EXCH) + subOp = 8; + else + subOp = insn->subOp; + assert(insn->subOp != NV50_IR_SUBOP_ATOM_CAS); /* XXX */ + + emitInsn (0xed000000); + emitField(0x34, 4, subOp); + emitField(0x31, 3, dType); + emitField(0x30, 1, insn->src(0).getIndirect(0)->getSize() == 8); + emitGPR (0x14, insn->src(1)); + emitADDR (0x08, 0x1c, 20, 0, insn->src(0)); + emitGPR (0x00, insn->def(0)); +} + +void +CodeEmitterGM107::emitCCTL() +{ + unsigned width; + if (insn->src(0).getFile() == FILE_MEMORY_GLOBAL) { + emitInsn(0xef600000); + width = 30; + } else { + emitInsn(0xef800000); + width = 22; + } + emitField(0x34, 1, insn->src(0).getIndirect(0)->getSize() == 8); + emitADDR (0x08, 0x16, width, 2, insn->src(0)); + emitField(0x00, 4, insn->subOp); +} + /******************************************************************************* * surface ******************************************************************************/ @@ 
-2795,6 +2847,12 @@ CodeEmitterGM107::emitInstruction(Instruction *i) break; } break; + case OP_ATOM: + emitATOM(); + break; + case OP_CCTL: + emitCCTL(); + break; case OP_VFETCH: emitALD(); break; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp index c126c085daf..bc8354deba1 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp @@ -1463,6 +1463,7 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp) if (i->encSize == 4) { assert(i->op == OP_RCP); + assert(!i->saturate); code[0] |= i->src(0).mod.abs() << 15; code[0] |= i->src(0).mod.neg() << 22; emitForm_MUL(i); @@ -1470,6 +1471,10 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp) code[1] = subOp << 29; code[1] |= i->src(0).mod.abs() << 20; code[1] |= i->src(0).mod.neg() << 26; + if (i->saturate) { + assert(subOp == 6 && i->op == OP_EX2); + code[1] |= 1 << 27; + } emitForm_MAD(i); } } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 7b313f3c39c..9c4a38f291b 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -95,6 +95,13 @@ public: return tgsi_util_get_src_register_swizzle(®, chan); } + int getArrayId() const + { + if (isIndirect(0)) + return fsr->Indirect.ArrayID; + return 0; + } + nv50_ir::Modifier getMod(int chan) const; SrcRegister getIndirect(int dim) const @@ -154,6 +161,13 @@ public: return SrcRegister(fdr->Indirect); } + int getArrayId() const + { + if (isIndirect(0)) + return fdr->Indirect.ArrayID; + return 0; + } + private: const struct tgsi_dst_register reg; const struct tgsi_full_dst_register *fdr; @@ -809,7 +823,10 @@ public: // these registers are per-subroutine, cannot be used for parameter passing std::set locals; - bool mainTempsInLMem; + std::set indirectTempArrays; + std::map indirectTempOffsets; + std::map > tempArrayInfo; + std::vector tempArrayId; int clipVertexOutput; @@ -841,8 +858,6 @@ Source::Source(struct nv50_ir_prog_info *prog) : info(prog) if (prog->dbgFlags & NV50_IR_DEBUG_BASIC) tgsi_dump(tokens, 0); - - mainTempsInLMem = false; } Source::~Source() @@ -872,6 +887,7 @@ bool Source::scanSource() textureViews.resize(scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1); //resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1); + tempArrayId.resize(scan.file_max[TGSI_FILE_TEMPORARY] + 1); info->immd.bufSize = 0; @@ -917,8 +933,16 @@ bool Source::scanSource() } tgsi_parse_free(&parse); - if (mainTempsInLMem) - info->bin.tlsSpace += (scan.file_max[TGSI_FILE_TEMPORARY] + 1) * 16; + if (indirectTempArrays.size()) { + int tempBase = 0; + for (std::set::const_iterator it = indirectTempArrays.begin(); + it != indirectTempArrays.end(); ++it) { + std::pair& info = tempArrayInfo[*it]; + indirectTempOffsets.insert(std::make_pair(*it, tempBase - info.first)); + tempBase += info.second; + } + info->bin.tlsSpace += tempBase * 16; + } if (info->io.genUserClip > 0) { info->io.clipDistances = info->io.genUserClip; @@ -1028,6 +1052,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) unsigned sn = TGSI_SEMANTIC_GENERIC; unsigned si = 0; const unsigned first = decl->Range.First, last = decl->Range.Last; + const int arrayId = decl->Array.ArrayID; if (decl->Declaration.Semantic) { sn = decl->Semantic.Name; @@ -1172,8 +1197,14 @@ bool 
Source::scanDeclaration(const struct tgsi_full_declaration *decl) for (i = first; i <= last; ++i) textureViews[i].target = decl->SamplerView.Resource; break; - case TGSI_FILE_NULL: case TGSI_FILE_TEMPORARY: + for (i = first; i <= last; ++i) + tempArrayId[i] = arrayId; + if (arrayId) + tempArrayInfo.insert(std::make_pair(arrayId, std::make_pair( + first, last - first + 1))); + break; + case TGSI_FILE_NULL: case TGSI_FILE_ADDRESS: case TGSI_FILE_CONSTANT: case TGSI_FILE_IMMEDIATE: @@ -1223,7 +1254,7 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst) } else if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) { if (insn.getDst(0).isIndirect(0)) - mainTempsInLMem = true; + indirectTempArrays.insert(insn.getDst(0).getArrayId()); } } @@ -1231,7 +1262,7 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst) Instruction::SrcRegister src = insn.getSrc(s); if (src.getFile() == TGSI_FILE_TEMPORARY) { if (src.isIndirect(0)) - mainTempsInLMem = true; + indirectTempArrays.insert(src.getArrayId()); } else /* if (src.getFile() == TGSI_FILE_RESOURCE) { @@ -1337,6 +1368,7 @@ private: void storeDst(const tgsi::Instruction::DstRegister dst, int c, Value *val, Value *ptr); + void adjustTempIndex(int arrayId, int &idx, int &idx2d) const; Value *applySrcMod(Value *, int s, int c); Symbol *makeSym(uint file, int fileIndex, int idx, int c, uint32_t addr); @@ -1416,6 +1448,7 @@ private: DataType srcTy; DataArray tData; // TGSI_FILE_TEMPORARY + DataArray lData; // TGSI_FILE_TEMPORARY, for indirect arrays DataArray aData; // TGSI_FILE_ADDRESS DataArray pData; // TGSI_FILE_PREDICATE DataArray oData; // TGSI_FILE_OUTPUT (if outputs in registers) @@ -1619,7 +1652,7 @@ Converter::getArrayForFile(unsigned file, int idx) { switch (file) { case TGSI_FILE_TEMPORARY: - return &tData; + return idx == 0 ? &tData : &lData; case TGSI_FILE_PREDICATE: return &pData; case TGSI_FILE_ADDRESS: @@ -1641,11 +1674,23 @@ Converter::shiftAddress(Value *index) return mkOp2v(OP_SHL, TYPE_U32, getSSA(4, FILE_ADDRESS), index, mkImm(4)); } +void +Converter::adjustTempIndex(int arrayId, int &idx, int &idx2d) const +{ + std::map::const_iterator it = + code->indirectTempOffsets.find(arrayId); + if (it == code->indirectTempOffsets.end()) + return; + + idx2d = 1; + idx += it->second; +} + Value * Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr) { - const int idx2d = src.is2D() ? src.getIndex(1) : 0; - const int idx = src.getIndex(0); + int idx2d = src.is2D() ? src.getIndex(1) : 0; + int idx = src.getIndex(0); const int swz = src.getSwizzle(c); Instruction *ld; @@ -1686,6 +1731,13 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr) ld = mkOp1(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c)); ld->perPatch = info->sv[idx].patch; return ld->getDef(0); + case TGSI_FILE_TEMPORARY: { + int arrayid = src.getArrayId(); + if (!arrayid) + arrayid = code->tempArrayId[idx]; + adjustTempIndex(arrayid, idx, idx2d); + } + /* fallthrough */ default: return getArrayForFile(src.getFile(), idx2d)->load( sub.cur->values, idx, swz, shiftAddress(ptr)); @@ -1697,8 +1749,8 @@ Converter::acquireDst(int d, int c) { const tgsi::Instruction::DstRegister dst = tgsi.getDst(d); const unsigned f = dst.getFile(); - const int idx = dst.getIndex(0); - const int idx2d = dst.is2D() ? dst.getIndex(1) : 0; + int idx = dst.getIndex(0); + int idx2d = dst.is2D() ? 
dst.getIndex(1) : 0; if (dst.isMasked(c)/* || f == TGSI_FILE_RESOURCE*/) return NULL; @@ -1708,6 +1760,13 @@ Converter::acquireDst(int d, int c) (f == TGSI_FILE_OUTPUT && prog->getType() != Program::TYPE_FRAGMENT)) return getScratch(); + if (f == TGSI_FILE_TEMPORARY) { + int arrayid = dst.getArrayId(); + if (!arrayid) + arrayid = code->tempArrayId[idx]; + adjustTempIndex(arrayid, idx, idx2d); + } + return getArrayForFile(f, idx2d)-> acquire(sub.cur->values, idx, c); } @@ -1739,8 +1798,8 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c, Value *val, Value *ptr) { const unsigned f = dst.getFile(); - const int idx = dst.getIndex(0); - const int idx2d = dst.is2D() ? dst.getIndex(1) : 0; + int idx = dst.getIndex(0); + int idx2d = dst.is2D() ? dst.getIndex(1) : 0; if (f == TGSI_FILE_SYSTEM_VALUE) { assert(!ptr); @@ -1763,6 +1822,13 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c, f == TGSI_FILE_PREDICATE || f == TGSI_FILE_ADDRESS || f == TGSI_FILE_OUTPUT) { + if (f == TGSI_FILE_TEMPORARY) { + int arrayid = dst.getArrayId(); + if (!arrayid) + arrayid = code->tempArrayId[idx]; + adjustTempIndex(arrayid, idx, idx2d); + } + getArrayForFile(f, idx2d)->store(sub.cur->values, idx, c, ptr, val); } else { assert(!"invalid dst file"); @@ -3326,18 +3392,17 @@ Converter::exportOutputs() Converter::Converter(Program *ir, const tgsi::Source *code) : BuildUtil(ir), code(code), tgsi(NULL), - tData(this), aData(this), pData(this), oData(this) + tData(this), lData(this), aData(this), pData(this), oData(this) { info = code->info; - const DataFile tFile = code->mainTempsInLMem ? FILE_MEMORY_LOCAL : FILE_GPR; - const unsigned tSize = code->fileSize(TGSI_FILE_TEMPORARY); const unsigned pSize = code->fileSize(TGSI_FILE_PREDICATE); const unsigned aSize = code->fileSize(TGSI_FILE_ADDRESS); const unsigned oSize = code->fileSize(TGSI_FILE_OUTPUT); - tData.setup(TGSI_FILE_TEMPORARY, 0, 0, tSize, 4, 4, tFile, 0); + tData.setup(TGSI_FILE_TEMPORARY, 0, 0, tSize, 4, 4, FILE_GPR, 0); + lData.setup(TGSI_FILE_TEMPORARY, 1, 0, tSize, 4, 4, FILE_MEMORY_LOCAL, 0); pData.setup(TGSI_FILE_PREDICATE, 0, 0, pSize, 4, 4, FILE_PREDICATE, 0); aData.setup(TGSI_FILE_ADDRESS, 0, 0, aSize, 4, 4, FILE_GPR, 0); oData.setup(TGSI_FILE_OUTPUT, 0, 0, oSize, 4, 4, FILE_GPR, 0); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 6530078b938..dc1ab769b98 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -540,6 +540,12 @@ NVC0LegalizePostRA::visit(BasicBlock *bb) // It seems like barriers are never required for tessellation since // the warp size is 32, and there are always at most 32 tcs threads. bb->remove(i); + } else + if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) { + int offset = i->src(0).get()->reg.data.offset; + if (abs(offset) > 0x10000) + i->src(0).get()->reg.fileIndex += offset >> 16; + i->src(0).get()->reg.data.offset = (int)(short)offset; } else { // TODO: Move this to before register allocation for operations that // need the $c register ! 
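
(Aside, not part of the patch: the nv50_ir_from_tgsi.cpp hunks above pack each indirectly addressed TGSI temporary array into its own chunk of local memory, keyed by ArrayID, instead of demoting every temporary. The following is a minimal, self-contained C++ sketch of that offset bookkeeping; TempArrayLayout, layout(), adjust() and lmemSlots are illustrative names, not the Mesa classes.)

// Sketch: per-ArrayID base offsets for indirectly addressed temp arrays.
#include <cstdio>
#include <map>
#include <set>
#include <utility>

struct TempArrayLayout {
   // arrayId -> (first temp index, length), gathered from declarations
   std::map<int, std::pair<int, int>> info;
   // arrayIds seen with an indirect (relative) access
   std::set<int> indirect;
   // arrayId -> value added to a temp index to get its local-memory slot
   std::map<int, int> offsets;
   int lmemSlots = 0; // total 16-byte (vec4) slots needed in local memory

   void layout() {
      for (int id : indirect) {
         const std::pair<int, int> &fi = info[id];
         // this array's first temp lands at lmemSlots, so the adjustment
         // is (lmemSlots - first)
         offsets[id] = lmemSlots - fi.first;
         lmemSlots += fi.second;
      }
   }

   // Returns true (and rewrites idx) if this access lives in local memory;
   // directly addressed temps stay plain registers and are left alone.
   bool adjust(int arrayId, int &idx) const {
      std::map<int, int>::const_iterator it = offsets.find(arrayId);
      if (it == offsets.end())
         return false;
      idx += it->second;
      return true;
   }
};

int main() {
   TempArrayLayout l;
   l.info[1] = std::make_pair(0, 4);  // TEMP[0..3],  ArrayID 1
   l.info[2] = std::make_pair(4, 8);  // TEMP[4..11], ArrayID 2
   l.indirect = {2};                  // only ArrayID 2 is indexed indirectly
   l.layout();

   int idx = 6;                       // TEMP[6], part of ArrayID 2
   if (l.adjust(2, idx))
      printf("TEMP[6] -> local-memory slot %d (of %d)\n", idx, l.lmemSlots);
   return 0;
}

(The point of the design is that only arrays which are actually indexed indirectly pay the local-memory cost; everything else keeps being register-allocated as before.)
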
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index f5c590eef10..95e9fdfc57d 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -171,7 +171,10 @@ LoadPropagation::isImmdLoad(Instruction *ld) if (!ld || (ld->op != OP_MOV) || ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8))) return false; - return ld->src(0).getFile() == FILE_IMMEDIATE; + + // A 0 can be replaced with a register, so it doesn't count as an immediate. + ImmediateValue val; + return ld->src(0).getImmediate(val) && !val.isInteger(0); } bool @@ -187,7 +190,8 @@ LoadPropagation::isAttribOrSharedLoad(Instruction *ld) void LoadPropagation::checkSwapSrc01(Instruction *insn) { - if (!prog->getTarget()->getOpInfo(insn).commutative) + const Target *targ = prog->getTarget(); + if (!targ->getOpInfo(insn).commutative) if (insn->op != OP_SET && insn->op != OP_SLCT) return; if (insn->src(1).getFile() != FILE_GPR) @@ -196,14 +200,15 @@ LoadPropagation::checkSwapSrc01(Instruction *insn) Instruction *i0 = insn->getSrc(0)->getInsn(); Instruction *i1 = insn->getSrc(1)->getInsn(); - if (isCSpaceLoad(i0)) { - if (!isCSpaceLoad(i1)) - insn->swapSources(0, 1); - else - return; - } else - if (isImmdLoad(i0)) { - if (!isCSpaceLoad(i1) && !isImmdLoad(i1)) + // Swap sources to inline the less frequently used source. That way, + // optimistically, it will eventually be able to remove the instruction. + int i0refs = insn->getSrc(0)->refCount(); + int i1refs = insn->getSrc(1)->refCount(); + + if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) { + if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) || + !targ->insnCanLoad(insn, 1, i1) || + i0refs < i1refs) insn->swapSources(0, 1); else return; @@ -1224,6 +1229,8 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) adds = 1; else return; + if (si->src(!adds).mod != Modifier(0)) + return; // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z)) // This is more operations, but if one of x, y is an immediate, then diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h index 673f8811ff3..e6e1912adae 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h @@ -192,7 +192,7 @@ public: virtual bool insnCanLoad(const Instruction *insn, int s, const Instruction *ld) const = 0; virtual bool insnCanLoadOffset(const Instruction *insn, int s, - int offset) const { return true; } + int offset) const = 0; virtual bool isOpSupported(operation, DataType) const = 0; virtual bool isAccessSupported(DataFile, DataType) const = 0; virtual bool isModSupported(const Instruction *, diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp index 101082e7491..2c4d7f53d60 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp @@ -99,6 +99,7 @@ static const struct opProperties _initProps[] = { OP_SET, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 }, { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_EX2, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0, 0x0 }, { OP_LG2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, { OP_RCP, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, { OP_RSQ, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 
0x0, 0x0 }, diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp index 014c652eede..a03afa8dc8d 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp @@ -383,6 +383,16 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s, return true; } +bool +TargetNVC0::insnCanLoadOffset(const Instruction *insn, int s, int offset) const +{ + const ValueRef& ref = insn->src(s); + if (ref.getFile() == FILE_MEMORY_CONST && + (insn->op != OP_LOAD || insn->subOp != NV50_IR_SUBOP_LDC_IS)) + return offset >= -0x8000 && offset < 0x8000; + return true; +} + bool TargetNVC0::isAccessSupported(DataFile file, DataType ty) const { diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h index 3c5c7480405..7d11cd96315 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h @@ -48,6 +48,8 @@ public: virtual bool insnCanLoad(const Instruction *insn, int s, const Instruction *ld) const; + virtual bool insnCanLoadOffset(const Instruction *insn, int s, + int offset) const; virtual bool isOpSupported(operation, DataType) const; virtual bool isAccessSupported(DataFile, DataType) const; virtual bool isModSupported(const Instruction *, int s, Modifier) const; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index 933330f107a..61d91fd4cce 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -183,6 +183,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 712835c1ce1..32da60e0a23 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -226,6 +226,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index ccf96fb2815..84dbd69b8a5 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -215,6 +215,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_VENDOR_ID: @@ -295,9 +296,10 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, if (shader == PIPE_SHADER_COMPUTE && class_3d >= NVE4_3D_CLASS) return NVE4_MAX_PIPE_CONSTBUFS_COMPUTE; return NVC0_MAX_PIPE_CONSTBUFS; - case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: return shader != PIPE_SHADER_FRAGMENT; + case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: + return shader != PIPE_SHADER_FRAGMENT || class_3d < GM107_3D_CLASS; case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: case 
PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: return 1; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c index c53f946a762..af072a8acdc 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c @@ -64,7 +64,7 @@ nvc0_decoder_bsp_next(struct nouveau_vp3_decoder *dec, bsp_size += num_bytes[i]; bsp_size += 256; /* the 4 end markers */ - if (!bsp_bo || bsp_size > bsp_bo->size) { + if (bsp_size > bsp_bo->size) { union nouveau_bo_config cfg; struct nouveau_bo *tmp_bo = NULL; diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c index 8823b8d3197..90c4f71a945 100644 --- a/src/gallium/drivers/r300/r300_screen.c +++ b/src/gallium/drivers/r300/r300_screen.c @@ -209,6 +209,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; /* SWTCL-only features. */ diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 08fdd361049..9b0f31270df 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -68,6 +68,7 @@ static const struct debug_named_value r600_debug_options[] = { static void r600_destroy_context(struct pipe_context *context) { struct r600_context *rctx = (struct r600_context *)context; + unsigned sh; r600_isa_destroy(rctx->isa); @@ -76,6 +77,11 @@ static void r600_destroy_context(struct pipe_context *context) pipe_resource_reference((struct pipe_resource**)&rctx->dummy_cmask, NULL); pipe_resource_reference((struct pipe_resource**)&rctx->dummy_fmask, NULL); + for (sh = 0; sh < PIPE_SHADER_TYPES; sh++) { + rctx->b.b.set_constant_buffer(&rctx->b.b, sh, R600_BUFFER_INFO_CONST_BUFFER, NULL); + free(rctx->driver_consts[sh].constants); + } + if (rctx->fixed_func_tcs_shader) rctx->b.b.delete_tcs_state(&rctx->b.b, rctx->fixed_func_tcs_shader); @@ -357,6 +363,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c index 6592c5bdeca..c7984c47304 100644 --- a/src/gallium/drivers/radeon/r600_buffer_common.c +++ b/src/gallium/drivers/radeon/r600_buffer_common.c @@ -210,8 +210,8 @@ static void r600_buffer_destroy(struct pipe_screen *screen, } static bool -r600_do_invalidate_resource(struct r600_common_context *rctx, - struct r600_resource *rbuffer) +r600_invalidate_buffer(struct r600_common_context *rctx, + struct r600_resource *rbuffer) { /* In AMD_pinned_memory, the user pointer association only gets * broken when the buffer is explicitly re-allocated. 
@@ -236,7 +236,9 @@ void r600_invalidate_resource(struct pipe_context *ctx, struct r600_common_context *rctx = (struct r600_common_context*)ctx; struct r600_resource *rbuffer = r600_resource(resource); - (void)r600_do_invalidate_resource(rctx, rbuffer); + /* We currently only do anything here for buffers */ + if (resource->target == PIPE_BUFFER) + (void)r600_invalidate_buffer(rctx, rbuffer); } static void *r600_buffer_get_transfer(struct pipe_context *ctx, @@ -306,7 +308,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { assert(usage & PIPE_TRANSFER_WRITE); - if (r600_do_invalidate_resource(rctx, rbuffer)) { + if (r600_invalidate_buffer(rctx, rbuffer)) { /* At this point, the buffer is always idle. */ usage |= PIPE_TRANSFER_UNSYNCHRONIZED; } } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index f6ff4a81bd4..3e20c3b81fa 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -349,6 +349,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index cc9718e42d3..2de7def8dd2 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -3728,6 +3728,9 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary, case R_0286CC_SPI_PS_INPUT_ENA: conf->spi_ps_input_ena = value; break; + case R_0286D0_SPI_PS_INPUT_ADDR: + /* Not used yet, but will be in the future */ + break; case R_0286E8_SPI_TMPRING_SIZE: case R_00B860_COMPUTE_TMPRING_SIZE: /* WAVESIZE is in units of 256 dwords. 
*/ @@ -3735,8 +3738,15 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary, G_00B860_WAVESIZE(value) * 256 * 4 * 1; break; default: - fprintf(stderr, "Warning: Compiler emitted unknown " - "config register: 0x%x\n", reg); + { + static bool printed; + + if (!printed) { + fprintf(stderr, "Warning: LLVM emitted unknown " + "config register: 0x%x\n", reg); + printed = true; + } + } break; } } diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index 143702a5650..3bc580899d4 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -260,6 +260,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index b21634f3d73..8d04222a0cd 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -357,6 +357,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TGSI_PACK_HALF_FLOAT: case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: return 64; diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index fb41877017d..08c2dad8406 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -198,6 +198,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; /* Stream output. */ diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h index 4b551ed0b41..f69a75be50e 100644 --- a/src/gallium/include/pipe/p_context.h +++ b/src/gallium/include/pipe/p_context.h @@ -678,6 +678,13 @@ struct pipe_context { void (*dump_debug_state)(struct pipe_context *ctx, FILE *stream, unsigned flags); + /** + * Emit string marker in cmdstream + */ + void (*emit_string_marker)(struct pipe_context *ctx, + const char *string, + int len); + /** * Generate mipmap. 
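For illustration only (not part of the series): the hunks above add PIPE_CAP_STRING_MARKER and a pipe_context::emit_string_marker hook. A state tracker might use the pair roughly as sketched below; the helper name and structure are assumptions, while the cap and the hook come from the patches.

   #include <string.h>
   #include "pipe/p_context.h"
   #include "pipe/p_defines.h"
   #include "pipe/p_screen.h"

   /* Hypothetical helper: emit a debug marker into the command stream when
    * the driver advertises support for it. */
   static void
   example_emit_marker(struct pipe_screen *screen, struct pipe_context *pipe,
                       const char *msg)
   {
      /* Drivers that do not implement the hook report the cap as 0. */
      if (screen->get_param(screen, PIPE_CAP_STRING_MARKER) &&
          pipe->emit_string_marker)
         pipe->emit_string_marker(pipe, msg, strlen(msg));
   }

All of the get_param changes in this series simply return 0 for the new cap, so existing drivers keep the old behavior until they implement the hook.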
* \return TRUE if mipmap generation succeeds, FALSE otherwise diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index cb837cd2597..b46187bc8a1 100644 --- a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -644,6 +644,7 @@ enum pipe_cap PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT, PIPE_CAP_INVALIDATE_BUFFER, PIPE_CAP_GENERATE_MIPMAP, + PIPE_CAP_STRING_MARKER, }; #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0) diff --git a/src/gallium/state_trackers/dri/dri_drawable.c b/src/gallium/state_trackers/dri/dri_drawable.c index f0cc4a2a3ef..adc51284767 100644 --- a/src/gallium/state_trackers/dri/dri_drawable.c +++ b/src/gallium/state_trackers/dri/dri_drawable.c @@ -492,8 +492,10 @@ dri_flush(__DRIcontext *cPriv, if (pipe->invalidate_resource && (flags & __DRI2_FLUSH_INVALIDATE_ANCILLARY)) { - pipe->invalidate_resource(pipe, drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]); - pipe->invalidate_resource(pipe, drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL]); + if (drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]) + pipe->invalidate_resource(pipe, drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]); + if (drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL]) + pipe->invalidate_resource(pipe, drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL]); } } diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c index 37a011799e2..b25c381d968 100644 --- a/src/gallium/state_trackers/va/context.c +++ b/src/gallium/state_trackers/va/context.c @@ -31,6 +31,7 @@ #include "util/u_memory.h" #include "util/u_handle_table.h" #include "util/u_video.h" +#include "vl/vl_deint_filter.h" #include "vl/vl_winsys.h" #include "va_private.h" @@ -296,6 +297,10 @@ vlVaDestroyContext(VADriverContextP ctx, VAContextID context_id) } context->decoder->destroy(context->decoder); } + if (context->deint) { + vl_deint_filter_cleanup(context->deint); + FREE(context->deint); + } FREE(context); handle_table_remove(drv->htab, context_id); pipe_mutex_unlock(drv->mutex); diff --git a/src/gallium/state_trackers/va/postproc.c b/src/gallium/state_trackers/va/postproc.c index 0cec0c88124..d06f01617df 100644 --- a/src/gallium/state_trackers/va/postproc.c +++ b/src/gallium/state_trackers/va/postproc.c @@ -29,6 +29,7 @@ #include "vl/vl_defines.h" #include "vl/vl_video_buffer.h" +#include "vl/vl_deint_filter.h" #include "va_private.h" @@ -174,6 +175,51 @@ static VAStatus vlVaPostProcBlit(vlVaDriver *drv, vlVaContext *context, return VA_STATUS_SUCCESS; } +static struct pipe_video_buffer * +vlVaApplyDeint(vlVaDriver *drv, vlVaContext *context, + VAProcPipelineParameterBuffer *param, + struct pipe_video_buffer *current, + unsigned field) +{ + vlVaSurface *prevprev, *prev, *next; + + if (param->num_forward_references < 1 || + param->num_backward_references < 2) + return current; + + prevprev = handle_table_get(drv->htab, param->backward_references[1]); + prev = handle_table_get(drv->htab, param->backward_references[0]); + next = handle_table_get(drv->htab, param->forward_references[0]); + + if (!prevprev || !prev || !next) + return current; + + if (context->deint && (context->deint->video_width != current->width || + context->deint->video_height != current->height)) { + vl_deint_filter_cleanup(context->deint); + FREE(context->deint); + context->deint = NULL; + } + + if (!context->deint) { + context->deint = MALLOC(sizeof(struct vl_deint_filter)); + if (!vl_deint_filter_init(context->deint, drv->pipe, current->width, + current->height, 
false, false)) { + FREE(context->deint); + context->deint = NULL; + return current; + } + } + + if (!vl_deint_filter_check_buffers(context->deint, prevprev->buffer, + prev->buffer, current, next->buffer)) + return current; + + vl_deint_filter_render(context->deint, prevprev->buffer, prev->buffer, + current, next->buffer, field); + return context->deint->video_buffer; +} + VAStatus vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf) { @@ -181,6 +227,7 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex VARectangle def_src_region, def_dst_region; const VARectangle *src_region, *dst_region; VAProcPipelineParameterBuffer *param; + struct pipe_video_buffer *src; vlVaSurface *src_surface; unsigned i; @@ -199,6 +246,8 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex if (!src_surface || !src_surface->buffer) return VA_STATUS_ERROR_INVALID_SURFACE; + src = src_surface->buffer; + for (i = 0; i < param->num_filters; i++) { vlVaBuffer *buf = handle_table_get(drv->htab, param->filters[i]); VAProcFilterParameterBufferBase *filter; @@ -222,6 +271,11 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex deinterlace = VL_COMPOSITOR_WEAVE; break; + case VAProcDeinterlacingMotionAdaptive: + src = vlVaApplyDeint(drv, context, param, src, + !!(deint->flags & VA_DEINTERLACING_BOTTOM_FIELD)); + break; + default: return VA_STATUS_ERROR_UNIMPLEMENTED; } @@ -239,10 +293,8 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex if (context->target->buffer_format != PIPE_FORMAT_NV12) return vlVaPostProcCompositor(drv, context, src_region, dst_region, - src_surface->buffer, context->target, - deinterlace); + src, context->target, deinterlace); else return vlVaPostProcBlit(drv, context, src_region, dst_region, - src_surface->buffer, context->target, - deinterlace); + src, context->target, deinterlace); } diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c index f23a88901f5..84a94949c47 100644 --- a/src/gallium/state_trackers/va/surface.c +++ b/src/gallium/state_trackers/va/surface.c @@ -691,13 +691,14 @@ vlVaQueryVideoProcFilterCaps(VADriverContextP ctx, VAContextID context, case VAProcFilterDeinterlacing: { VAProcFilterCapDeinterlacing *deint = filter_caps; - if (*num_filter_caps < 2) { - *num_filter_caps = 2; + if (*num_filter_caps < 3) { + *num_filter_caps = 3; return VA_STATUS_ERROR_MAX_NUM_EXCEEDED; } deint[i++].type = VAProcDeinterlacingBob; deint[i++].type = VAProcDeinterlacingWeave; + deint[i++].type = VAProcDeinterlacingMotionAdaptive; break; } @@ -750,9 +751,24 @@ vlVaQueryVideoProcPipelineCaps(VADriverContextP ctx, VAContextID context, for (i = 0; i < num_filters; i++) { vlVaBuffer *buf = handle_table_get(VL_VA_DRIVER(ctx)->htab, filters[i]); + VAProcFilterParameterBufferBase *filter; - if (!buf || buf->type >= VABufferTypeMax) + if (!buf || buf->type != VAProcFilterParameterBufferType) return VA_STATUS_ERROR_INVALID_BUFFER; + + filter = buf->data; + switch (filter->type) { + case VAProcFilterDeinterlacing: { + VAProcFilterParameterBufferDeinterlacing *deint = buf->data; + if (deint->algorithm == VAProcDeinterlacingMotionAdaptive) { + pipeline_cap->num_forward_references = 1; + pipeline_cap->num_backward_references = 2; + } + break; + } + default: + return VA_STATUS_ERROR_UNIMPLEMENTED; + } } return VA_STATUS_SUCCESS; diff --git a/src/gallium/state_trackers/va/va_private.h 
b/src/gallium/state_trackers/va/va_private.h index 7afd81a196d..614fa98fef7 100644 --- a/src/gallium/state_trackers/va/va_private.h +++ b/src/gallium/state_trackers/va/va_private.h @@ -236,6 +236,8 @@ typedef struct { VAPictureParameterBufferMPEG4 pps; uint8_t start_code[32]; } mpeg4; + + struct vl_deint_filter *deint; } vlVaContext; typedef struct { diff --git a/src/glsl/ast.h b/src/glsl/ast.h index f8ab0b71b7b..03df6c08b2b 100644 --- a/src/glsl/ast.h +++ b/src/glsl/ast.h @@ -699,17 +699,18 @@ struct ast_type_qualifier { bool merge_qualifier(YYLTYPE *loc, _mesa_glsl_parse_state *state, - const ast_type_qualifier &q); + const ast_type_qualifier &q, + bool is_single_layout_merge); bool merge_out_qualifier(YYLTYPE *loc, _mesa_glsl_parse_state *state, const ast_type_qualifier &q, - ast_node* &node); + ast_node* &node, bool create_node); bool merge_in_qualifier(YYLTYPE *loc, _mesa_glsl_parse_state *state, const ast_type_qualifier &q, - ast_node* &node); + ast_node* &node, bool create_node); ast_subroutine_list *subroutine_list; }; diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index 02e6e2d3d45..8d66131b2ca 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -487,15 +487,17 @@ unary_arithmetic_result_type(const struct glsl_type *type, * If the given types to the bit-logic operator are invalid, return * glsl_type::error_type. * - * \param type_a Type of LHS of bit-logic op - * \param type_b Type of RHS of bit-logic op + * \param value_a LHS of bit-logic op + * \param value_b RHS of bit-logic op */ static const struct glsl_type * -bit_logic_result_type(const struct glsl_type *type_a, - const struct glsl_type *type_b, +bit_logic_result_type(ir_rvalue * &value_a, ir_rvalue * &value_b, ast_operators op, struct _mesa_glsl_parse_state *state, YYLTYPE *loc) { + const glsl_type *type_a = value_a->type; + const glsl_type *type_b = value_b->type; + if (!state->check_bitwise_operations_allowed(loc)) { return glsl_type::error_type; } @@ -517,6 +519,36 @@ bit_logic_result_type(const struct glsl_type *type_a, return glsl_type::error_type; } + /* Prior to GLSL 4.0 / GL_ARB_gpu_shader5, implicit conversions didn't + * make sense for bitwise operations, as they don't operate on floats. + * + * GLSL 4.0 added implicit int -> uint conversions, which are relevant + * here. It wasn't clear whether or not we should apply them to bitwise + * operations. However, Khronos has decided that they should in future + * language revisions. Applications also rely on this behavior. We opt + * to apply them in general, but issue a portability warning. 
+ * + * See https://www.khronos.org/bugzilla/show_bug.cgi?id=1405 + */ + if (type_a->base_type != type_b->base_type) { + if (!apply_implicit_conversion(type_a, value_b, state) + && !apply_implicit_conversion(type_b, value_a, state)) { + _mesa_glsl_error(loc, state, + "could not implicitly convert operands to " + "`%s` operator", + ast_expression::operator_string(op)); + return glsl_type::error_type; + } else { + _mesa_glsl_warning(loc, state, + "some implementations may not support implicit " + "int -> uint conversions for `%s' operators; " + "consider casting explicitly for portability", + ast_expression::operator_string(op)); + } + type_a = value_a->type; + type_b = value_b->type; + } + /* "The fundamental types of the operands (signed or unsigned) must * match," */ @@ -1435,8 +1467,7 @@ ast_expression::do_hir(exec_list *instructions, case ast_bit_or: op[0] = this->subexpressions[0]->hir(instructions, state); op[1] = this->subexpressions[1]->hir(instructions, state); - type = bit_logic_result_type(op[0]->type, op[1]->type, this->oper, - state, &loc); + type = bit_logic_result_type(op[0], op[1], this->oper, state, &loc); result = new(ctx) ir_expression(operations[this->oper], type, op[0], op[1]); error_emitted = op[0]->type->is_error() || op[1]->type->is_error(); @@ -1626,8 +1657,7 @@ ast_expression::do_hir(exec_list *instructions, case ast_or_assign: { op[0] = this->subexpressions[0]->hir(instructions, state); op[1] = this->subexpressions[1]->hir(instructions, state); - type = bit_logic_result_type(op[0]->type, op[1]->type, this->oper, - state, &loc); + type = bit_logic_result_type(op[0], op[1], this->oper, state, &loc); ir_rvalue *temp_rhs = new(ctx) ir_expression(operations[this->oper], type, op[0], op[1]); error_emitted = @@ -6329,7 +6359,7 @@ ast_process_struct_or_iface_block_members(exec_list *instructions, qual_stream != block_stream) { _mesa_glsl_error(&loc, state, "stream layout qualifier on " "interface block member does not match " - "the interface block (%u vs %u)", qual_stream, + block_stream); } } diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp index 8643b7bfb76..e0e331152dd 100644 --- a/src/glsl/ast_type.cpp +++ b/src/glsl/ast_type.cpp @@ -74,9 +74,11 @@ ast_type_qualifier::has_layout() const || this->flags.q.row_major || this->flags.q.packed || this->flags.q.explicit_location + || this->flags.q.explicit_image_format || this->flags.q.explicit_index || this->flags.q.explicit_binding - || this->flags.q.explicit_offset; + || this->flags.q.explicit_offset + || this->flags.q.explicit_stream; } bool @@ -113,10 +115,16 @@ ast_type_qualifier::interpolation_string() const return NULL; } +/** + * This function merges both duplicate identifiers within a single layout and + * multiple layout qualifiers on a single variable declaration. The + * is_single_layout_merge param is used to differentiate between the two. 
+ */ bool ast_type_qualifier::merge_qualifier(YYLTYPE *loc, _mesa_glsl_parse_state *state, - const ast_type_qualifier &q) + const ast_type_qualifier &q, + bool is_single_layout_merge) { ast_type_qualifier ubo_mat_mask; ubo_mat_mask.flags.i = 0; @@ -156,7 +164,8 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc, allowed_duplicates_mask.flags.i |= stream_layout_mask.flags.i; - if ((this->flags.i & q.flags.i & ~allowed_duplicates_mask.flags.i) != 0) { + if (is_single_layout_merge && !state->has_enhanced_layouts() && + (this->flags.i & q.flags.i & ~allowed_duplicates_mask.flags.i) != 0) { _mesa_glsl_error(loc, state, "duplicate layout qualifiers used"); return false; @@ -207,11 +216,6 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc, this->flags.q.stream = 1; this->stream = state->out_qualifier->stream; } - } else { - if (q.flags.q.explicit_stream) { - _mesa_glsl_error(loc, state, - "duplicate layout `stream' qualifier"); - } } } @@ -294,13 +298,35 @@ bool ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc, _mesa_glsl_parse_state *state, const ast_type_qualifier &q, - ast_node* &node) + ast_node* &node, bool create_node) { void *mem_ctx = state; - const bool r = this->merge_qualifier(loc, state, q); + const bool r = this->merge_qualifier(loc, state, q, false); - if (state->stage == MESA_SHADER_TESS_CTRL) { - node = new(mem_ctx) ast_tcs_output_layout(*loc); + if (state->stage == MESA_SHADER_GEOMETRY) { + if (q.flags.q.prim_type) { + /* Make sure this is a valid output primitive type. */ + switch (q.prim_type) { + case GL_POINTS: + case GL_LINE_STRIP: + case GL_TRIANGLE_STRIP: + break; + default: + _mesa_glsl_error(loc, state, "invalid geometry shader output " + "primitive type"); + break; + } + } + + /* Allow future assigments of global out's stream id value */ + this->flags.q.explicit_stream = 0; + } else if (state->stage == MESA_SHADER_TESS_CTRL) { + if (create_node) { + node = new(mem_ctx) ast_tcs_output_layout(*loc); + } + } else { + _mesa_glsl_error(loc, state, "out layout qualifiers only valid in " + "tessellation control or geometry shaders"); } return r; @@ -310,7 +336,7 @@ bool ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc, _mesa_glsl_parse_state *state, const ast_type_qualifier &q, - ast_node* &node) + ast_node* &node, bool create_node) { void *mem_ctx = state; bool create_gs_ast = false; @@ -450,10 +476,12 @@ ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc, this->point_mode = q.point_mode; } - if (create_gs_ast) { - node = new(mem_ctx) ast_gs_input_layout(*loc, q.prim_type); - } else if (create_cs_ast) { - node = new(mem_ctx) ast_cs_input_layout(*loc, q.local_size); + if (create_node) { + if (create_gs_ast) { + node = new(mem_ctx) ast_gs_input_layout(*loc, q.prim_type); + } else if (create_cs_ast) { + node = new(mem_ctx) ast_cs_input_layout(*loc, q.local_size); + } } return true; diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy index 51796a65df9..10198758944 100644 --- a/src/glsl/glsl_parser.yy +++ b/src/glsl/glsl_parser.yy @@ -299,6 +299,10 @@ static bool match_layout_qualifier(const char *s1, const char *s2, %type for_init_statement %type for_rest_statement %type layout_defaults +%type layout_uniform_defaults +%type layout_buffer_defaults +%type layout_in_defaults +%type layout_out_defaults %right THEN ELSE %% @@ -953,7 +957,7 @@ parameter_qualifier: "or precise"); $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } | precision_qualifier parameter_qualifier { @@ -970,7 +974,7 @@ parameter_qualifier: | 
memory_qualifier parameter_qualifier { $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } parameter_direction_qualifier: @@ -1149,7 +1153,7 @@ layout_qualifier_id_list: | layout_qualifier_id_list ',' layout_qualifier_id { $$ = $1; - if (!$$.merge_qualifier(& @3, state, $3)) { + if (!$$.merge_qualifier(& @3, state, $3, true)) { YYERROR; } } @@ -1758,7 +1762,7 @@ type_qualifier: } $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } | layout_qualifier type_qualifier { @@ -1775,12 +1779,12 @@ type_qualifier: _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers"); $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } | subroutine_qualifier type_qualifier { $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } | auxiliary_storage_qualifier type_qualifier { @@ -1796,7 +1800,7 @@ type_qualifier: "just before storage qualifiers"); } $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } | storage_qualifier type_qualifier { @@ -1816,7 +1820,7 @@ type_qualifier: } $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } | precision_qualifier type_qualifier { @@ -1833,7 +1837,7 @@ type_qualifier: | memory_qualifier type_qualifier { $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } ; @@ -2585,7 +2589,7 @@ interface_block: YYERROR; } - if (!block->layout.merge_qualifier(& @1, state, $1)) { + if (!block->layout.merge_qualifier(& @1, state, $1, false)) { YYERROR; } @@ -2602,7 +2606,7 @@ interface_block: "memory qualifiers can only be used in the " "declaration of shader storage blocks"); } - if (!block->layout.merge_qualifier(& @1, state, $1)) { + if (!block->layout.merge_qualifier(& @1, state, $1, false)) { YYERROR; } $$ = block; @@ -2737,18 +2741,48 @@ member_declaration: } ; -layout_defaults: - layout_qualifier UNIFORM ';' +layout_uniform_defaults: + layout_qualifier layout_uniform_defaults { - if (!state->default_uniform_qualifier->merge_qualifier(& @1, state, $1)) { + $$ = NULL; + if (!state->has_420pack_or_es31()) { + _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers"); + YYERROR; + } else { + if (!state->default_uniform_qualifier-> + merge_qualifier(& @1, state, $1, false)) { + YYERROR; + } + } + } + | layout_qualifier UNIFORM ';' + { + if (!state->default_uniform_qualifier-> + merge_qualifier(& @1, state, $1, false)) { YYERROR; } $$ = NULL; } + ; +layout_buffer_defaults: + layout_qualifier layout_buffer_defaults + { + $$ = NULL; + if (!state->has_420pack_or_es31()) { + _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers"); + YYERROR; + } else { + if (!state->default_shader_storage_qualifier-> + merge_qualifier(& @1, state, $1, false)) { + YYERROR; + } + } + } | layout_qualifier BUFFER ';' { - if (!state->default_shader_storage_qualifier->merge_qualifier(& @1, state, $1)) { + if (!state->default_shader_storage_qualifier-> + merge_qualifier(& @1, state, $1, false)) { YYERROR; } @@ -2764,43 +2798,58 @@ layout_defaults: $$ = NULL; } + ; +layout_in_defaults: + layout_qualifier layout_in_defaults + { + $$ = NULL; + if (!state->has_420pack_or_es31()) { + _mesa_glsl_error(&@1, state, "duplicate layout(...) 
qualifiers"); + YYERROR; + } else { + if (!state->in_qualifier-> + merge_in_qualifier(& @1, state, $1, $$, false)) { + YYERROR; + } + } + } | layout_qualifier IN_TOK ';' { $$ = NULL; - if (!state->in_qualifier->merge_in_qualifier(& @1, state, $1, $$)) { + if (!state->in_qualifier-> + merge_in_qualifier(& @1, state, $1, $$, true)) { YYERROR; } } + ; +layout_out_defaults: + layout_qualifier layout_out_defaults + { + $$ = NULL; + if (!state->has_420pack_or_es31()) { + _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers"); + YYERROR; + } else { + if (!state->out_qualifier-> + merge_out_qualifier(& @1, state, $1, $$, false)) { + YYERROR; + } + } + } | layout_qualifier OUT_TOK ';' { $$ = NULL; - if (state->stage == MESA_SHADER_GEOMETRY) { - if ($1.flags.q.prim_type) { - /* Make sure this is a valid output primitive type. */ - switch ($1.prim_type) { - case GL_POINTS: - case GL_LINE_STRIP: - case GL_TRIANGLE_STRIP: - break; - default: - _mesa_glsl_error(&@1, state, "invalid geometry shader output " - "primitive type"); - break; - } - } - if (!state->out_qualifier->merge_qualifier(& @1, state, $1)) - YYERROR; - - /* Allow future assigments of global out's stream id value */ - state->out_qualifier->flags.q.explicit_stream = 0; - } else if (state->stage == MESA_SHADER_TESS_CTRL) { - if (!state->out_qualifier->merge_out_qualifier(& @1, state, $1, $$)) - YYERROR; - } else { - _mesa_glsl_error(& @1, state, - "out layout qualifiers only valid in " - "tessellation control or geometry shaders"); - } + if (!state->out_qualifier-> + merge_out_qualifier(& @1, state, $1, $$, true)) + YYERROR; } + ; + +layout_defaults: + layout_uniform_defaults + | layout_buffer_defaults + | layout_in_defaults + | layout_out_defaults + ; diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp index b424edd8e96..db1947453ea 100644 --- a/src/glsl/ir.cpp +++ b/src/glsl/ir.cpp @@ -298,8 +298,6 @@ ir_expression::ir_expression(int op, ir_rvalue *op0) break; case ir_unop_noise: - case ir_unop_unpack_half_2x16_split_x: - case ir_unop_unpack_half_2x16_split_y: this->type = glsl_type::float_type; break; @@ -422,10 +420,6 @@ ir_expression::ir_expression(int op, ir_rvalue *op0, ir_rvalue *op1) this->type = op0->type->get_base_type(); break; - case ir_binop_pack_half_2x16_split: - this->type = glsl_type::uint_type; - break; - case ir_binop_imul_high: case ir_binop_carry: case ir_binop_borrow: @@ -555,8 +549,6 @@ static const char *const operator_strs[] = { "unpackUnorm2x16", "unpackUnorm4x8", "unpackHalf2x16", - "unpackHalf2x16_split_x", - "unpackHalf2x16_split_y", "bitfield_reverse", "bit_count", "find_msb", @@ -599,7 +591,6 @@ static const char *const operator_strs[] = { "min", "max", "pow", - "packHalf2x16_split", "ubo_load", "ldexp", "vector_extract", diff --git a/src/glsl/ir.h b/src/glsl/ir.h index 5b845c6e856..b453187c32a 100644 --- a/src/glsl/ir.h +++ b/src/glsl/ir.h @@ -1401,16 +1401,6 @@ enum ir_expression_operation { ir_unop_unpack_half_2x16, /*@}*/ - /** - * \name Lowered floating point unpacking operations. - * - * \see lower_packing_builtins_visitor::split_unpack_half_2x16 - */ - /*@{*/ - ir_unop_unpack_half_2x16_split_x, - ir_unop_unpack_half_2x16_split_y, - /*@}*/ - /** * \name Bit operations, part of ARB_gpu_shader5. */ @@ -1541,15 +1531,6 @@ enum ir_expression_operation { ir_binop_pow, - /** - * \name Lowered floating point packing operations. 
- * - * \see lower_packing_builtins_visitor::split_pack_half_2x16 - */ - /*@{*/ - ir_binop_pack_half_2x16_split, - /*@}*/ - /** * Load a value the size of a given GLSL type from a uniform block. * diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h index be86f547f77..b56413a1500 100644 --- a/src/glsl/ir_optimization.h +++ b/src/glsl/ir_optimization.h @@ -58,17 +58,14 @@ enum lower_packing_builtins_op { LOWER_PACK_HALF_2x16 = 0x0010, LOWER_UNPACK_HALF_2x16 = 0x0020, - LOWER_PACK_HALF_2x16_TO_SPLIT = 0x0040, - LOWER_UNPACK_HALF_2x16_TO_SPLIT = 0x0080, + LOWER_PACK_SNORM_4x8 = 0x0040, + LOWER_UNPACK_SNORM_4x8 = 0x0080, - LOWER_PACK_SNORM_4x8 = 0x0100, - LOWER_UNPACK_SNORM_4x8 = 0x0200, + LOWER_PACK_UNORM_4x8 = 0x0100, + LOWER_UNPACK_UNORM_4x8 = 0x0200, - LOWER_PACK_UNORM_4x8 = 0x0400, - LOWER_UNPACK_UNORM_4x8 = 0x0800, - - LOWER_PACK_USE_BFI = 0x1000, - LOWER_PACK_USE_BFE = 0x2000, + LOWER_PACK_USE_BFI = 0x0400, + LOWER_PACK_USE_BFE = 0x0800, }; bool do_common_optimization(exec_list *ir, bool linked, diff --git a/src/glsl/ir_validate.cpp b/src/glsl/ir_validate.cpp index 94814799b9b..12928836597 100644 --- a/src/glsl/ir_validate.cpp +++ b/src/glsl/ir_validate.cpp @@ -372,12 +372,6 @@ ir_validate::visit_leave(ir_expression *ir) assert(ir->operands[0]->type == glsl_type::uint_type); break; - case ir_unop_unpack_half_2x16_split_x: - case ir_unop_unpack_half_2x16_split_y: - assert(ir->type == glsl_type::float_type); - assert(ir->operands[0]->type == glsl_type::uint_type); - break; - case ir_unop_unpack_double_2x32: assert(ir->type == glsl_type::uvec2_type); assert(ir->operands[0]->type == glsl_type::double_type); @@ -567,12 +561,6 @@ ir_validate::visit_leave(ir_expression *ir) assert(ir->operands[0]->type == ir->operands[1]->type); break; - case ir_binop_pack_half_2x16_split: - assert(ir->type == glsl_type::uint_type); - assert(ir->operands[0]->type == glsl_type::float_type); - assert(ir->operands[1]->type == glsl_type::float_type); - break; - case ir_binop_ubo_load: assert(ir->operands[0]->type == glsl_type::uint_type); diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp index 7cc58800765..09f80d0f39d 100644 --- a/src/glsl/link_varyings.cpp +++ b/src/glsl/link_varyings.cpp @@ -968,10 +968,12 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var) } if ((consumer_var == NULL && producer_var->type->contains_integer()) || - consumer_stage != MESA_SHADER_FRAGMENT) { + (consumer_stage != -1 && consumer_stage != MESA_SHADER_FRAGMENT)) { /* Since this varying is not being consumed by the fragment shader, its - * interpolation type varying cannot possibly affect rendering. Also, - * this variable is non-flat and is (or contains) an integer. + * interpolation type varying cannot possibly affect rendering. + * Also, this variable is non-flat and is (or contains) an integer. + * If the consumer stage is unknown, don't modify the interpolation + * type as it could affect rendering later with separate shaders. * * lower_packed_varyings requires all integer varyings to flat, * regardless of where they appear. We can trivially satisfy that diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index 564c4712871..6657777d74c 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -992,7 +992,17 @@ cross_validate_globals(struct gl_shader_program *prog, existing->data.location = var->data.location; existing->data.explicit_location = true; - } + } else { + /* Check if uniform with implicit location was marked explicit + * by earlier shader stage. 
If so, mark it explicit in this stage + * too to make sure later processing does not treat it as + * implicit one. + */ + if (existing->data.explicit_location) { + var->data.location = existing->data.location; + var->data.explicit_location = true; + } + } /* From the GLSL 4.20 specification: * "A link error will result if two compilation units in a program @@ -3152,7 +3162,7 @@ check_explicit_uniform_locations(struct gl_context *ctx, if (var->data.explicit_location) { bool ret; - if (var->type->is_subroutine()) + if (var->type->without_array()->is_subroutine()) ret = reserve_subroutine_explicit_locations(prog, sh, var); else ret = reserve_explicit_locations(prog, uniform_map, var); diff --git a/src/glsl/lower_packing_builtins.cpp b/src/glsl/lower_packing_builtins.cpp index 7f18238bc6e..a41627bd561 100644 --- a/src/glsl/lower_packing_builtins.cpp +++ b/src/glsl/lower_packing_builtins.cpp @@ -43,13 +43,6 @@ public: : op_mask(op_mask), progress(false) { - /* Mutually exclusive options. */ - assert(!((op_mask & LOWER_PACK_HALF_2x16) && - (op_mask & LOWER_PACK_HALF_2x16_TO_SPLIT))); - - assert(!((op_mask & LOWER_UNPACK_HALF_2x16) && - (op_mask & LOWER_UNPACK_HALF_2x16_TO_SPLIT))); - factory.instructions = &factory_instructions; } @@ -96,9 +89,6 @@ public: case LOWER_PACK_HALF_2x16: *rvalue = lower_pack_half_2x16(op0); break; - case LOWER_PACK_HALF_2x16_TO_SPLIT: - *rvalue = split_pack_half_2x16(op0); - break; case LOWER_UNPACK_SNORM_2x16: *rvalue = lower_unpack_snorm_2x16(op0); break; @@ -114,9 +104,6 @@ public: case LOWER_UNPACK_HALF_2x16: *rvalue = lower_unpack_half_2x16(op0); break; - case LOWER_UNPACK_HALF_2x16_TO_SPLIT: - *rvalue = split_unpack_half_2x16(op0); - break; case LOWER_PACK_UNPACK_NONE: case LOWER_PACK_USE_BFI: case LOWER_PACK_USE_BFE: @@ -161,7 +148,7 @@ private: result = op_mask & LOWER_PACK_UNORM_4x8; break; case ir_unop_pack_half_2x16: - result = op_mask & (LOWER_PACK_HALF_2x16 | LOWER_PACK_HALF_2x16_TO_SPLIT); + result = op_mask & LOWER_PACK_HALF_2x16; break; case ir_unop_unpack_snorm_2x16: result = op_mask & LOWER_UNPACK_SNORM_2x16; @@ -176,7 +163,7 @@ private: result = op_mask & LOWER_UNPACK_UNORM_4x8; break; case ir_unop_unpack_half_2x16: - result = op_mask & (LOWER_UNPACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16_TO_SPLIT); + result = op_mask & LOWER_UNPACK_HALF_2x16; break; default: result = LOWER_PACK_UNPACK_NONE; @@ -1092,41 +1079,6 @@ private: return result; } - /** - * \brief Split packHalf2x16's vec2 operand into two floats. - * - * \param vec2_rval is packHalf2x16's input - * \return a uint rvalue - * - * Some code generators, such as the i965 fragment shader, require that all - * vector expressions be lowered to a sequence of scalar expressions. - * However, packHalf2x16 cannot be scalarized by the same mechanism as - * a true vector operation because its input and output have a differing - * number of vector components. - * - * This method scalarizes packHalf2x16 by transforming it from an unary - * operation having vector input to a binary operation having scalar input. 
- * That is, it transforms - * - * packHalf2x16(VEC2_RVAL); - * - * into - * - * vec2 v = VEC2_RVAL; - * return packHalf2x16_split(v.x, v.y); - */ - ir_rvalue* - split_pack_half_2x16(ir_rvalue *vec2_rval) - { - assert(vec2_rval->type == glsl_type::vec2_type); - - ir_variable *v = factory.make_temp(glsl_type::vec2_type, - "tmp_split_pack_half_2x16_v"); - factory.emit(assign(v, vec2_rval)); - - return expr(ir_binop_pack_half_2x16_split, swizzle_x(v), swizzle_y(v)); - } - /** * \brief Lower the component-wise calculation of unpackHalf2x16. * @@ -1341,59 +1293,6 @@ private: assert(result->type == glsl_type::vec2_type); return result; } - - /** - * \brief Split unpackHalf2x16 into two operations. - * - * \param uint_rval is unpackHalf2x16's input - * \return a vec2 rvalue - * - * Some code generators, such as the i965 fragment shader, require that all - * vector expressions be lowered to a sequence of scalar expressions. - * However, unpackHalf2x16 cannot be scalarized by the same method as - * a true vector operation because the number of components of its input - * and output differ. - * - * This method scalarizes unpackHalf2x16 by transforming it from a single - * operation having vec2 output to a pair of operations each having float - * output. That is, it transforms - * - * unpackHalf2x16(UINT_RVAL) - * - * into - * - * uint u = UINT_RVAL; - * vec2 v; - * - * v.x = unpackHalf2x16_split_x(u); - * v.y = unpackHalf2x16_split_y(u); - * - * return v; - */ - ir_rvalue* - split_unpack_half_2x16(ir_rvalue *uint_rval) - { - assert(uint_rval->type == glsl_type::uint_type); - - /* uint u = uint_rval; */ - ir_variable *u = factory.make_temp(glsl_type::uint_type, - "tmp_split_unpack_half_2x16_u"); - factory.emit(assign(u, uint_rval)); - - /* vec2 v; */ - ir_variable *v = factory.make_temp(glsl_type::vec2_type, - "tmp_split_unpack_half_2x16_v"); - - /* v.x = unpack_half_2x16_split_x(u); */ - factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_x, u), - WRITEMASK_X)); - - /* v.y = unpack_half_2x16_split_y(u); */ - factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_y, u), - WRITEMASK_Y)); - - return deref(v).val; - } }; } // namespace anonymous diff --git a/src/glsl/lower_subroutine.cpp b/src/glsl/lower_subroutine.cpp index a0df5e1df81..ac8ade13d99 100644 --- a/src/glsl/lower_subroutine.cpp +++ b/src/glsl/lower_subroutine.cpp @@ -44,6 +44,7 @@ public: } ir_visitor_status visit_leave(ir_call *); + ir_call *call_clone(ir_call *call, ir_function_signature *callee); bool progress; struct _mesa_glsl_parse_state *state; }; @@ -58,6 +59,23 @@ lower_subroutine(exec_list *instructions, struct _mesa_glsl_parse_state *state) return v.progress; } +ir_call * +lower_subroutine_visitor::call_clone(ir_call *call, ir_function_signature *callee) +{ + void *mem_ctx = ralloc_parent(call); + ir_dereference_variable *new_return_ref = NULL; + if (call->return_deref != NULL) + new_return_ref = call->return_deref->clone(mem_ctx, NULL); + + exec_list new_parameters; + + foreach_in_list(ir_instruction, ir, &call->actual_parameters) { + new_parameters.push_tail(ir->clone(mem_ctx, NULL)); + } + + return new(mem_ctx) ir_call(callee, new_return_ref, &new_parameters); +} + ir_visitor_status lower_subroutine_visitor::visit_leave(ir_call *ir) { @@ -66,7 +84,6 @@ lower_subroutine_visitor::visit_leave(ir_call *ir) void *mem_ctx = ralloc_parent(ir); ir_if *last_branch = NULL; - ir_dereference_variable *return_deref = ir->return_deref; for (int s = this->state->num_subroutines - 1; s >= 0; s--) { ir_rvalue *var; @@ -92,14 
+109,11 @@ lower_subroutine_visitor::visit_leave(ir_call *ir) fn->exact_matching_signature(this->state, &ir->actual_parameters); - ir_call *new_call = new(mem_ctx) ir_call(sub_sig, return_deref, &ir->actual_parameters); + ir_call *new_call = call_clone(ir, sub_sig); if (!last_branch) last_branch = if_tree(equal(subr_to_int(var), lc), new_call); else last_branch = if_tree(equal(subr_to_int(var), lc), new_call, last_branch); - - if (return_deref && s > 0) - return_deref = return_deref->clone(mem_ctx, NULL); } if (last_branch) ir->insert_before(last_branch); diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp index 5a1bbc43243..f29377cc260 100644 --- a/src/glsl/nir/glsl_to_nir.cpp +++ b/src/glsl/nir/glsl_to_nir.cpp @@ -1442,12 +1442,6 @@ nir_visitor::visit(ir_expression *ir) case ir_unop_unpack_half_2x16: result = nir_unpack_half_2x16(&b, srcs[0]); break; - case ir_unop_unpack_half_2x16_split_x: - result = nir_unpack_half_2x16_split_x(&b, srcs[0]); - break; - case ir_unop_unpack_half_2x16_split_y: - result = nir_unpack_half_2x16_split_y(&b, srcs[0]); - break; case ir_unop_bitfield_reverse: result = nir_bitfield_reverse(&b, srcs[0]); break; @@ -1731,9 +1725,6 @@ nir_visitor::visit(ir_expression *ir) } break; - case ir_binop_pack_half_2x16_split: - result = nir_pack_half_2x16_split(&b, srcs[0], srcs[1]); - break; case ir_binop_ldexp: result = nir_ldexp(&b, srcs[0], srcs[1]); break; case ir_triop_fma: result = nir_ffma(&b, srcs[0], srcs[1], srcs[2]); diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h index 4e3533189e4..ec6595b091d 100644 --- a/src/glsl/nir/nir.h +++ b/src/glsl/nir/nir.h @@ -140,7 +140,7 @@ typedef enum { * ir_variable - it should be easy to translate between the two. */ -typedef struct { +typedef struct nir_variable { struct exec_node node; /** @@ -383,7 +383,7 @@ nir_variable_get_io_mask(nir_variable *var, gl_shader_stage stage) return ((1ull << slots) - 1) << var->data.location; } -typedef struct { +typedef struct nir_register { struct exec_node node; unsigned num_components; /** < number of vector components */ @@ -477,7 +477,7 @@ nir_instr_is_last(nir_instr *instr) return exec_node_is_tail_sentinel(exec_node_get_next(&instr->node)); } -typedef struct { +typedef struct nir_ssa_def { /** for debugging only, can be NULL */ const char* name; @@ -1530,6 +1530,20 @@ typedef struct nir_shader_compiler_options { /** lowers ffract to fsub+ffloor: */ bool lower_ffract; + bool lower_pack_half_2x16; + bool lower_pack_unorm_2x16; + bool lower_pack_snorm_2x16; + bool lower_pack_unorm_4x8; + bool lower_pack_snorm_4x8; + bool lower_unpack_half_2x16; + bool lower_unpack_unorm_2x16; + bool lower_unpack_snorm_2x16; + bool lower_unpack_unorm_4x8; + bool lower_unpack_snorm_4x8; + + bool lower_extract_byte; + bool lower_extract_word; + /** * Does the driver support real 32-bit integers? (Otherwise, integers * are simulated by floats.) 
diff --git a/src/glsl/nir/nir_builder.h b/src/glsl/nir/nir_builder.h index e842b2252ff..1c7c78acae8 100644 --- a/src/glsl/nir/nir_builder.h +++ b/src/glsl/nir/nir_builder.h @@ -134,6 +134,20 @@ nir_imm_int(nir_builder *build, int x) return nir_build_imm(build, 1, v); } +static inline nir_ssa_def * +nir_imm_ivec4(nir_builder *build, int x, int y, int z, int w) +{ + nir_const_value v; + + memset(&v, 0, sizeof(v)); + v.i[0] = x; + v.i[1] = y; + v.i[2] = z; + v.i[3] = w; + + return nir_build_imm(build, 4, v); +} + static inline nir_ssa_def * nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0, nir_ssa_def *src1, nir_ssa_def *src2, nir_ssa_def *src3) diff --git a/src/glsl/nir/nir_lower_alu_to_scalar.c b/src/glsl/nir/nir_lower_alu_to_scalar.c index 0a27e66cf0f..37cb0221e0b 100644 --- a/src/glsl/nir/nir_lower_alu_to_scalar.c +++ b/src/glsl/nir/nir_lower_alu_to_scalar.c @@ -97,6 +97,20 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b) */ return; + case nir_op_pack_half_2x16: + if (!b->shader->options->lower_pack_half_2x16) + return; + + nir_ssa_def *val = + nir_pack_half_2x16_split(b, nir_channel(b, instr->src[0].src.ssa, + instr->src[0].swizzle[0]), + nir_channel(b, instr->src[0].src.ssa, + instr->src[0].swizzle[1])); + + nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(val)); + nir_instr_remove(&instr->instr); + return; + case nir_op_unpack_unorm_4x8: case nir_op_unpack_snorm_4x8: case nir_op_unpack_unorm_2x16: @@ -106,11 +120,51 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b) */ return; - case nir_op_unpack_half_2x16: - /* We could split this into unpack_half_2x16_split_[xy], but should - * we? - */ + case nir_op_unpack_half_2x16: { + if (!b->shader->options->lower_unpack_half_2x16) + return; + + nir_ssa_def *comps[2]; + comps[0] = nir_unpack_half_2x16_split_x(b, instr->src[0].src.ssa); + comps[1] = nir_unpack_half_2x16_split_y(b, instr->src[0].src.ssa); + nir_ssa_def *vec = nir_vec(b, comps, 2); + + nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(vec)); + nir_instr_remove(&instr->instr); return; + } + + case nir_op_pack_uvec2_to_uint: { + assert(b->shader->options->lower_pack_snorm_2x16 || + b->shader->options->lower_pack_unorm_2x16); + + nir_ssa_def *word = + nir_extract_uword(b, instr->src[0].src.ssa, nir_imm_int(b, 0)); + nir_ssa_def *val = + nir_ior(b, nir_ishl(b, nir_channel(b, word, 1), nir_imm_int(b, 16)), + nir_channel(b, word, 0)); + + nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(val)); + nir_instr_remove(&instr->instr); + break; + } + + case nir_op_pack_uvec4_to_uint: { + assert(b->shader->options->lower_pack_snorm_4x8 || + b->shader->options->lower_pack_unorm_4x8); + + nir_ssa_def *byte = + nir_extract_ubyte(b, instr->src[0].src.ssa, nir_imm_int(b, 0)); + nir_ssa_def *val = + nir_ior(b, nir_ior(b, nir_ishl(b, nir_channel(b, byte, 3), nir_imm_int(b, 24)), + nir_ishl(b, nir_channel(b, byte, 2), nir_imm_int(b, 16))), + nir_ior(b, nir_ishl(b, nir_channel(b, byte, 1), nir_imm_int(b, 8)), + nir_channel(b, byte, 0))); + + nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(val)); + nir_instr_remove(&instr->instr); + break; + } case nir_op_fdph: { nir_ssa_def *sum[4]; diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py index c5fb0420bb6..0eff89783dd 100644 --- a/src/glsl/nir/nir_opcodes.py +++ b/src/glsl/nir/nir_opcodes.py @@ -105,7 +105,7 @@ def opcode(name, output_size, output_type, input_sizes, input_types, opcodes[name] = Opcode(name, output_size, output_type, 
input_sizes, input_types, algebraic_properties, const_expr) -def unop_convert(name, in_type, out_type, const_expr): +def unop_convert(name, out_type, in_type, const_expr): opcode(name, 0, out_type, [0], [in_type], "", const_expr) def unop(name, ty, const_expr): @@ -155,17 +155,17 @@ unop("frsq", tfloat, "1.0f / sqrtf(src0)") unop("fsqrt", tfloat, "sqrtf(src0)") unop("fexp2", tfloat, "exp2f(src0)") unop("flog2", tfloat, "log2f(src0)") -unop_convert("f2i", tfloat, tint, "src0") # Float-to-integer conversion. -unop_convert("f2u", tfloat, tuint, "src0") # Float-to-unsigned conversion -unop_convert("i2f", tint, tfloat, "src0") # Integer-to-float conversion. +unop_convert("f2i", tint, tfloat, "src0") # Float-to-integer conversion. +unop_convert("f2u", tuint, tfloat, "src0") # Float-to-unsigned conversion +unop_convert("i2f", tfloat, tint, "src0") # Integer-to-float conversion. # Float-to-boolean conversion -unop_convert("f2b", tfloat, tbool, "src0 != 0.0f") +unop_convert("f2b", tbool, tfloat, "src0 != 0.0f") # Boolean-to-float conversion -unop_convert("b2f", tbool, tfloat, "src0 ? 1.0f : 0.0f") +unop_convert("b2f", tfloat, tbool, "src0 ? 1.0f : 0.0f") # Int-to-boolean conversion -unop_convert("i2b", tint, tbool, "src0 != 0") -unop_convert("b2i", tbool, tint, "src0 ? 1 : 0") # Boolean-to-int conversion -unop_convert("u2f", tuint, tfloat, "src0") # Unsigned-to-float conversion. +unop_convert("i2b", tbool, tint, "src0 != 0") +unop_convert("b2i", tint, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion +unop_convert("u2f", tfloat, tuint, "src0") # Unsigned-to-float conversion. # Unary floating-point rounding operations. @@ -238,6 +238,16 @@ unpack_2x16("unorm") unpack_4x8("unorm") unpack_2x16("half") +unop_horiz("pack_uvec2_to_uint", 0, tuint, 2, tuint, """ +dst = (src0.x & 0xffff) | (src0.y >> 16); +""") + +unop_horiz("pack_uvec4_to_uint", 0, tuint, 4, tuint, """ +dst = (src0.x << 0) | + (src0.y << 8) | + (src0.z << 16) | + (src0.w << 24); +""") # Lowered floating point unpacking operations. 
@@ -265,7 +275,7 @@ for (unsigned bit = 0; bit < 32; bit++) { } """) -unop_convert("ufind_msb", tuint, tint, """ +unop_convert("ufind_msb", tint, tuint, """ dst = -1; for (int bit = 31; bit > 0; bit--) { if ((src0 >> bit) & 1) { @@ -551,6 +561,15 @@ dst.x = src0.x; dst.y = src1.x; """) +# Byte extraction +binop("extract_ubyte", tuint, "", "(uint8_t)(src0 >> (src1 * 8))") +binop("extract_ibyte", tint, "", "(int8_t)(src0 >> (src1 * 8))") + +# Word extraction +binop("extract_uword", tuint, "", "(uint16_t)(src0 >> (src1 * 16))") +binop("extract_iword", tint, "", "(int16_t)(src0 >> (src1 * 16))") + + def triop(name, ty, const_expr): opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr) def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr): diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py index a46cbf711ac..190e4b7b43b 100644 --- a/src/glsl/nir/nir_opt_algebraic.py +++ b/src/glsl/nir/nir_opt_algebraic.py @@ -245,6 +245,70 @@ optimizations = [ ('bcsel', ('ult', 31, 'bits'), 'value', ('ubfe', 'value', 'offset', 'bits')), 'options->lower_bitfield_extract'), + + (('extract_ibyte', a, b), + ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 8), + 'options->lower_extract_byte'), + + (('extract_ubyte', a, b), + ('iand', ('ushr', a, ('imul', b, 8)), 0xff), + 'options->lower_extract_byte'), + + (('extract_iword', a, b), + ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16), + 'options->lower_extract_word'), + + (('extract_uword', a, b), + ('iand', ('ushr', a, ('imul', b, 16)), 0xffff), + 'options->lower_extract_word'), + + (('pack_unorm_2x16', 'v'), + ('pack_uvec2_to_uint', + ('f2u', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))), + 'options->lower_pack_unorm_2x16'), + + (('pack_unorm_4x8', 'v'), + ('pack_uvec4_to_uint', + ('f2u', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))), + 'options->lower_pack_unorm_4x8'), + + (('pack_snorm_2x16', 'v'), + ('pack_uvec2_to_uint', + ('f2i', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))), + 'options->lower_pack_snorm_2x16'), + + (('pack_snorm_4x8', 'v'), + ('pack_uvec4_to_uint', + ('f2i', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))), + 'options->lower_pack_snorm_4x8'), + + (('unpack_unorm_2x16', 'v'), + ('fdiv', ('u2f', ('vec4', ('extract_uword', 'v', 0), + ('extract_uword', 'v', 1), 0, 0)), + 65535.0), + 'options->lower_unpack_unorm_2x16'), + + (('unpack_unorm_4x8', 'v'), + ('fdiv', ('u2f', ('vec4', ('extract_ubyte', 'v', 0), + ('extract_ubyte', 'v', 1), + ('extract_ubyte', 'v', 2), + ('extract_ubyte', 'v', 3))), + 255.0), + 'options->lower_unpack_unorm_4x8'), + + (('unpack_snorm_2x16', 'v'), + ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_iword', 'v', 0), + ('extract_iword', 'v', 1), 0, 0)), + 32767.0))), + 'options->lower_unpack_snorm_2x16'), + + (('unpack_snorm_4x8', 'v'), + ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_ibyte', 'v', 0), + ('extract_ibyte', 'v', 1), + ('extract_ibyte', 'v', 2), + ('extract_ibyte', 'v', 3))), + 127.0))), + 'options->lower_unpack_snorm_4x8'), ] # Add optimizations to handle the case where the result of a ternary is diff --git a/src/glsl/nir/nir_print.c b/src/glsl/nir/nir_print.c index 850774b1099..a137706b15b 100644 --- a/src/glsl/nir/nir_print.c +++ b/src/glsl/nir/nir_print.c @@ -487,7 +487,7 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state) if (i != 0) fprintf(fp, ", "); - fprintf(fp, "%u", instr->const_index[i]); + fprintf(fp, "%d", 
instr->const_index[i]); } fprintf(fp, ")"); diff --git a/src/glsl/nir/shader_enums.c b/src/glsl/nir/shader_enums.c index 1410a504484..41da4a7b9ea 100644 --- a/src/glsl/nir/shader_enums.c +++ b/src/glsl/nir/shader_enums.c @@ -33,7 +33,8 @@ #define ENUM(x) [x] = #x #define NAME(val) ((((val) < ARRAY_SIZE(names)) && names[(val)]) ? names[(val)] : "UNKNOWN") -const char * gl_shader_stage_name(gl_shader_stage stage) +const char * +gl_shader_stage_name(gl_shader_stage stage) { static const char *names[] = { ENUM(MESA_SHADER_VERTEX), @@ -51,15 +52,16 @@ const char * gl_shader_stage_name(gl_shader_stage stage) * Translate a gl_shader_stage to a short shader stage name for debug * printouts and error messages. */ -const char * _mesa_shader_stage_to_string(unsigned stage) +const char * +_mesa_shader_stage_to_string(unsigned stage) { switch (stage) { case MESA_SHADER_VERTEX: return "vertex"; case MESA_SHADER_FRAGMENT: return "fragment"; case MESA_SHADER_GEOMETRY: return "geometry"; case MESA_SHADER_COMPUTE: return "compute"; - case MESA_SHADER_TESS_CTRL: return "tess ctrl"; - case MESA_SHADER_TESS_EVAL: return "tess eval"; + case MESA_SHADER_TESS_CTRL: return "tessellation control"; + case MESA_SHADER_TESS_EVAL: return "tessellation evaluation"; } unreachable("Unknown shader stage."); @@ -69,7 +71,8 @@ const char * _mesa_shader_stage_to_string(unsigned stage) * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS) * for debug printouts and error messages. */ -const char * _mesa_shader_stage_to_abbrev(unsigned stage) +const char * +_mesa_shader_stage_to_abbrev(unsigned stage) { switch (stage) { case MESA_SHADER_VERTEX: return "VS"; @@ -83,7 +86,8 @@ const char * _mesa_shader_stage_to_abbrev(unsigned stage) unreachable("Unknown shader stage."); } -const char * gl_vert_attrib_name(gl_vert_attrib attrib) +const char * +gl_vert_attrib_name(gl_vert_attrib attrib) { static const char *names[] = { ENUM(VERT_ATTRIB_POS), @@ -124,7 +128,8 @@ const char * gl_vert_attrib_name(gl_vert_attrib attrib) return NAME(attrib); } -const char * gl_varying_slot_name(gl_varying_slot slot) +const char * +gl_varying_slot_name(gl_varying_slot slot) { static const char *names[] = { ENUM(VARYING_SLOT_POS), @@ -190,7 +195,8 @@ const char * gl_varying_slot_name(gl_varying_slot slot) return NAME(slot); } -const char * gl_system_value_name(gl_system_value sysval) +const char * +gl_system_value_name(gl_system_value sysval) { static const char *names[] = { ENUM(SYSTEM_VALUE_VERTEX_ID), @@ -218,7 +224,8 @@ const char * gl_system_value_name(gl_system_value sysval) return NAME(sysval); } -const char * glsl_interp_qualifier_name(enum glsl_interp_qualifier qual) +const char * +glsl_interp_qualifier_name(enum glsl_interp_qualifier qual) { static const char *names[] = { ENUM(INTERP_QUALIFIER_NONE), @@ -230,7 +237,8 @@ const char * glsl_interp_qualifier_name(enum glsl_interp_qualifier qual) return NAME(qual); } -const char * gl_frag_result_name(gl_frag_result result) +const char * +gl_frag_result_name(gl_frag_result result) { static const char *names[] = { ENUM(FRAG_RESULT_DEPTH), diff --git a/src/glsl/nir/shader_enums.h b/src/glsl/nir/shader_enums.h index bc6ea3844b6..3a06b14a46b 100644 --- a/src/glsl/nir/shader_enums.h +++ b/src/glsl/nir/shader_enums.h @@ -47,19 +47,19 @@ typedef enum MESA_SHADER_COMPUTE = 5, } gl_shader_stage; -const char * gl_shader_stage_name(gl_shader_stage stage); +const char *gl_shader_stage_name(gl_shader_stage stage); /** * Translate a gl_shader_stage to a short shader stage name for debug * 
printouts and error messages. */ -const char * _mesa_shader_stage_to_string(unsigned stage); +const char *_mesa_shader_stage_to_string(unsigned stage); /** * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS) * for debug printouts and error messages. */ -const char * _mesa_shader_stage_to_abbrev(unsigned stage); +const char *_mesa_shader_stage_to_abbrev(unsigned stage); #define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1) @@ -109,7 +109,7 @@ typedef enum VERT_ATTRIB_MAX = 33 } gl_vert_attrib; -const char * gl_vert_attrib_name(gl_vert_attrib attrib); +const char *gl_vert_attrib_name(gl_vert_attrib attrib); /** * Symbolic constats to help iterating over @@ -254,7 +254,7 @@ typedef enum #define VARYING_SLOT_PATCH0 (VARYING_SLOT_MAX) #define VARYING_SLOT_TESS_MAX (VARYING_SLOT_PATCH0 + MAX_VARYING) -const char * gl_varying_slot_name(gl_varying_slot slot); +const char *gl_varying_slot_name(gl_varying_slot slot); /** * Bitflags for varying slots. @@ -467,7 +467,7 @@ typedef enum SYSTEM_VALUE_MAX /**< Number of values */ } gl_system_value; -const char * gl_system_value_name(gl_system_value sysval); +const char *gl_system_value_name(gl_system_value sysval); /** * The possible interpolation qualifiers that can be applied to a fragment @@ -485,7 +485,7 @@ enum glsl_interp_qualifier INTERP_QUALIFIER_COUNT /**< Number of interpolation qualifiers */ }; -const char * glsl_interp_qualifier_name(enum glsl_interp_qualifier qual); +const char *glsl_interp_qualifier_name(enum glsl_interp_qualifier qual); /** * Fragment program results @@ -516,7 +516,7 @@ typedef enum FRAG_RESULT_DATA7, } gl_frag_result; -const char * gl_frag_result_name(gl_frag_result result); +const char *gl_frag_result_name(gl_frag_result result); #define FRAG_RESULT_MAX (FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS) diff --git a/src/mapi/Makefile.am b/src/mapi/Makefile.am index 307e05d503f..68a28a2283c 100644 --- a/src/mapi/Makefile.am +++ b/src/mapi/Makefile.am @@ -35,6 +35,7 @@ EXTRA_DIST = \ es2api/ABI-check \ mapi_abi.py \ glapi/SConscript \ + glapi/registry/gl.xml \ shared-glapi/SConscript AM_CFLAGS = \ @@ -106,12 +107,16 @@ if HAVE_SPARC_ASM GLAPI_ASM_SOURCES = glapi/glapi_sparc.S endif -glapi_libglapi_la_SOURCES = glapi/glapi_gentable.c +glapi_libglapi_la_SOURCES = glapi_libglapi_la_CPPFLAGS = \ $(AM_CPPFLAGS) \ -I$(top_srcdir)/src/mapi/glapi \ -I$(top_srcdir)/src/mesa +if HAVE_APPLEDRI +glapi_libglapi_la_SOURCES += glapi/glapi_gentable.c +endif + if HAVE_SHARED_GLAPI glapi_libglapi_la_SOURCES += $(MAPI_BRIDGE_FILES) glapi/glapi_mapi_tmp.h glapi_libglapi_la_CPPFLAGS += \ diff --git a/src/mapi/glapi/gen/GREMEDY_string_marker.xml b/src/mapi/glapi/gen/GREMEDY_string_marker.xml new file mode 100644 index 00000000000..ffa3eac5898 --- /dev/null +++ b/src/mapi/glapi/gen/GREMEDY_string_marker.xml @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am index 900b61a5d45..cd7feabba24 100644 --- a/src/mapi/glapi/gen/Makefile.am +++ b/src/mapi/glapi/gen/Makefile.am @@ -27,8 +27,11 @@ MESA_GLAPI_OUTPUTS = \ $(MESA_GLAPI_DIR)/glapi_mapi_tmp.h \ $(MESA_GLAPI_DIR)/glprocs.h \ $(MESA_GLAPI_DIR)/glapitemp.h \ - $(MESA_GLAPI_DIR)/glapitable.h \ - $(MESA_GLAPI_DIR)/glapi_gentable.c + $(MESA_GLAPI_DIR)/glapitable.h + +if HAVE_APPLEDRI +MESA_GLAPI_OUTPUTS += $(MESA_GLAPI_DIR)/glapi_gentable.c +endif MESA_GLAPI_ASM_OUTPUTS = if HAVE_X86_ASM @@ -57,6 +60,7 @@ BUILT_SOURCES = \ $(MESA_GLX_DIR)/indirect_size.c EXTRA_DIST= \ $(BUILT_SOURCES) \ + 
$(MESA_GLAPI_DIR)/glapi_gentable.c \ $(MESA_GLAPI_DIR)/glapi_x86.S \ $(MESA_GLAPI_DIR)/glapi_x86-64.S \ $(MESA_GLAPI_DIR)/glapi_sparc.S \ @@ -88,8 +92,12 @@ XORG_GLAPI_DIR = $(XORG_BASE)/glx XORG_GLAPI_OUTPUTS = \ $(XORG_GLAPI_DIR)/glprocs.h \ $(XORG_GLAPI_DIR)/glapitable.h \ - $(XORG_GLAPI_DIR)/dispatch.h \ + $(XORG_GLAPI_DIR)/dispatch.h + +if HAVE_APPLEDRI +XORG_GLAPI_OUTPUTS += \ $(XORG_GLAPI_DIR)/glapi_gentable.c +endif XORG_OUTPUTS = \ $(XORG_GLAPI_OUTPUTS) \ @@ -188,6 +196,7 @@ API_XML = \ EXT_texture_array.xml \ EXT_texture_integer.xml \ EXT_transform_feedback.xml \ + GREMEDY_string_marker.xml \ INTEL_performance_query.xml \ KHR_debug.xml \ KHR_context_flush_control.xml \ diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml index 593ace49563..d7ab3bff4df 100644 --- a/src/mapi/glapi/gen/gl_API.xml +++ b/src/mapi/glapi/gen/gl_API.xml @@ -12620,6 +12620,8 @@ + + diff --git a/src/mapi/glapi/gen/gl_gentable.py b/src/mapi/glapi/gen/gl_gentable.py index 1b3eb72470d..7cd475aa2b8 100644 --- a/src/mapi/glapi/gen/gl_gentable.py +++ b/src/mapi/glapi/gen/gl_gentable.py @@ -113,6 +113,9 @@ __glapi_gentable_set_remaining_noop(struct _glapi_table *disp) { dispatch[i] = p.v; } +""" + +footer = """ struct _glapi_table * _glapi_create_table_from_handle(void *handle, const char *symbol_prefix) { struct _glapi_table *disp = calloc(_glapi_get_dispatch_table_size(), sizeof(_glapi_proc)); @@ -123,27 +126,28 @@ _glapi_create_table_from_handle(void *handle, const char *symbol_prefix) { if(symbol_prefix == NULL) symbol_prefix = ""; -""" -footer = """ - __glapi_gentable_set_remaining_noop(disp); + /* Note: This code relies on _glapi_table_func_names being sorted by the + * entry point index of each function. + */ + for (int func_index = 0; func_index < GLAPI_TABLE_COUNT; ++func_index) { + const char *name = _glapi_table_func_names[func_index]; + void ** procp = &((void **)disp)[func_index]; - return disp; -} -""" - -body_template = """ - if(!disp->%(name)s) { - void ** procp = (void **) &disp->%(name)s; - snprintf(symboln, sizeof(symboln), "%%s%(entry_point)s", symbol_prefix); + snprintf(symboln, sizeof(symboln), \"%s%s\", symbol_prefix, name); #ifdef _WIN32 *procp = GetProcAddress(handle, symboln); #else *procp = dlsym(handle, symboln); #endif } + __glapi_gentable_set_remaining_noop(disp); + + return disp; +} """ + class PrintCode(gl_XML.gl_print_base): def __init__(self): @@ -180,12 +184,33 @@ class PrintCode(gl_XML.gl_print_base): def printBody(self, api): - for f in api.functionIterateByOffset(): - for entry_point in f.entry_points: - vars = { 'entry_point' : entry_point, - 'name' : f.name } - print body_template % vars + # Determine how many functions have a defined offset. + func_count = 0 + for f in api.functions_by_name.itervalues(): + if f.offset != -1: + func_count += 1 + + # Build the mapping from offset to function name. + funcnames = [None] * func_count + for f in api.functions_by_name.itervalues(): + if f.offset != -1: + if not (funcnames[f.offset] is None): + raise Exception("Function table has more than one function with same offset (offset %d, func %s)" % (f.offset, f.name)) + funcnames[f.offset] = f.name + + # Check that the table has no gaps. We expect a function at every offset, + # and the code which generates the table relies on this. 
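
As a rough illustration of the pattern the generated code now follows (a dense, offset-indexed name table resolved in one loop, rather than one if/dlsym block per entry point), here is a minimal, self-contained C sketch. The names, table size and helper are invented for this example and are not the generated Mesa symbols.

#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for the generated _glapi_table_func_names[]:
 * index == dispatch offset, no gaps, no duplicate offsets. */
#define DEMO_TABLE_COUNT 3
static const char *const demo_func_names[DEMO_TABLE_COUNT] = {
   /* 0 */ "Foo",
   /* 1 */ "Bar",
   /* 2 */ "Baz",
};

/* Resolve every entry against a shared object in one loop, the same shape as
 * the new footer template; unresolved entries would then be filled with no-op
 * stubs, as __glapi_gentable_set_remaining_noop() does in the generated code. */
static void **
demo_create_table(void *handle, const char *symbol_prefix)
{
   void **disp = calloc(DEMO_TABLE_COUNT, sizeof(void *));
   char symboln[512];

   if (!disp)
      return NULL;

   for (int i = 0; i < DEMO_TABLE_COUNT; i++) {
      snprintf(symboln, sizeof(symboln), "%s%s", symbol_prefix,
               demo_func_names[i]);
      disp[i] = dlsym(handle, symboln);
   }
   return disp;
}

The duplicate-offset check above and the gap check in the loop that follows are what make indexing the table directly by dispatch offset safe.
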
+ for i in xrange(0, func_count): + if funcnames[i] is None: + raise Exception("Function table has no function at offset %d" % (i)) + + print "#define GLAPI_TABLE_COUNT %d" % func_count + print "static const char * const _glapi_table_func_names[GLAPI_TABLE_COUNT] = {" + for i in xrange(0, func_count): + print " /* %5d */ \"%s\"," % (i, funcnames[i]) + print "};" + return diff --git a/src/mapi/glapi/glapi.h b/src/mapi/glapi/glapi.h index f269b1701bc..3593c88bbc1 100644 --- a/src/mapi/glapi/glapi.h +++ b/src/mapi/glapi/glapi.h @@ -158,8 +158,10 @@ _GLAPI_EXPORT const char * _glapi_get_proc_name(unsigned int offset); +#ifdef GLX_USE_APPLEGL _GLAPI_EXPORT struct _glapi_table * _glapi_create_table_from_handle(void *handle, const char *symbol_prefix); +#endif _GLAPI_EXPORT void diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index 86777430a2e..5d69039d1af 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -1,6 +1,7 @@ i965_compiler_FILES = \ brw_cfg.cpp \ brw_cfg.h \ + brw_compiler.c \ brw_compiler.h \ brw_dead_control_flow.cpp \ brw_dead_control_flow.h \ @@ -72,7 +73,9 @@ i965_compiler_FILES = \ brw_vec4_surface_builder.cpp \ brw_vec4_surface_builder.h \ brw_vec4_tcs.cpp \ + brw_vec4_tcs.h \ brw_vec4_tes.cpp \ + brw_vec4_tes.h \ brw_vec4_visitor.cpp \ brw_vec4_vs_visitor.cpp \ brw_vue_map.c \ diff --git a/src/mesa/drivers/dri/i965/brw_binding_tables.c b/src/mesa/drivers/dri/i965/brw_binding_tables.c index 7fa5d602b96..f3a0310861c 100644 --- a/src/mesa/drivers/dri/i965/brw_binding_tables.c +++ b/src/mesa/drivers/dri/i965/brw_binding_tables.c @@ -365,7 +365,7 @@ gen7_disable_hw_binding_tables(struct brw_context *brw) /** * Enable hardware binding tables and set up the binding table pool. */ -static void +void gen7_enable_hw_binding_tables(struct brw_context *brw) { if (!brw->use_resource_streamer) diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c new file mode 100644 index 00000000000..f9e22d1d6b5 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_compiler.c @@ -0,0 +1,179 @@ +/* + * Copyright © 2015-2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_compiler.h" +#include "brw_context.h" +#include "glsl/nir/nir.h" +#include "main/errors.h" +#include "util/debug.h" + +static void +shader_debug_log_mesa(void *data, const char *fmt, ...) 
+{ + struct brw_context *brw = (struct brw_context *)data; + va_list args; + + va_start(args, fmt); + GLuint msg_id = 0; + _mesa_gl_vdebug(&brw->ctx, &msg_id, + MESA_DEBUG_SOURCE_SHADER_COMPILER, + MESA_DEBUG_TYPE_OTHER, + MESA_DEBUG_SEVERITY_NOTIFICATION, fmt, args); + va_end(args); +} + +static void +shader_perf_log_mesa(void *data, const char *fmt, ...) +{ + struct brw_context *brw = (struct brw_context *)data; + + va_list args; + va_start(args, fmt); + + if (unlikely(INTEL_DEBUG & DEBUG_PERF)) { + va_list args_copy; + va_copy(args_copy, args); + vfprintf(stderr, fmt, args_copy); + va_end(args_copy); + } + + if (brw->perf_debug) { + GLuint msg_id = 0; + _mesa_gl_vdebug(&brw->ctx, &msg_id, + MESA_DEBUG_SOURCE_SHADER_COMPILER, + MESA_DEBUG_TYPE_PERFORMANCE, + MESA_DEBUG_SEVERITY_MEDIUM, fmt, args); + } + va_end(args); +} + +#define COMMON_OPTIONS \ + /* In order to help allow for better CSE at the NIR level we tell NIR to \ + * split all ffma instructions during opt_algebraic and we then re-combine \ + * them as a later step. \ + */ \ + .lower_ffma = true, \ + .lower_sub = true, \ + .lower_fdiv = true, \ + .lower_scmp = true, \ + .lower_fmod = true, \ + .lower_bitfield_extract = true, \ + .lower_bitfield_insert = true, \ + .lower_uadd_carry = true, \ + .lower_usub_borrow = true, \ + .lower_fdiv = true, \ + .native_integers = true + +static const struct nir_shader_compiler_options scalar_nir_options = { + COMMON_OPTIONS, + .lower_pack_half_2x16 = true, + .lower_pack_snorm_2x16 = true, + .lower_pack_snorm_4x8 = true, + .lower_pack_unorm_2x16 = true, + .lower_pack_unorm_4x8 = true, + .lower_unpack_half_2x16 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_snorm_4x8 = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, +}; + +static const struct nir_shader_compiler_options vector_nir_options = { + COMMON_OPTIONS, + + /* In the vec4 backend, our dpN instruction replicates its result to all the + * components of a vec4. We would like NIR to give us replicated fdot + * instructions because it can optimize better for us. + */ + .fdot_replicates = true, + + .lower_pack_snorm_2x16 = true, + .lower_pack_unorm_2x16 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_unorm_2x16 = true, + .lower_extract_byte = true, + .lower_extract_word = true, +}; + +struct brw_compiler * +brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo) +{ + struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler); + + compiler->devinfo = devinfo; + compiler->shader_debug_log = shader_debug_log_mesa; + compiler->shader_perf_log = shader_perf_log_mesa; + + brw_fs_alloc_reg_sets(compiler); + brw_vec4_alloc_reg_set(compiler); + + compiler->scalar_stage[MESA_SHADER_VERTEX] = + devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS); + compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false; + compiler->scalar_stage[MESA_SHADER_TESS_EVAL] = + devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true); + compiler->scalar_stage[MESA_SHADER_GEOMETRY] = + devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false); + compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true; + compiler->scalar_stage[MESA_SHADER_COMPUTE] = true; + + /* We want the GLSL compiler to emit code that uses condition codes */ + for (int i = 0; i < MESA_SHADER_STAGES; i++) { + compiler->glsl_compiler_options[i].MaxUnrollIterations = 32; + compiler->glsl_compiler_options[i].MaxIfDepth = + devinfo->gen < 6 ? 
16 : UINT_MAX; + + compiler->glsl_compiler_options[i].EmitCondCodes = true; + compiler->glsl_compiler_options[i].EmitNoNoise = true; + compiler->glsl_compiler_options[i].EmitNoMainReturn = true; + compiler->glsl_compiler_options[i].EmitNoIndirectInput = true; + compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false; + compiler->glsl_compiler_options[i].LowerClipDistance = true; + + bool is_scalar = compiler->scalar_stage[i]; + + compiler->glsl_compiler_options[i].EmitNoIndirectOutput = is_scalar; + compiler->glsl_compiler_options[i].EmitNoIndirectTemp = is_scalar; + compiler->glsl_compiler_options[i].OptimizeForAOS = !is_scalar; + + /* !ARB_gpu_shader5 */ + if (devinfo->gen < 7) + compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true; + + compiler->glsl_compiler_options[i].NirOptions = + is_scalar ? &scalar_nir_options : &vector_nir_options; + + compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true; + } + + compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false; + compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false; + + if (compiler->scalar_stage[MESA_SHADER_GEOMETRY]) + compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false; + + compiler->glsl_compiler_options[MESA_SHADER_COMPUTE] + .LowerShaderSharedVariables = true; + + return compiler; +} diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index b66869b8a78..62dcb4dad84 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -689,6 +689,9 @@ struct brw_gs_prog_data /** @} */ +struct brw_compiler * +brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo); + /** * Compile a vertex shader. * diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 7b0340fc2ab..2a29dfe5eec 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -221,6 +221,7 @@ enum brw_state_id { BRW_STATE_COMPUTE_PROGRAM, BRW_STATE_CS_WORK_GROUPS, BRW_STATE_URB_SIZE, + BRW_STATE_CC_STATE, BRW_NUM_STATE_BITS }; @@ -309,6 +310,7 @@ enum brw_state_id { #define BRW_NEW_COMPUTE_PROGRAM (1ull << BRW_STATE_COMPUTE_PROGRAM) #define BRW_NEW_CS_WORK_GROUPS (1ull << BRW_STATE_CS_WORK_GROUPS) #define BRW_NEW_URB_SIZE (1ull << BRW_STATE_URB_SIZE) +#define BRW_NEW_CC_STATE (1ull << BRW_STATE_CC_STATE) struct brw_state_flags { /** State update flags signalled by mesa internals */ @@ -1262,7 +1264,7 @@ struct brw_context int num_atoms[BRW_NUM_PIPELINES]; const struct brw_tracked_state render_atoms[76]; - const struct brw_tracked_state compute_atoms[10]; + const struct brw_tracked_state compute_atoms[11]; /* If (INTEL_DEBUG & DEBUG_BATCH) */ struct { diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index df7a79fcd89..9edb6f54204 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -1087,6 +1087,18 @@ enum opcode { */ SHADER_OPCODE_BROADCAST, + /** + * Pick the byte from its first source register given by the index + * specified as second source. + */ + SHADER_OPCODE_EXTRACT_BYTE, + + /** + * Pick the word from its first source register given by the index + * specified as second source. 
+ */ + SHADER_OPCODE_EXTRACT_WORD, + VEC4_OPCODE_MOV_BYTES, VEC4_OPCODE_PACK_BYTES, VEC4_OPCODE_UNPACK_UNIFORM, diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp index 21f0b703d00..cbad47ee40a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp @@ -72,6 +72,13 @@ channel_expressions_predicate(ir_instruction *ir) return false; switch (expr->operation) { + case ir_unop_pack_half_2x16: + case ir_unop_pack_snorm_2x16: + case ir_unop_pack_snorm_4x8: + case ir_unop_pack_unorm_2x16: + case ir_unop_pack_unorm_4x8: + return false; + /* these opcodes need to act on the whole vector, * just like texturing. */ @@ -162,6 +169,11 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir) return visit_continue; switch (expr->operation) { + case ir_unop_pack_half_2x16: + case ir_unop_pack_snorm_2x16: + case ir_unop_pack_snorm_4x8: + case ir_unop_pack_unorm_2x16: + case ir_unop_pack_unorm_4x8: case ir_unop_interpolate_at_centroid: case ir_binop_interpolate_at_offset: case ir_binop_interpolate_at_sample: @@ -399,9 +411,6 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir) case ir_unop_ssbo_unsized_array_length: unreachable("should have been lowered"); - case ir_unop_unpack_half_2x16_split_x: - case ir_unop_unpack_half_2x16_split_y: - case ir_binop_pack_half_2x16_split: case ir_unop_interpolate_at_centroid: case ir_binop_interpolate_at_offset: case ir_binop_interpolate_at_sample: diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp index 3b65a382dc8..cde6566c05c 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp @@ -78,6 +78,8 @@ is_expression(const fs_visitor *v, const fs_inst *const inst) case FS_OPCODE_LINTERP: case SHADER_OPCODE_FIND_LIVE_CHANNEL: case SHADER_OPCODE_BROADCAST: + case SHADER_OPCODE_EXTRACT_BYTE: + case SHADER_OPCODE_EXTRACT_WORD: case SHADER_OPCODE_MOV_INDIRECT: return true; case SHADER_OPCODE_RCP: diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index b1134cff3c8..cac92b37bd5 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -2233,6 +2233,28 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) brw_broadcast(p, dst, src[0], src[1]); break; + case SHADER_OPCODE_EXTRACT_BYTE: { + assert(src[0].type == BRW_REGISTER_TYPE_D || + src[0].type == BRW_REGISTER_TYPE_UD); + + enum brw_reg_type type = + src[0].type == BRW_REGISTER_TYPE_D ? BRW_REGISTER_TYPE_B + : BRW_REGISTER_TYPE_UB; + brw_MOV(p, dst, spread(suboffset(retype(src[0], type), src[1].ud), 4)); + break; + } + + case SHADER_OPCODE_EXTRACT_WORD: { + assert(src[0].type == BRW_REGISTER_TYPE_D || + src[0].type == BRW_REGISTER_TYPE_UD); + + enum brw_reg_type type = + src[0].type == BRW_REGISTER_TYPE_D ? 
BRW_REGISTER_TYPE_W + : BRW_REGISTER_TYPE_UW; + brw_MOV(p, dst, spread(suboffset(retype(src[0], type), src[1].ud), 2)); + break; + } + case FS_OPCODE_SET_SAMPLE_ID: generate_set_sample_id(inst, dst, src[0], src[1]); break; diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 65a0ffc4d8d..f41854c2c09 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -1126,6 +1126,22 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) inst->predicate = BRW_PREDICATE_NORMAL; break; + case nir_op_extract_ubyte: + case nir_op_extract_ibyte: { + nir_const_value *byte = nir_src_as_const_value(instr->src[1].src); + bld.emit(SHADER_OPCODE_EXTRACT_BYTE, + result, op[0], brw_imm_ud(byte->u[0])); + break; + } + + case nir_op_extract_uword: + case nir_op_extract_iword: { + nir_const_value *word = nir_src_as_const_value(instr->src[1].src); + bld.emit(SHADER_OPCODE_EXTRACT_WORD, + result, op[0], brw_imm_ud(word->u[0])); + break; + } + default: unreachable("unhandled instruction"); } diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp index 234afd554df..ab9d7929c05 100644 --- a/src/mesa/drivers/dri/i965/brw_link.cpp +++ b/src/mesa/drivers/dri/i965/brw_link.cpp @@ -73,36 +73,13 @@ brw_lower_packing_builtins(struct brw_context *brw, gl_shader_stage shader_type, exec_list *ir) { - const struct brw_compiler *compiler = brw->intelScreen->compiler; + /* Gens < 7 don't have instructions to convert to or from half-precision, + * and Gens < 6 don't expose that functionality. + */ + if (brw->gen != 6) + return; - int ops = LOWER_PACK_SNORM_2x16 - | LOWER_UNPACK_SNORM_2x16 - | LOWER_PACK_UNORM_2x16 - | LOWER_UNPACK_UNORM_2x16; - - if (compiler->scalar_stage[shader_type]) { - ops |= LOWER_UNPACK_UNORM_4x8 - | LOWER_UNPACK_SNORM_4x8 - | LOWER_PACK_UNORM_4x8 - | LOWER_PACK_SNORM_4x8; - } - - if (brw->gen >= 7) { - /* Gen7 introduced the f32to16 and f16to32 instructions, which can be - * used to execute packHalf2x16 and unpackHalf2x16. For AOS code, no - * lowering is needed. For SOA code, the Half2x16 ops must be - * scalarized. - */ - if (compiler->scalar_stage[shader_type]) { - ops |= LOWER_PACK_HALF_2x16_TO_SPLIT - | LOWER_UNPACK_HALF_2x16_TO_SPLIT; - } - } else { - ops |= LOWER_PACK_HALF_2x16 - | LOWER_UNPACK_HALF_2x16; - } - - lower_packing_builtins(ir, ops); + lower_packing_builtins(ir, LOWER_PACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16); } static void diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c index cf6ba5b4aeb..319c2a5669f 100644 --- a/src/mesa/drivers/dri/i965/brw_misc_state.c +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c @@ -868,12 +868,146 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline) const uint32_t _3DSTATE_PIPELINE_SELECT = is_965 ? CMD_PIPELINE_SELECT_965 : CMD_PIPELINE_SELECT_GM45; + if (brw->use_resource_streamer && pipeline != BRW_RENDER_PIPELINE) { + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] + * PIPELINE_SELECT [DevBWR+]": + * + * Project: HSW, BDW, CHV, SKL, BXT + * + * Hardware Binding Tables are only supported for 3D + * workloads. Resource streamer must be enabled only for 3D + * workloads. Resource streamer must be disabled for Media and GPGPU + * workloads. + */ + BEGIN_BATCH(1); + OUT_BATCH(MI_RS_CONTROL | 0); + ADVANCE_BATCH(); + + gen7_disable_hw_binding_tables(brw); + + /* XXX - Disable gather constant pool too when we start using it. 
*/ + } + + if (brw->gen >= 8 && brw->gen < 10) { + /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT: + * + * Software must clear the COLOR_CALC_STATE Valid field in + * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT + * with Pipeline Select set to GPGPU. + * + * The internal hardware docs recommend the same workaround for Gen9 + * hardware too. + */ + if (pipeline == BRW_COMPUTE_PIPELINE) { + BEGIN_BATCH(2); + OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2)); + OUT_BATCH(0); + ADVANCE_BATCH(); + + brw->ctx.NewDriverState |= BRW_NEW_CC_STATE; + } + + } else if (brw->gen >= 6) { + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] + * PIPELINE_SELECT [DevBWR+]": + * + * Project: DEVSNB+ + * + * Software must ensure all the write caches are flushed through a + * stalling PIPE_CONTROL command followed by another PIPE_CONTROL + * command to invalidate read only caches prior to programming + * MI_PIPELINE_SELECT command to change the Pipeline Select Mode. + */ + const unsigned dc_flush = + brw->gen >= 7 ? PIPE_CONTROL_DATA_CACHE_INVALIDATE : 0; + + if (brw->gen == 6) { + /* Hardware workaround: SNB B-Spec says: + * + * Before a PIPE_CONTROL with Write Cache Flush Enable = 1, a + * PIPE_CONTROL with any non-zero post-sync-op is required. + */ + brw_emit_post_sync_nonzero_flush(brw); + } + + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + dc_flush | + PIPE_CONTROL_NO_WRITE | + PIPE_CONTROL_CS_STALL); + + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE | + PIPE_CONTROL_STATE_CACHE_INVALIDATE | + PIPE_CONTROL_INSTRUCTION_INVALIDATE | + PIPE_CONTROL_NO_WRITE); + + } else { + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] + * PIPELINE_SELECT [DevBWR+]": + * + * Project: PRE-DEVSNB + * + * Software must ensure the current pipeline is flushed via an + * MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT. + */ + BEGIN_BATCH(1); + OUT_BATCH(MI_FLUSH); + ADVANCE_BATCH(); + } + /* Select the pipeline */ BEGIN_BATCH(1); OUT_BATCH(_3DSTATE_PIPELINE_SELECT << 16 | (brw->gen >= 9 ? (3 << 8) : 0) | (pipeline == BRW_COMPUTE_PIPELINE ? 2 : 0)); ADVANCE_BATCH(); + + if (brw->gen == 7 && !brw->is_haswell && + pipeline == BRW_RENDER_PIPELINE) { + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] + * PIPELINE_SELECT [DevBWR+]": + * + * Project: DEVIVB, DEVHSW:GT3:A0 + * + * Software must send a pipe_control with a CS stall and a post sync + * operation and then a dummy DRAW after every MI_SET_CONTEXT and + * after any PIPELINE_SELECT that is enabling 3D mode. + */ + gen7_emit_cs_stall_flush(brw); + + BEGIN_BATCH(7); + OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2)); + OUT_BATCH(_3DPRIM_POINTLIST); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + ADVANCE_BATCH(); + } + + if (brw->use_resource_streamer && pipeline == BRW_RENDER_PIPELINE) { + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] + * PIPELINE_SELECT [DevBWR+]": + * + * Project: HSW, BDW, CHV, SKL, BXT + * + * Hardware Binding Tables are only supported for 3D + * workloads. Resource streamer must be enabled only for 3D + * workloads. Resource streamer must be disabled for Media and GPGPU + * workloads. + */ + BEGIN_BATCH(1); + OUT_BATCH(MI_RS_CONTROL | 1); + ADVANCE_BATCH(); + + gen7_enable_hw_binding_tables(brw); + + /* XXX - Re-enable gather constant pool here. 
*/ + } } /** diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index d983f58765e..d6987c80ed6 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -60,7 +60,7 @@ struct add_const_offset_to_base_params { }; static bool -add_const_offset_to_base(nir_block *block, void *closure) +add_const_offset_to_base_block(nir_block *block, void *closure) { struct add_const_offset_to_base_params *params = closure; nir_builder *b = &params->b; @@ -85,7 +85,19 @@ add_const_offset_to_base(nir_block *block, void *closure) } } return true; +} +static void +add_const_offset_to_base(nir_shader *nir, nir_variable_mode mode) +{ + struct add_const_offset_to_base_params params = { .mode = mode }; + + nir_foreach_function(nir, f) { + if (f->impl) { + nir_builder_init(&params.b, f->impl); + nir_foreach_block(f->impl, add_const_offset_to_base_block, &params); + } + } } static bool @@ -195,10 +207,6 @@ brw_nir_lower_inputs(nir_shader *nir, const struct brw_device_info *devinfo, bool is_scalar) { - struct add_const_offset_to_base_params params = { - .mode = nir_var_shader_in - }; - switch (nir->stage) { case MESA_SHADER_VERTEX: /* Start with the location of the variable's base. */ @@ -212,6 +220,11 @@ brw_nir_lower_inputs(nir_shader *nir, */ nir_lower_io(nir, nir_var_shader_in, type_size_vec4); + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + add_const_offset_to_base(nir, nir_var_shader_in); + if (is_scalar) { /* Finally, translate VERT_ATTRIB_* values into the actual registers. * @@ -221,13 +234,8 @@ brw_nir_lower_inputs(nir_shader *nir, */ GLbitfield64 inputs_read = nir->info.inputs_read; - /* This pass needs actual constants */ - nir_opt_constant_folding(nir); - nir_foreach_function(nir, function) { if (function->impl) { - nir_builder_init(&params.b, function->impl); - nir_foreach_block(function->impl, add_const_offset_to_base, &params); nir_foreach_block(function->impl, remap_vs_attrs, &inputs_read); } } @@ -270,10 +278,10 @@ brw_nir_lower_inputs(nir_shader *nir, /* This pass needs actual constants */ nir_opt_constant_folding(nir); + add_const_offset_to_base(nir, nir_var_shader_in); + nir_foreach_function(nir, function) { if (function->impl) { - nir_builder_init(&params.b, function->impl); - nir_foreach_block(function->impl, add_const_offset_to_base, &params); nir_foreach_block(function->impl, remap_inputs_with_vue_map, &input_vue_map); } @@ -296,10 +304,10 @@ brw_nir_lower_inputs(nir_shader *nir, /* This pass needs actual constants */ nir_opt_constant_folding(nir); + add_const_offset_to_base(nir, nir_var_shader_in); + nir_foreach_function(nir, function) { if (function->impl) { - nir_builder_init(&params.b, function->impl); - nir_foreach_block(function->impl, add_const_offset_to_base, &params); nir_builder_init(&state.b, function->impl); nir_foreach_block(function->impl, remap_patch_urb_offsets, &state); } @@ -339,10 +347,6 @@ brw_nir_lower_outputs(nir_shader *nir, } break; case MESA_SHADER_TESS_CTRL: { - struct add_const_offset_to_base_params params = { - .mode = nir_var_shader_out - }; - struct remap_patch_urb_offsets_state state; brw_compute_tess_vue_map(&state.vue_map, nir->info.outputs_written, nir->info.patch_outputs_written); @@ -356,10 +360,10 @@ brw_nir_lower_outputs(nir_shader *nir, /* This pass needs actual constants */ nir_opt_constant_folding(nir); + add_const_offset_to_base(nir, nir_var_shader_out); + nir_foreach_function(nir, function) { if (function->impl) { - nir_builder_init(&params.b, function->impl); - 
nir_foreach_block(function->impl, add_const_offset_to_base, &params); nir_builder_init(&state.b, function->impl); nir_foreach_block(function->impl, remap_patch_urb_offsets, &state); } diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c index d181468f5cb..c20a02817f9 100644 --- a/src/mesa/drivers/dri/i965/brw_sampler_state.c +++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c @@ -582,7 +582,7 @@ brw_upload_sampler_state_table(struct brw_context *brw, batch_offset_for_sampler_state += size_in_bytes; } - if (brw->gen >= 7) { + if (brw->gen >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) { /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */ gen7_emit_sampler_state_pointers_xs(brw, stage_state); } else { @@ -693,3 +693,23 @@ const struct brw_tracked_state brw_tes_samplers = { }, .emit = brw_upload_tes_samplers, }; + +static void +brw_upload_cs_samplers(struct brw_context *brw) +{ + /* BRW_NEW_COMPUTE_PROGRAM */ + struct gl_program *cs = (struct gl_program *) brw->compute_program; + if (!cs) + return; + + brw_upload_sampler_state_table(brw, cs, &brw->cs.base); +} + +const struct brw_tracked_state brw_cs_samplers = { + .dirty = { + .mesa = _NEW_TEXTURE, + .brw = BRW_NEW_BATCH | + BRW_NEW_COMPUTE_PROGRAM, + }, + .emit = brw_upload_cs_samplers, +}; diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index fec96bac923..e4ce8cbf748 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -29,137 +29,6 @@ #include "brw_vec4_tes.h" #include "main/shaderobj.h" #include "main/uniforms.h" -#include "util/debug.h" - -static void -shader_debug_log_mesa(void *data, const char *fmt, ...) -{ - struct brw_context *brw = (struct brw_context *)data; - va_list args; - - va_start(args, fmt); - GLuint msg_id = 0; - _mesa_gl_vdebug(&brw->ctx, &msg_id, - MESA_DEBUG_SOURCE_SHADER_COMPILER, - MESA_DEBUG_TYPE_OTHER, - MESA_DEBUG_SEVERITY_NOTIFICATION, fmt, args); - va_end(args); -} - -static void -shader_perf_log_mesa(void *data, const char *fmt, ...) 
-{ - struct brw_context *brw = (struct brw_context *)data; - - va_list args; - va_start(args, fmt); - - if (unlikely(INTEL_DEBUG & DEBUG_PERF)) { - va_list args_copy; - va_copy(args_copy, args); - vfprintf(stderr, fmt, args_copy); - va_end(args_copy); - } - - if (brw->perf_debug) { - GLuint msg_id = 0; - _mesa_gl_vdebug(&brw->ctx, &msg_id, - MESA_DEBUG_SOURCE_SHADER_COMPILER, - MESA_DEBUG_TYPE_PERFORMANCE, - MESA_DEBUG_SEVERITY_MEDIUM, fmt, args); - } - va_end(args); -} - -struct brw_compiler * -brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo) -{ - struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler); - - compiler->devinfo = devinfo; - compiler->shader_debug_log = shader_debug_log_mesa; - compiler->shader_perf_log = shader_perf_log_mesa; - - brw_fs_alloc_reg_sets(compiler); - brw_vec4_alloc_reg_set(compiler); - - compiler->scalar_stage[MESA_SHADER_VERTEX] = - devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS); - compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false; - compiler->scalar_stage[MESA_SHADER_TESS_EVAL] = - devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true); - compiler->scalar_stage[MESA_SHADER_GEOMETRY] = - devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false); - compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true; - compiler->scalar_stage[MESA_SHADER_COMPUTE] = true; - - nir_shader_compiler_options *nir_options = - rzalloc(compiler, nir_shader_compiler_options); - nir_options->native_integers = true; - nir_options->vertex_id_zero_based = true; - nir_options->lower_fdiv = true; - /* In order to help allow for better CSE at the NIR level we tell NIR - * to split all ffma instructions during opt_algebraic and we then - * re-combine them as a later step. - */ - nir_options->lower_ffma = true; - nir_options->lower_sub = true; - nir_options->lower_fdiv = true; - nir_options->lower_scmp = true; - nir_options->lower_fmod = true; - nir_options->lower_bitfield_extract = true; - nir_options->lower_bitfield_insert = true; - nir_options->lower_uadd_carry = true; - nir_options->lower_usub_borrow = true; - - /* In the vec4 backend, our dpN instruction replicates its result to all - * the components of a vec4. We would like NIR to give us replicated fdot - * instructions because it can optimize better for us. - * - * For the FS backend, it should be lowered away by the scalarizing pass so - * we should never see fdot anyway. - */ - nir_options->fdot_replicates = true; - - /* We want the GLSL compiler to emit code that uses condition codes */ - for (int i = 0; i < MESA_SHADER_STAGES; i++) { - compiler->glsl_compiler_options[i].MaxUnrollIterations = 32; - compiler->glsl_compiler_options[i].MaxIfDepth = - devinfo->gen < 6 ? 
16 : UINT_MAX; - - compiler->glsl_compiler_options[i].EmitCondCodes = true; - compiler->glsl_compiler_options[i].EmitNoNoise = true; - compiler->glsl_compiler_options[i].EmitNoMainReturn = true; - compiler->glsl_compiler_options[i].EmitNoIndirectInput = true; - compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false; - compiler->glsl_compiler_options[i].LowerClipDistance = true; - - bool is_scalar = compiler->scalar_stage[i]; - - compiler->glsl_compiler_options[i].EmitNoIndirectOutput = is_scalar; - compiler->glsl_compiler_options[i].EmitNoIndirectTemp = is_scalar; - compiler->glsl_compiler_options[i].OptimizeForAOS = !is_scalar; - - /* !ARB_gpu_shader5 */ - if (devinfo->gen < 7) - compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true; - - compiler->glsl_compiler_options[i].NirOptions = nir_options; - - compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true; - } - - compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false; - compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false; - - if (compiler->scalar_stage[MESA_SHADER_GEOMETRY]) - compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false; - - compiler->glsl_compiler_options[MESA_SHADER_COMPUTE] - .LowerShaderSharedVariables = true; - - return compiler; -} extern "C" struct gl_shader * brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type) @@ -444,6 +313,10 @@ brw_instruction_name(enum opcode op) case SHADER_OPCODE_BROADCAST: return "broadcast"; + case SHADER_OPCODE_EXTRACT_BYTE: + return "extract_byte"; + case SHADER_OPCODE_EXTRACT_WORD: + return "extract_word"; case VEC4_OPCODE_MOV_BYTES: return "mov_bytes"; case VEC4_OPCODE_PACK_BYTES: diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h index d29b997b963..f44ccd6e071 100644 --- a/src/mesa/drivers/dri/i965/brw_state.h +++ b/src/mesa/drivers/dri/i965/brw_state.h @@ -75,6 +75,7 @@ extern const struct brw_tracked_state brw_vs_samplers; extern const struct brw_tracked_state brw_tcs_samplers; extern const struct brw_tracked_state brw_tes_samplers; extern const struct brw_tracked_state brw_gs_samplers; +extern const struct brw_tracked_state brw_cs_samplers; extern const struct brw_tracked_state brw_vs_ubo_surfaces; extern const struct brw_tracked_state brw_vs_abo_surfaces; extern const struct brw_tracked_state brw_vs_image_surfaces; @@ -396,6 +397,7 @@ void gen7_update_binding_table_from_array(struct brw_context *brw, gl_shader_stage stage, const uint32_t* binding_table, int num_surfaces); +void gen7_enable_hw_binding_tables(struct brw_context *brw); void gen7_disable_hw_binding_tables(struct brw_context *brw); void gen7_reset_hw_bt_pool_offsets(struct brw_context *brw); diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c index 2a671a58d8c..ee75ca88549 100644 --- a/src/mesa/drivers/dri/i965/brw_state_upload.c +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c @@ -282,6 +282,7 @@ static const struct brw_tracked_state *gen7_compute_atoms[] = &brw_cs_abo_surfaces, &brw_texture_surfaces, &brw_cs_work_groups_surface, + &brw_cs_samplers, &brw_cs_state, }; @@ -396,6 +397,7 @@ static const struct brw_tracked_state *gen8_compute_atoms[] = &brw_cs_abo_surfaces, &brw_texture_surfaces, &brw_cs_work_groups_surface, + &brw_cs_samplers, &brw_cs_state, }; @@ -664,6 +666,7 @@ static struct dirty_bit_map brw_bits[] = { DEFINE_BIT(BRW_NEW_COMPUTE_PROGRAM), DEFINE_BIT(BRW_NEW_CS_WORK_GROUPS), 
DEFINE_BIT(BRW_NEW_URB_SIZE), + DEFINE_BIT(BRW_NEW_CC_STATE), {0, 0, 0} }; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 358a71041fc..394e32169d9 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -1980,11 +1980,11 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, unsigned *final_assembly_size, char **error_str) { + const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX]; nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex, - compiler->scalar_stage[MESA_SHADER_VERTEX]); - shader = brw_postprocess_nir(shader, compiler->devinfo, - compiler->scalar_stage[MESA_SHADER_VERTEX]); + is_scalar); + shader = brw_postprocess_nir(shader, compiler->devinfo, is_scalar); const unsigned *assembly = NULL; @@ -2010,7 +2010,7 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in * vec4 mode, the hardware appears to wedge unless we read something. */ - if (compiler->scalar_stage[MESA_SHADER_VERTEX]) + if (is_scalar) prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2); else prog_data->base.urb_read_length = DIV_ROUND_UP(MAX2(nr_attributes, 1), 2); @@ -2029,7 +2029,7 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, else prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); - if (compiler->scalar_stage[MESA_SHADER_VERTEX]) { + if (is_scalar) { prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base, diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index 531113a9df5..a608dca03ff 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -1062,7 +1062,7 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) case nir_op_umul_high: { struct brw_reg acc = retype(brw_acc_reg(8), dst.type); - if (devinfo->gen >=8) + if (devinfo->gen >= 8) emit(MUL(acc, op[0], retype(op[1], BRW_REGISTER_TYPE_UW))); else emit(MUL(acc, op[0], op[1])); @@ -1376,6 +1376,24 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) case nir_op_pack_unorm_2x16: unreachable("not reached: should be handled by lower_packing_builtins"); + case nir_op_pack_uvec4_to_uint: + unreachable("not reached"); + + case nir_op_pack_uvec2_to_uint: { + dst_reg tmp1 = dst_reg(this, glsl_type::uint_type); + tmp1.writemask = WRITEMASK_X; + op[0].swizzle = BRW_SWIZZLE_YYYY; + emit(SHL(tmp1, op[0], src_reg(brw_imm_ud(16u)))); + + dst_reg tmp2 = dst_reg(this, glsl_type::uint_type); + tmp2.writemask = WRITEMASK_X; + op[0].swizzle = BRW_SWIZZLE_XXXX; + emit(AND(tmp2, op[0], src_reg(brw_imm_ud(0xffffu)))); + + emit(OR(dst, src_reg(tmp1), src_reg(tmp2))); + break; + } + case nir_op_unpack_half_2x16: /* As NIR does not guarantee that we have a correct swizzle outside the * boundaries of a vector, and the implementation of emit_unpack_half_2x16 diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c index fea24368e8c..b66c209b24d 100644 --- a/src/mesa/drivers/dri/i965/brw_vue_map.c +++ b/src/mesa/drivers/dri/i965/brw_vue_map.c @@ -248,6 +248,8 @@ brw_compute_tess_vue_map(struct brw_vue_map *vue_map, static const char * varying_name(brw_varying_slot slot) { + assume(slot < BRW_VARYING_SLOT_COUNT); + if (slot < VARYING_SLOT_MAX) return 
gl_varying_slot_name(slot); @@ -257,7 +259,6 @@ varying_name(brw_varying_slot slot) [BRW_VARYING_SLOT_PNTC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PNTC", }; - assert(slot < BRW_VARYING_SLOT_COUNT); return brw_names[slot - VARYING_SLOT_MAX]; } diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c index 3bab8f46ae8..cee139b7fd4 100644 --- a/src/mesa/drivers/dri/i965/gen6_cc.c +++ b/src/mesa/drivers/dri/i965/gen6_cc.c @@ -298,6 +298,7 @@ const struct brw_tracked_state gen6_color_calc_state = { .mesa = _NEW_COLOR | _NEW_STENCIL, .brw = BRW_NEW_BATCH | + BRW_NEW_CC_STATE | BRW_NEW_STATE_BASE_ADDRESS, }, .emit = gen6_upload_color_calc_state, diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c index a025bb9dd66..6d6988c6a41 100644 --- a/src/mesa/drivers/dri/i965/gen7_cs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c @@ -196,6 +196,7 @@ const struct brw_tracked_state brw_cs_state = { .brw = BRW_NEW_BATCH | BRW_NEW_CS_PROG_DATA | BRW_NEW_PUSH_CONSTANT_ALLOCATION | + BRW_NEW_SAMPLER_STATE_TABLE | BRW_NEW_SURFACES, }, .emit = brw_upload_cs_state diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c index 26f873bc9a9..8ede1f06e4e 100644 --- a/src/mesa/main/bufferobj.c +++ b/src/mesa/main/bufferobj.c @@ -953,7 +953,7 @@ _mesa_handle_bind_buffer_gen(struct gl_context *ctx, { struct gl_buffer_object *buf = *buf_handle; - if (!buf && (ctx->API == API_OPENGL_CORE || _mesa_is_gles31(ctx))) { + if (!buf && (ctx->API == API_OPENGL_CORE)) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(non-gen name)", caller); return false; } diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c index be983d4c86a..f3fd01f395e 100644 --- a/src/mesa/main/context.c +++ b/src/mesa/main/context.c @@ -1930,31 +1930,6 @@ _mesa_check_blend_func_error(struct gl_context *ctx) return GL_TRUE; } -static bool -shader_linked_or_absent(struct gl_context *ctx, - const struct gl_shader_program *shProg, - bool *shader_present, const char *where) -{ - if (shProg) { - *shader_present = true; - - if (!shProg->LinkStatus) { - _mesa_error(ctx, GL_INVALID_OPERATION, "%s(shader not linked)", where); - return false; - } -#if 0 /* not normally enabled */ - { - char errMsg[100]; - if (!_mesa_validate_shader_program(ctx, shProg, errMsg)) { - _mesa_warning(ctx, "Shader program %u is invalid: %s", - shProg->Name, errMsg); - } - } -#endif - } - - return true; -} /** * Prior to drawing anything with glBegin, glDrawArrays, etc. this function @@ -1967,54 +1942,22 @@ shader_linked_or_absent(struct gl_context *ctx, GLboolean _mesa_valid_to_render(struct gl_context *ctx, const char *where) { - unsigned i; - /* This depends on having up to date derived state (shaders) */ if (ctx->NewState) _mesa_update_state(ctx); - if (ctx->API == API_OPENGL_CORE || ctx->API == API_OPENGLES2) { - bool from_glsl_shader[MESA_SHADER_COMPUTE] = { false }; - - for (i = 0; i < MESA_SHADER_COMPUTE; i++) { - if (!shader_linked_or_absent(ctx, ctx->_Shader->CurrentProgram[i], - &from_glsl_shader[i], where)) - return GL_FALSE; - } - - /* In OpenGL Core Profile and OpenGL ES 2.0 / 3.0, there are no assembly - * shaders. Don't check state related to those. - */ - } else { - bool has_vertex_shader = false; - bool has_fragment_shader = false; - - /* In OpenGL Compatibility Profile, there is only vertex shader and - * fragment shader. We take this path also for API_OPENGLES because - * optimizing that path would make the other (more common) paths - * slightly slower. 
- */ - if (!shader_linked_or_absent(ctx, - ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX], - &has_vertex_shader, where)) - return GL_FALSE; - - if (!shader_linked_or_absent(ctx, - ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT], - &has_fragment_shader, where)) - return GL_FALSE; - + if (ctx->API == API_OPENGL_COMPAT) { /* Any shader stages that are not supplied by the GLSL shader and have * assembly shaders enabled must now be validated. */ - if (!has_vertex_shader + if (!ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX] && ctx->VertexProgram.Enabled && !ctx->VertexProgram._Enabled) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(vertex program not valid)", where); return GL_FALSE; } - if (!has_fragment_shader) { + if (!ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT]) { if (ctx->FragmentProgram.Enabled && !ctx->FragmentProgram._Enabled) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(fragment program not valid)", where); diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h index 70ed5633f7b..d4378e51159 100644 --- a/src/mesa/main/dd.h +++ b/src/mesa/main/dd.h @@ -762,6 +762,12 @@ struct dd_function_table { void (*UseProgram)(struct gl_context *ctx, struct gl_shader_program *shProg); /*@}*/ + /** + * \name GREMEDY debug/marker functions + */ + /*@{*/ + void (*EmitStringMarker)(struct gl_context *ctx, const GLchar *string, GLsizei len); + /*@}*/ /** * \name Support for multiple T&L engines diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c index ba2e670eb9a..cd8e3b6a2f2 100644 --- a/src/mesa/main/dlist.c +++ b/src/mesa/main/dlist.c @@ -5982,9 +5982,8 @@ save_DrawTransformFeedbackStreamInstanced(GLenum mode, GLuint name, } } -/* aka UseProgram() */ static void GLAPIENTRY -save_UseProgramObjectARB(GLhandleARB program) +save_UseProgram(GLuint program) { GET_CURRENT_CONTEXT(ctx); Node *n; @@ -9454,7 +9453,7 @@ _mesa_initialize_save_table(const struct gl_context *ctx) SET_BlitFramebuffer(table, save_BlitFramebufferEXT); - SET_UseProgram(table, save_UseProgramObjectARB); + SET_UseProgram(table, save_UseProgram); SET_Uniform1f(table, save_Uniform1fARB); SET_Uniform2f(table, save_Uniform2fARB); SET_Uniform3f(table, save_Uniform3fARB); diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c index 9e6610918c4..674364c7b0c 100644 --- a/src/mesa/main/errors.c +++ b/src/mesa/main/errors.c @@ -1018,6 +1018,13 @@ _mesa_DebugMessageInsert(GLenum source, GLenum type, GLuint id, gl_enum_to_debug_type(type), id, gl_enum_to_debug_severity(severity), length, buf); + + if (type == GL_DEBUG_TYPE_MARKER && ctx->Driver.EmitStringMarker) { + /* if length not specified, string will be null terminated: */ + if (length < 0) + length = strlen(buf); + ctx->Driver.EmitStringMarker(ctx, buf, length); + } } @@ -1276,6 +1283,19 @@ _mesa_free_errors_data(struct gl_context *ctx) mtx_destroy(&ctx->DebugMutex); } +void GLAPIENTRY +_mesa_StringMarkerGREMEDY(GLsizei len, const GLvoid *string) +{ + GET_CURRENT_CONTEXT(ctx); + if (ctx->Extensions.GREMEDY_string_marker) { + /* if length not specified, string will be null terminated: */ + if (len <= 0) + len = strlen(string); + ctx->Driver.EmitStringMarker(ctx, string, len); + } else { + _mesa_error(ctx, GL_INVALID_OPERATION, "StringMarkerGREMEDY"); + } +} /**********************************************************************/ /** \name Diagnostics */ diff --git a/src/mesa/main/errors.h b/src/mesa/main/errors.h index f2919765488..92df2ac868a 100644 --- a/src/mesa/main/errors.h +++ b/src/mesa/main/errors.h @@ -138,6 +138,9 @@ _mesa_PushDebugGroup(GLenum source, GLuint id, 
GLsizei length, void GLAPIENTRY _mesa_PopDebugGroup(void); +void GLAPIENTRY +_mesa_StringMarkerGREMEDY(GLsizei len, const GLvoid *string); + #ifdef __cplusplus } #endif diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h index aeccb017423..9cec1762dbe 100644 --- a/src/mesa/main/extensions_table.h +++ b/src/mesa/main/extensions_table.h @@ -251,6 +251,8 @@ EXT(EXT_unpack_subimage , dummy_true EXT(EXT_vertex_array , dummy_true , GLL, x , x , x , 1995) EXT(EXT_vertex_array_bgra , EXT_vertex_array_bgra , GLL, GLC, x , x , 2008) +EXT(GREMEDY_string_marker , GREMEDY_string_marker , GLL, GLC, x , x , 2007) + EXT(IBM_multimode_draw_arrays , dummy_true , GLL, GLC, x , x , 1998) EXT(IBM_rasterpos_clip , dummy_true , GLL, x , x , x , 1996) EXT(IBM_texture_mirrored_repeat , dummy_true , GLL, x , x , x , 1998) diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 1c717feabc2..3a0b89f4572 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -3885,6 +3885,7 @@ struct gl_extensions GLboolean ATI_texture_env_combine3; GLboolean ATI_fragment_shader; GLboolean ATI_separate_stencil; + GLboolean GREMEDY_string_marker; GLboolean INTEL_performance_query; GLboolean KHR_texture_compression_astc_hdr; GLboolean KHR_texture_compression_astc_ldr; diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index a18b860022d..e902585924a 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -64,8 +64,8 @@ DECL_RESOURCE_FUNC(XFB, gl_transform_feedback_varying_info); DECL_RESOURCE_FUNC(SUB, gl_subroutine_function); void GLAPIENTRY -_mesa_BindAttribLocation(GLhandleARB program, GLuint index, - const GLcharARB *name) +_mesa_BindAttribLocation(GLuint program, GLuint index, + const GLchar *name) { GET_CURRENT_CONTEXT(ctx); @@ -126,9 +126,9 @@ is_active_attrib(const gl_shader_variable *var) } void GLAPIENTRY -_mesa_GetActiveAttrib(GLhandleARB program, GLuint desired_index, - GLsizei maxLength, GLsizei * length, GLint * size, - GLenum * type, GLcharARB * name) +_mesa_GetActiveAttrib(GLuint program, GLuint desired_index, + GLsizei maxLength, GLsizei * length, GLint * size, + GLenum * type, GLchar * name) { GET_CURRENT_CONTEXT(ctx); struct gl_shader_program *shProg; @@ -191,7 +191,7 @@ _mesa_GetActiveAttrib(GLhandleARB program, GLuint desired_index, } GLint GLAPIENTRY -_mesa_GetAttribLocation(GLhandleARB program, const GLcharARB * name) +_mesa_GetAttribLocation(GLuint program, const GLchar * name) { GET_CURRENT_CONTEXT(ctx); struct gl_shader_program *const shProg = diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c index cdc85f3413b..5854369a28c 100644 --- a/src/mesa/main/shaderapi.c +++ b/src/mesa/main/shaderapi.c @@ -1265,7 +1265,7 @@ _mesa_AttachShader(GLuint program, GLuint shader) void GLAPIENTRY -_mesa_CompileShader(GLhandleARB shaderObj) +_mesa_CompileShader(GLuint shaderObj) { GET_CURRENT_CONTEXT(ctx); if (MESA_VERBOSE & VERBOSE_API) @@ -1315,7 +1315,7 @@ _mesa_DeleteObjectARB(GLhandleARB obj) { if (MESA_VERBOSE & VERBOSE_API) { GET_CURRENT_CONTEXT(ctx); - _mesa_debug(ctx, "glDeleteObjectARB(%u)\n", obj); + _mesa_debug(ctx, "glDeleteObjectARB(%lu)\n", (unsigned long)obj); } if (obj) { @@ -1374,10 +1374,26 @@ _mesa_DetachShader(GLuint program, GLuint shader) void GLAPIENTRY _mesa_GetAttachedObjectsARB(GLhandleARB container, GLsizei maxCount, - GLsizei * count, GLhandleARB * obj) + GLsizei * count, GLhandleARB * objARB) { + int i; + GLuint *obj; + GET_CURRENT_CONTEXT(ctx); + + obj = calloc(maxCount, 
sizeof(GLuint)); + if (!obj) { + _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetAttachedObjectsARB"); + return; + } + get_attached_shaders(ctx, container, maxCount, count, obj); + + for (i = 0 ; i < *count; i++) { + objARB[i] = (GLhandleARB)obj[i]; + } + + free(obj); } @@ -1479,8 +1495,8 @@ _mesa_GetShaderInfoLog(GLuint shader, GLsizei bufSize, void GLAPIENTRY -_mesa_GetShaderSource(GLhandleARB shader, GLsizei maxLength, - GLsizei *length, GLcharARB *sourceOut) +_mesa_GetShaderSource(GLuint shader, GLsizei maxLength, + GLsizei *length, GLchar *sourceOut) { GET_CURRENT_CONTEXT(ctx); get_shader_source(ctx, shader, maxLength, length, sourceOut); @@ -1512,7 +1528,7 @@ _mesa_IsShader(GLuint name) void GLAPIENTRY -_mesa_LinkProgram(GLhandleARB programObj) +_mesa_LinkProgram(GLuint programObj) { GET_CURRENT_CONTEXT(ctx); if (MESA_VERBOSE & VERBOSE_API) @@ -1641,8 +1657,8 @@ read_shader(const gl_shader_stage stage, const char *source) * and pass it to _mesa_shader_source(). */ void GLAPIENTRY -_mesa_ShaderSource(GLhandleARB shaderObj, GLsizei count, - const GLcharARB * const * string, const GLint * length) +_mesa_ShaderSource(GLuint shaderObj, GLsizei count, + const GLchar * const * string, const GLint * length) { GET_CURRENT_CONTEXT(ctx); GLint *offsets; @@ -1729,7 +1745,7 @@ _mesa_ShaderSource(GLhandleARB shaderObj, GLsizei count, void GLAPIENTRY -_mesa_UseProgram(GLhandleARB program) +_mesa_UseProgram(GLuint program) { GET_CURRENT_CONTEXT(ctx); struct gl_shader_program *shProg; @@ -1791,7 +1807,7 @@ _mesa_UseProgram(GLhandleARB program) void GLAPIENTRY -_mesa_ValidateProgram(GLhandleARB program) +_mesa_ValidateProgram(GLuint program) { GET_CURRENT_CONTEXT(ctx); validate_program(ctx, program); @@ -2530,6 +2546,11 @@ _mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count, i = 0; do { struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[i]; + if (uni == NULL) { + i++; + continue; + } + int uni_count = uni->array_elements ? uni->array_elements : 1; int j, k; @@ -2557,6 +2578,11 @@ _mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count, i = 0; do { struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[i]; + if (uni == NULL) { + i++; + continue; + } + int uni_count = uni->array_elements ? 
uni->array_elements : 1; memcpy(&uni->storage[0], &indices[i], diff --git a/src/mesa/main/shaderapi.h b/src/mesa/main/shaderapi.h index fba767bf4c1..8922c4d0640 100644 --- a/src/mesa/main/shaderapi.h +++ b/src/mesa/main/shaderapi.h @@ -64,7 +64,7 @@ extern void GLAPIENTRY _mesa_AttachObjectARB(GLhandleARB, GLhandleARB); extern void GLAPIENTRY -_mesa_CompileShader(GLhandleARB); +_mesa_CompileShader(GLuint); extern GLhandleARB GLAPIENTRY _mesa_CreateProgramObjectARB(void); @@ -100,7 +100,7 @@ extern void GLAPIENTRY _mesa_GetObjectParameterivARB(GLhandleARB, GLenum, GLint *); extern void GLAPIENTRY -_mesa_GetShaderSource(GLhandleARB, GLsizei, GLsizei *, GLcharARB *); +_mesa_GetShaderSource(GLuint, GLsizei, GLsizei *, GLchar *); extern GLboolean GLAPIENTRY _mesa_IsProgram(GLuint name); @@ -109,20 +109,20 @@ extern GLboolean GLAPIENTRY _mesa_IsShader(GLuint name); extern void GLAPIENTRY -_mesa_LinkProgram(GLhandleARB programObj); +_mesa_LinkProgram(GLuint programObj); extern void GLAPIENTRY -_mesa_ShaderSource(GLhandleARB, GLsizei, const GLcharARB* const *, const GLint *); +_mesa_ShaderSource(GLuint, GLsizei, const GLchar* const *, const GLint *); extern void GLAPIENTRY -_mesa_UseProgram(GLhandleARB); +_mesa_UseProgram(GLuint); extern void GLAPIENTRY -_mesa_ValidateProgram(GLhandleARB); +_mesa_ValidateProgram(GLuint); extern void GLAPIENTRY -_mesa_BindAttribLocation(GLhandleARB, GLuint, const GLcharARB *); +_mesa_BindAttribLocation(GLuint program, GLuint, const GLchar *); extern void GLAPIENTRY _mesa_BindFragDataLocation(GLuint program, GLuint colorNumber, @@ -133,11 +133,11 @@ _mesa_BindFragDataLocationIndexed(GLuint program, GLuint colorNumber, GLuint index, const GLchar *name); extern void GLAPIENTRY -_mesa_GetActiveAttrib(GLhandleARB, GLuint, GLsizei, GLsizei *, GLint *, - GLenum *, GLcharARB *); +_mesa_GetActiveAttrib(GLuint, GLuint, GLsizei, GLsizei *, GLint *, + GLenum *, GLchar *); extern GLint GLAPIENTRY -_mesa_GetAttribLocation(GLhandleARB, const GLcharARB *); +_mesa_GetAttribLocation(GLuint, const GLchar *); diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp index 7610bcbd701..eb1108124e9 100644 --- a/src/mesa/main/tests/dispatch_sanity.cpp +++ b/src/mesa/main/tests/dispatch_sanity.cpp @@ -940,6 +940,9 @@ const struct function common_desktop_functions_possible[] = { { "glGetTextureSubImage", 20, -1 }, { "glGetCompressedTextureSubImage", 20, -1 }, + /* GL_GREMEDY_string_marker */ + { "glStringMarkerGREMEDY", 15, -1 }, + { NULL, 0, -1 } }; diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c index b107a8f8678..e926c7b6cd2 100644 --- a/src/mesa/main/texobj.c +++ b/src/mesa/main/texobj.c @@ -769,7 +769,8 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx, } if (t->Target == GL_TEXTURE_CUBE_MAP_ARB) { - /* Make sure that all six cube map level 0 images are the same size. + /* Make sure that all six cube map level 0 images are the same size and + * format. * Note: we know that the image's width==height (we enforce that * at glTexImage time) so we only need to test the width here. 
*/ @@ -784,6 +785,15 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx, incomplete(t, BASE, "Cube face missing or mismatched size"); return; } + if (t->Image[face][baseLevel]->InternalFormat != + baseImage->InternalFormat) { + incomplete(t, BASE, "Cube face format mismatch"); + return; + } + if (t->Image[face][baseLevel]->Border != baseImage->Border) { + incomplete(t, BASE, "Cube face border size mismatch"); + return; + } } } @@ -858,16 +868,6 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx, img->Depth2); return; } - - /* Extra checks for cube textures */ - if (face > 0) { - /* check that cube faces are the same size */ - if (img->Width2 != t->Image[0][i]->Width2 || - img->Height2 != t->Image[0][i]->Height2) { - incomplete(t, MIPMAP, "CubeMap Image[n][i] bad size"); - return; - } - } } } diff --git a/src/mesa/main/varray.c b/src/mesa/main/varray.c index c71e16a1e56..c2bf2951687 100644 --- a/src/mesa/main/varray.c +++ b/src/mesa/main/varray.c @@ -1744,6 +1744,10 @@ vertex_array_vertex_buffer(struct gl_context *ctx, } else if (buffer != 0) { vbo = _mesa_lookup_bufferobj(ctx, buffer); + if (!vbo && _mesa_is_gles31(ctx)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s(non-gen name)", func); + return; + } /* From the GL_ARB_vertex_attrib_array spec: * * "[Core profile only:] diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp index 9da9733438d..88d8337bb3e 100644 --- a/src/mesa/program/ir_to_mesa.cpp +++ b/src/mesa/program/ir_to_mesa.cpp @@ -1245,10 +1245,7 @@ ir_to_mesa_visitor::visit(ir_expression *ir) case ir_unop_unpack_unorm_2x16: case ir_unop_unpack_unorm_4x8: case ir_unop_unpack_half_2x16: - case ir_unop_unpack_half_2x16_split_x: - case ir_unop_unpack_half_2x16_split_y: case ir_unop_unpack_double_2x32: - case ir_binop_pack_half_2x16_split: case ir_unop_bitfield_reverse: case ir_unop_bit_count: case ir_unop_find_msb: diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c index f8b367989e7..0ceb37027e1 100644 --- a/src/mesa/state_tracker/st_cb_texture.c +++ b/src/mesa/state_tracker/st_cb_texture.c @@ -1670,6 +1670,12 @@ st_finalize_texture(struct gl_context *ctx, width = stObj->width0; height = stObj->height0; depth = stObj->depth0; + } else { + /* The width/height/depth may have been previously reset in + * guess_and_alloc_texture. 
*/ + stObj->width0 = width; + stObj->height0 = height; + stObj->depth0 = depth; } /* convert GL dims to Gallium dims */ st_gl_texture_dims_to_pipe_dims(stObj->base.Target, width, height, depth, diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c index 4add50e3ed9..ce1e97aacb5 100644 --- a/src/mesa/state_tracker/st_context.c +++ b/src/mesa/state_tracker/st_context.c @@ -438,6 +438,12 @@ void st_destroy_context( struct st_context *st ) free(ctx); } +static void +st_emit_string_marker(struct gl_context *ctx, const GLchar *string, GLsizei len) +{ + struct st_context *st = ctx->st; + st->pipe->emit_string_marker(st->pipe, string, len); +} void st_init_driver_functions(struct pipe_screen *screen, struct dd_function_table *functions) @@ -476,6 +482,9 @@ void st_init_driver_functions(struct pipe_screen *screen, st_init_vdpau_functions(functions); + if (screen->get_param(screen, PIPE_CAP_STRING_MARKER)) + functions->EmitStringMarker = st_emit_string_marker; + functions->Enable = st_Enable; functions->UpdateState = st_invalidate_state; } diff --git a/src/mesa/state_tracker/st_debug.c b/src/mesa/state_tracker/st_debug.c index 134366db09d..9eb3b53b230 100644 --- a/src/mesa/state_tracker/st_debug.c +++ b/src/mesa/state_tracker/st_debug.c @@ -57,6 +57,7 @@ static const struct debug_named_value st_debug_flags[] = { { "buffer", DEBUG_BUFFER, NULL }, { "wf", DEBUG_WIREFRAME, NULL }, { "precompile", DEBUG_PRECOMPILE, NULL }, + { "gremedy", DEBUG_GREMEDY, "Enable GREMEDY debug extensions" }, DEBUG_NAMED_VALUE_END }; diff --git a/src/mesa/state_tracker/st_debug.h b/src/mesa/state_tracker/st_debug.h index ed3ead82914..a094fdc2bfa 100644 --- a/src/mesa/state_tracker/st_debug.h +++ b/src/mesa/state_tracker/st_debug.h @@ -50,6 +50,7 @@ st_print_current(void); #define DEBUG_BUFFER 0x200 #define DEBUG_WIREFRAME 0x400 #define DEBUG_PRECOMPILE 0x800 +#define DEBUG_GREMEDY 0x1000 #ifdef DEBUG extern int ST_DEBUG; diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c index 2a3e52362e4..53ea6767395 100644 --- a/src/mesa/state_tracker/st_extensions.c +++ b/src/mesa/state_tracker/st_extensions.c @@ -37,6 +37,7 @@ #include "util/u_math.h" #include "st_context.h" +#include "st_debug.h" #include "st_extensions.h" #include "st_format.h" @@ -973,4 +974,8 @@ void st_init_extensions(struct pipe_screen *screen, extensions->ARB_gpu_shader_fp64 = GL_TRUE; extensions->ARB_vertex_attrib_64bit = GL_TRUE; } + + if ((ST_DEBUG & DEBUG_GREMEDY) && + screen->get_param(screen, PIPE_CAP_STRING_MARKER)) + extensions->GREMEDY_string_marker = GL_TRUE; } diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index d424e3b335f..a06683f31c8 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -2177,12 +2177,9 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir) case ir_unop_unpack_snorm_2x16: case ir_unop_unpack_unorm_2x16: - case ir_unop_unpack_half_2x16_split_x: - case ir_unop_unpack_half_2x16_split_y: case ir_unop_unpack_snorm_4x8: case ir_unop_unpack_unorm_4x8: - case ir_binop_pack_half_2x16_split: case ir_quadop_vector: case ir_binop_vector_extract: case ir_triop_vector_insert:
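
To show how the GREMEDY_string_marker plumbing added in this series fits together from the application side, here is a rough usage sketch. It is an illustration only: the get_proc_address() loader stands in for glXGetProcAddress or eglGetProcAddress, and the extension check uses the compatibility-profile glGetString path, none of which is part of this patch.

#include <string.h>
#include <GL/gl.h>
#include <GL/glext.h>   /* PFNGLSTRINGMARKERGREMEDYPROC */

/* Loader supplied by the window-system layer (assumed, not part of the patch). */
typedef void (*gl_generic_func)(void);
extern gl_generic_func get_proc_address(const char *name);

static PFNGLSTRINGMARKERGREMEDYPROC string_marker;

static void
init_string_marker(void)
{
   const char *ext = (const char *) glGetString(GL_EXTENSIONS);

   if (ext && strstr(ext, "GL_GREMEDY_string_marker"))
      string_marker = (PFNGLSTRINGMARKERGREMEDYPROC)
         get_proc_address("glStringMarkerGREMEDY");
}

static void
draw_frame(void)
{
   /* A len of 0 means the string is null-terminated, matching the
    * handling in _mesa_StringMarkerGREMEDY() above. */
   if (string_marker)
      string_marker(0, "frame start");

   /* ... regular GL draw calls ... */

   if (string_marker)
      string_marker(0, "frame end");
}

With the gallium state tracker, the marker only reaches the driver when it reports PIPE_CAP_STRING_MARKER and the extension has been enabled via ST_DEBUG=gremedy, as set up in st_extensions.c and st_context.c above.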