diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources index 5325f974cda..6f50f714c3f 100644 --- a/src/gallium/auxiliary/Makefile.sources +++ b/src/gallium/auxiliary/Makefile.sources @@ -271,6 +271,7 @@ C_SOURCES := \ util/u_prim_restart.h \ util/u_pstipple.c \ util/u_pstipple.h \ + util/u_pwr8.h \ util/u_range.h \ util/u_rect.h \ util/u_resource.c \ diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index 142d78ae49d..b48bdcc779e 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -1618,6 +1618,12 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, context_ptr = LLVMGetParam(variant_func, 0); io_ptr = LLVMGetParam(variant_func, 1); vbuffers_ptr = LLVMGetParam(variant_func, 2); + /* + * XXX: stride is actually unused. The stride we use is strictly calculated + * from the number of outputs (including the draw_extra outputs). + * Should probably fix some day (we need a new vs just because of extra + * outputs which the generated vs won't touch). + */ stride = LLVMGetParam(variant_func, 5 + (elts ? 1 : 0)); vb_ptr = LLVMGetParam(variant_func, 6 + (elts ? 1 : 0)); system_values.instance_id = LLVMGetParam(variant_func, 7 + (elts ? 1 : 0)); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c index cdf6d80c261..0b0f7f0147c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c @@ -461,50 +461,49 @@ lp_build_pack2(struct gallivm_state *gallivm, assert(src_type.length * 2 == dst_type.length); /* Check for special cases first */ - if((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) && - src_type.width * src_type.length >= 128) { + if ((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) && + src_type.width * src_type.length >= 128) { const char *intrinsic = NULL; boolean swap_intrinsic_operands = FALSE; switch(src_type.width) { case 32: if (util_cpu_caps.has_sse2) { - if(dst_type.sign) { + if (dst_type.sign) { intrinsic = "llvm.x86.sse2.packssdw.128"; - } - else { + } else { if (util_cpu_caps.has_sse4_1) { intrinsic = "llvm.x86.sse41.packusdw"; } } } else if (util_cpu_caps.has_altivec) { if (dst_type.sign) { - intrinsic = "llvm.ppc.altivec.vpkswus"; - } else { - intrinsic = "llvm.ppc.altivec.vpkuwus"; - } + intrinsic = "llvm.ppc.altivec.vpkswss"; + } else { + intrinsic = "llvm.ppc.altivec.vpkuwus"; + } #ifdef PIPE_ARCH_LITTLE_ENDIAN - swap_intrinsic_operands = TRUE; + swap_intrinsic_operands = TRUE; #endif } break; case 16: if (dst_type.sign) { if (util_cpu_caps.has_sse2) { - intrinsic = "llvm.x86.sse2.packsswb.128"; + intrinsic = "llvm.x86.sse2.packsswb.128"; } else if (util_cpu_caps.has_altivec) { - intrinsic = "llvm.ppc.altivec.vpkshss"; + intrinsic = "llvm.ppc.altivec.vpkshss"; #ifdef PIPE_ARCH_LITTLE_ENDIAN - swap_intrinsic_operands = TRUE; + swap_intrinsic_operands = TRUE; #endif } } else { if (util_cpu_caps.has_sse2) { - intrinsic = "llvm.x86.sse2.packuswb.128"; + intrinsic = "llvm.x86.sse2.packuswb.128"; } else if (util_cpu_caps.has_altivec) { - intrinsic = "llvm.ppc.altivec.vpkshus"; + intrinsic = "llvm.ppc.altivec.vpkshus"; #ifdef PIPE_ARCH_LITTLE_ENDIAN - swap_intrinsic_operands = TRUE; + swap_intrinsic_operands = TRUE; #endif } } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c index 3d5e2cb316b..6f75bec5005 100644 --- 
a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c @@ -1536,8 +1536,22 @@ mod_emit_cpu( struct lp_build_tgsi_context * bld_base, struct lp_build_emit_data * emit_data) { - emit_data->output[emit_data->chan] = lp_build_mod(&bld_base->int_bld, - emit_data->args[0], emit_data->args[1]); + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMValueRef div_mask = lp_build_cmp(&bld_base->uint_bld, + PIPE_FUNC_EQUAL, emit_data->args[1], + bld_base->uint_bld.zero); + /* We want to make sure that we never divide/mod by zero so we don't + * generate SIGFPE. We don't want to crash just because the + * shader is doing something weird. */ + LLVMValueRef divisor = LLVMBuildOr(builder, + div_mask, + emit_data->args[1], ""); + LLVMValueRef result = lp_build_mod(&bld_base->int_bld, + emit_data->args[0], divisor); + /* umod by zero doesn't have a guaranteed return value; chose -1 for now. */ + emit_data->output[emit_data->chan] = LLVMBuildOr(builder, + div_mask, + result, ""); } /* TGSI_OPCODE_NOT */ diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c index 7c577592f70..dfda80f228f 100644 --- a/src/gallium/auxiliary/nir/tgsi_to_nir.c +++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c @@ -673,10 +673,6 @@ ttn_get_dest(struct ttn_compile *c, struct tgsi_full_dst_register *tgsi_fdst) if (tgsi_dst->File == TGSI_FILE_TEMPORARY) { if (c->temp_regs[index].var) { - nir_builder *b = &c->build; - nir_intrinsic_instr *load; - struct tgsi_ind_register *indirect = - tgsi_dst->Indirect ? &tgsi_fdst->Indirect : NULL; nir_register *reg; /* this works, because TGSI will give us a base offset @@ -690,26 +686,6 @@ ttn_get_dest(struct ttn_compile *c, struct tgsi_full_dst_register *tgsi_fdst) reg->num_components = 4; dest.dest.reg.reg = reg; dest.dest.reg.base_offset = 0; - - /* since the alu op might not write to all components - * of the temporary, we must first do a load_var to - * get the previous array elements into the register.
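The mod_emit_cpu() change above avoids a divide-by-zero trap by masking the divisor before the mod and then forcing the affected lanes of the result. A minimal scalar sketch of the same trick (assuming lp_build_cmp()'s all-ones/all-zeros mask semantics; safe_umod() is a hypothetical name, not part of the patch):

#include <stdint.h>

/* Scalar sketch of the guard used in mod_emit_cpu() above: force the
 * divisor to all-ones where it is zero so the mod cannot trap, then
 * force the result to all-ones (-1) in those lanes. */
static uint32_t safe_umod(uint32_t a, uint32_t b)
{
	uint32_t div_mask = (b == 0) ? ~0u : 0u; /* lp_build_cmp(EQUAL, b, 0) */
	uint32_t divisor  = div_mask | b;        /* never zero */
	uint32_t result   = a % divisor;         /* cannot raise SIGFPE */
	return div_mask | result;                /* 0xffffffff when b == 0 */
}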
- * This is one area that NIR could use a bit of - * improvement (or opt pass to clean up the mess - * once things are scalarized) - */ - - load = nir_intrinsic_instr_create(c->build.shader, - nir_intrinsic_load_var); - load->num_components = 4; - load->variables[0] = - ttn_array_deref(c, load, c->temp_regs[index].var, - c->temp_regs[index].offset, - indirect); - - load->dest = nir_dest_for_reg(reg); - - nir_builder_instr_insert(b, &load->instr); } else { assert(!tgsi_dst->Indirect); dest.dest.reg.reg = c->temp_regs[index].reg; @@ -1886,7 +1862,7 @@ ttn_emit_instruction(struct ttn_compile *c) ttn_move_dest(b, dest, nir_fsat(b, ttn_src_for_dest(b, &dest))); } - /* if the dst has a matching var, append store_global to move + /* if the dst has a matching var, append store_var to move * output from reg to var */ nir_variable *var = ttn_get_var(c, tgsi_dst); @@ -1899,7 +1875,7 @@ ttn_emit_instruction(struct ttn_compile *c) &tgsi_dst->Indirect : NULL; store->num_components = 4; - store->const_index[0] = 0xf; + store->const_index[0] = dest.write_mask; store->variables[0] = ttn_array_deref(c, store, var, offset, indirect); store->src[0] = nir_src_for_reg(dest.dest.reg.reg); @@ -1932,6 +1908,7 @@ ttn_add_output_stores(struct ttn_compile *c) store->src[0].reg.reg = c->output_regs[loc].reg; store->src[0].reg.base_offset = c->output_regs[loc].offset; store->const_index[0] = loc; + store->const_index[1] = 0xf; /* writemask */ store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0)); nir_builder_instr_insert(b, &store->instr); } diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c index ea207461d27..83f50628b40 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_build.c +++ b/src/gallium/auxiliary/tgsi/tgsi_build.c @@ -110,6 +110,7 @@ tgsi_default_declaration( void ) declaration.Invariant = 0; declaration.Local = 0; declaration.Array = 0; + declaration.Atomic = 0; declaration.Padding = 0; return declaration; diff --git a/src/gallium/auxiliary/util/u_pstipple.c b/src/gallium/auxiliary/util/u_pstipple.c index 08dec13846d..3428172203b 100644 --- a/src/gallium/auxiliary/util/u_pstipple.c +++ b/src/gallium/auxiliary/util/u_pstipple.c @@ -230,6 +230,7 @@ pstip_transform_immed(struct tgsi_transform_context *ctx, struct pstip_transform_context *pctx = (struct pstip_transform_context *) ctx; pctx->numImmed++; + ctx->emit_immediate(ctx, immed); } diff --git a/src/gallium/auxiliary/util/u_pwr8.h b/src/gallium/auxiliary/util/u_pwr8.h index 1eca6d6df2c..ffd9f923142 100644 --- a/src/gallium/auxiliary/util/u_pwr8.h +++ b/src/gallium/auxiliary/util/u_pwr8.h @@ -153,6 +153,12 @@ vec_mullo_epi32 (__m128i a, __m128i b) return v; } +static inline __m128i +vec_andnot_si128 (__m128i a, __m128i b) +{ + return vec_andc (b, a); +} + static inline void transpose4_epi32(const __m128i * restrict a, const __m128i * restrict b, diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst index d7ea123b0e9..b461810644a 100644 --- a/src/gallium/docs/source/screen.rst +++ b/src/gallium/docs/source/screen.rst @@ -305,6 +305,7 @@ The integer capabilities: for buffers is supported. * ``PIPE_CAP_GENERATE_MIPMAP``: Indicates whether pipe_context::generate_mipmap is supported. +* ``PIPE_CAP_STRING_MARKER``: Whether pipe->emit_string_marker() is supported. .. 
_pipe_capf: diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_context.c b/src/gallium/drivers/freedreno/a2xx/fd2_context.c index 3bed73573a6..058f8219ed5 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_context.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_context.c @@ -109,6 +109,7 @@ fd2_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) fd2_gmem_init(pctx); fd2_texture_init(pctx); fd2_prog_init(pctx); + fd2_emit_init(pctx); pctx = fd_context_init(&fd2_ctx->base, pscreen, (screen->gpu_id >= 220) ? a22x_primtypes : a20x_primtypes, diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c index cc0ed59f300..4f667ab7d57 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c @@ -446,3 +446,17 @@ fd2_emit_setup(struct fd_context *ctx) fd_ringbuffer_flush(ring); fd_ringmarker_mark(ctx->draw_start); } + +static void +fd2_emit_ib(struct fd_ringbuffer *ring, struct fd_ringmarker *start, + struct fd_ringmarker *end) +{ + __OUT_IB(ring, false, start, end); +} + +void +fd2_emit_init(struct pipe_context *pctx) +{ + struct fd_context *ctx = fd_context(pctx); + ctx->emit_ib = fd2_emit_ib; +} diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h index 8ee04632091..3c146c17151 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h @@ -45,4 +45,6 @@ void fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t val, void fd2_emit_state(struct fd_context *ctx, uint32_t dirty); void fd2_emit_setup(struct fd_context *ctx); +void fd2_emit_init(struct pipe_context *pctx); + #endif /* FD2_EMIT_H */ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index e65a352e7f6..811f58bbba2 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -891,10 +891,18 @@ fd3_emit_restore(struct fd_context *ctx) ctx->needs_rb_fbd = true; } +static void +fd3_emit_ib(struct fd_ringbuffer *ring, struct fd_ringmarker *start, + struct fd_ringmarker *end) +{ + __OUT_IB(ring, true, start, end); +} + void fd3_emit_init(struct pipe_context *pctx) { struct fd_context *ctx = fd_context(pctx); ctx->emit_const = fd3_emit_const; ctx->emit_const_bo = fd3_emit_const_bo; + ctx->emit_ib = fd3_emit_ib; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c index 21fb59e450d..2ce393a41ae 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c @@ -853,7 +853,7 @@ emit_binning_pass(struct fd_context *ctx) A3XX_PC_VSTREAM_CONTROL_N(0)); /* emit IB to binning drawcmds: */ - OUT_IB(ring, ctx->binning_start, ctx->binning_end); + ctx->emit_ib(ring, ctx->binning_start, ctx->binning_end); fd_reset_wfi(ctx); fd_wfi(ctx, ring); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c index bc62a5d9a4b..4a3f1da30ed 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c @@ -885,10 +885,18 @@ fd4_emit_restore(struct fd_context *ctx) ctx->needs_rb_fbd = true; } +static void +fd4_emit_ib(struct fd_ringbuffer *ring, struct fd_ringmarker *start, + struct fd_ringmarker *end) +{ + __OUT_IB(ring, true, start, end); +} + void fd4_emit_init(struct pipe_context *pctx) { struct fd_context *ctx = fd_context(pctx); 
ctx->emit_const = fd4_emit_const; ctx->emit_const_bo = fd4_emit_const_bo; + ctx->emit_ib = fd4_emit_ib; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c index 32b8fce1613..74716fb733f 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c @@ -217,6 +217,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, struct stage s[MAX_STAGES]; uint32_t pos_regid, posz_regid, psize_regid, color_regid[8]; uint32_t face_regid, coord_regid, zwcoord_regid; + enum a3xx_threadsize fssz; int constmode; int i, j, k; @@ -224,6 +225,8 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, setup_stages(emit, s); + fssz = (s[FS].i->max_reg >= 24) ? TWO_QUADS : FOUR_QUADS; + /* blob seems to always use constmode currently: */ constmode = 1; @@ -258,7 +261,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, OUT_RING(ring, 0x00000003); OUT_PKT0(ring, REG_A4XX_HLSQ_CONTROL_0_REG, 5); - OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) | + OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(fssz) | A4XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) | A4XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE | /* NOTE: I guess SHADERRESTART and CONSTFULLUPDATE maybe @@ -385,7 +388,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) | A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) | A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | - A4XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | + A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | COND(s[FS].v->has_samp, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE)); OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c index 0b6b9fbbe7a..c5ea86f9368 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.c +++ b/src/gallium/drivers/freedreno/freedreno_context.c @@ -141,6 +141,32 @@ fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, } } +/** + * emit marker string as payload of a no-op packet, which can be + * decoded by cffdump. 
+ */ +static void +fd_emit_string_marker(struct pipe_context *pctx, const char *string, int len) +{ + struct fd_context *ctx = fd_context(pctx); + struct fd_ringbuffer *ring = ctx->ring; + const uint32_t *buf = (const void *)string; + + OUT_PKT3(ring, CP_NOP, align(len, 4) / 4); + while (len >= 4) { + OUT_RING(ring, *buf); + buf++; + len -= 4; + } + + /* copy remainder bytes without reading past end of input string: */ + if (len > 0) { + uint32_t w = 0; + memcpy(&w, buf, len); + OUT_RING(ring, w); + } +} + void fd_context_destroy(struct pipe_context *pctx) { @@ -207,6 +233,7 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen, pctx->screen = pscreen; pctx->priv = priv; pctx->flush = fd_context_flush; + pctx->emit_string_marker = fd_emit_string_marker; for (i = 0; i < ARRAY_SIZE(ctx->rings); i++) { ctx->rings[i] = fd_ringbuffer_new(screen->pipe, 0x100000); diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h index 418b71b95de..9e7130ab915 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.h +++ b/src/gallium/drivers/freedreno/freedreno_context.h @@ -386,6 +386,10 @@ struct fd_context { const uint32_t *dwords, struct pipe_resource *prsc); void (*emit_const_bo)(struct fd_ringbuffer *ring, enum shader_t type, boolean write, uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets); + + /* indirect-branch emit: */ + void (*emit_ib)(struct fd_ringbuffer *ring, struct fd_ringmarker *start, + struct fd_ringmarker *end); }; static inline struct fd_context * diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c index 648db9baee5..0d73349057c 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.c +++ b/src/gallium/drivers/freedreno/freedreno_gmem.c @@ -331,7 +331,7 @@ render_tiles(struct fd_context *ctx) fd_hw_query_prepare_tile(ctx, i, ctx->ring); /* emit IB to drawcmds: */ - OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end); + ctx->emit_ib(ctx->ring, ctx->draw_start, ctx->draw_end); fd_reset_wfi(ctx); /* emit gmem2mem to transfer tile back to system memory: */ @@ -349,7 +349,7 @@ render_sysmem(struct fd_context *ctx) fd_hw_query_prepare_tile(ctx, 0, ctx->ring); /* emit IB to drawcmds: */ - OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end); + ctx->emit_ib(ctx->ring, ctx->draw_start, ctx->draw_end); fd_reset_wfi(ctx); } diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index a75b04b327a..640f50f5dcb 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -155,6 +155,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_USER_CONSTANT_BUFFERS: case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: case PIPE_CAP_VERTEXID_NOBASE: + case PIPE_CAP_STRING_MARKER: return 1; case PIPE_CAP_SHADER_STENCIL_EXPORT: @@ -400,9 +401,16 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return 1; case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: + /* Technically this should be the same as for TEMP/CONST, since + * everything is just normal registers. This is just temporary + * hack until load_input/store_output handle arrays in a similar + * way as load_var/store_var.. 
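The fd_emit_string_marker() hunk above packs the marker string into the payload of a CP_NOP packet, one dword per four bytes, and copies the tail with memcpy so it never reads past the end of the string. A standalone sketch of just the packing step, under the assumption that only the byte layout matters (pack_marker() is a hypothetical helper, no ring or driver types involved):

#include <stdint.h>
#include <string.h>

/* Pack 'len' bytes of 's' into 32-bit words, zero-padding the tail;
 * returns the dword count, which matches align(len, 4) / 4 above. */
static unsigned pack_marker(const char *s, int len, uint32_t *out)
{
	unsigned n = 0;
	while (len >= 4) {
		memcpy(&out[n++], s, 4);
		s += 4;
		len -= 4;
	}
	if (len > 0) {
		uint32_t w = 0;
		memcpy(&w, s, len); /* remaining 1-3 bytes */
		out[n++] = w;
	}
	return n;
}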
+ */ + return 0; case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: - return 1; + /* a2xx compiler doesn't handle indirect: */ + return is_ir3(screen) ? 1 : 0; case PIPE_SHADER_CAP_SUBROUTINES: case PIPE_SHADER_CAP_DOUBLES: case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: @@ -566,6 +574,7 @@ fd_screen_create(struct fd_device *dev) fd3_screen_init(pscreen); break; case 420: + case 430: fd4_screen_init(pscreen); break; default: diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h index 0d2418e1e00..47dd467f498 100644 --- a/src/gallium/drivers/freedreno/freedreno_util.h +++ b/src/gallium/drivers/freedreno/freedreno_util.h @@ -265,8 +265,8 @@ OUT_WFI(struct fd_ringbuffer *ring) } static inline void -OUT_IB(struct fd_ringbuffer *ring, struct fd_ringmarker *start, - struct fd_ringmarker *end) +__OUT_IB(struct fd_ringbuffer *ring, bool prefetch, + struct fd_ringmarker *start, struct fd_ringmarker *end) { uint32_t dwords = fd_ringmarker_dwords(start, end); @@ -280,7 +280,7 @@ OUT_IB(struct fd_ringbuffer *ring, struct fd_ringmarker *start, */ emit_marker(ring, 6); - OUT_PKT3(ring, CP_INDIRECT_BUFFER_PFD, 2); + OUT_PKT3(ring, prefetch ? CP_INDIRECT_BUFFER_PFE : CP_INDIRECT_BUFFER_PFD, 2); fd_ringbuffer_emit_reloc_ring(ring, start, end); OUT_RING(ring, dwords); diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c index 83ed5ffdca0..599872470fc 100644 --- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c +++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c @@ -220,7 +220,7 @@ static void print_instr_cat1(instr_t *instr) else if (cat1->off > 0) printf("%c<a0.x + %d>", type, cat1->off); else - printf("c<a0.x>"); + printf("%c<a0.x>", type); } else { print_reg_src((reg_t)(cat1->src), type_size(cat1->src_type) == 32, cat1->src_r, cat1->src_c, cat1->src_im, false, false, false); @@ -650,7 +650,7 @@ static void print_instr_cat6(instr_t *instr) /* size of largest OPC field of all the instruction categories: */ #define NOPC_BITS 6 -struct opc_info { +static const struct opc_info { uint16_t cat; uint16_t opc; const char *name; diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h index c3fb68d511c..1b1f1f0a797 100644 --- a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h +++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h @@ -261,6 +261,7 @@ typedef union PACKED { /* to make compiler happy: */ uint32_t dummy32; uint32_t dummy10 : 10; + int32_t idummy10 : 10; uint32_t dummy11 : 11; uint32_t dummy12 : 12; uint32_t dummy13 : 13; diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c index b24825cff85..7d89142d7a1 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.c +++ b/src/gallium/drivers/freedreno/ir3/ir3.c @@ -81,6 +81,7 @@ struct ir3 * ir3_create(struct ir3_compiler *compiler, shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout); list_inithead(&shader->block_list); + list_inithead(&shader->array_list); return shader; } @@ -121,18 +122,19 @@ static uint32_t reg(struct ir3_register *reg, struct ir3_info *info, val.iim_val = reg->iim_val; } else { unsigned components; + int16_t max; if (reg->flags & IR3_REG_RELATIV) { components = reg->size; - val.dummy10 = reg->offset; + val.idummy10 = reg->array.offset; + max = (reg->array.offset + repeat + components - 1) >> 2; } else { components = util_last_bit(reg->wrmask); val.comp = reg->num & 0x3; val.num = reg->num >> 2; + max = (reg->num +
repeat + components - 1) >> 2; } - int16_t max = (reg->num + repeat + components - 1) >> 2; - if (reg->flags & IR3_REG_CONST) { info->max_const = MAX2(info->max_const, max); } else if (val.num == 63) { @@ -233,7 +235,7 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr, iassert((instr->regs_count == 2) || (instr->regs_count == 3)); if (src1->flags & IR3_REG_RELATIV) { - iassert(src1->num < (1 << 10)); + iassert(src1->array.offset < (1 << 10)); cat2->rel1.src1 = reg(src1, info, instr->repeat, IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg); @@ -260,7 +262,7 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr, !((src1->flags ^ src2->flags) & IR3_REG_HALF)); if (src2->flags & IR3_REG_RELATIV) { - iassert(src2->num < (1 << 10)); + iassert(src2->array.offset < (1 << 10)); cat2->rel2.src2 = reg(src2, info, instr->repeat, IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg); @@ -333,7 +335,7 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr, iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF)); if (src1->flags & IR3_REG_RELATIV) { - iassert(src1->num < (1 << 10)); + iassert(src1->array.offset < (1 << 10)); cat3->rel1.src1 = reg(src1, info, instr->repeat, IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg); @@ -361,7 +363,7 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr, if (src3->flags & IR3_REG_RELATIV) { - iassert(src3->num < (1 << 10)); + iassert(src3->array.offset < (1 << 10)); cat3->rel2.src3 = reg(src3, info, instr->repeat, IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg); @@ -404,7 +406,7 @@ static int emit_cat4(struct ir3_instruction *instr, void *ptr, iassert(instr->regs_count == 2); if (src->flags & IR3_REG_RELATIV) { - iassert(src->num < (1 << 10)); + iassert(src->array.offset < (1 << 10)); cat4->rel.src = reg(src, info, instr->repeat, IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG | IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF); @@ -737,6 +739,14 @@ struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, return reg; } +struct ir3_register * ir3_reg_clone(struct ir3 *shader, + struct ir3_register *reg) +{ + struct ir3_register *new_reg = reg_create(shader, 0, 0); + *new_reg = *reg; + return new_reg; +} + void ir3_instr_set_address(struct ir3_instruction *instr, struct ir3_instruction *addr) @@ -777,3 +787,12 @@ ir3_count_instructions(struct ir3 *ir) } return cnt; } + +struct ir3_array * +ir3_lookup_array(struct ir3 *ir, unsigned id) +{ + list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) + if (arr->id == id) + return arr; + return NULL; +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h index 62d14a0ae37..1a109d880e6 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.h +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -83,7 +83,8 @@ struct ir3_register { * before register assignment is done: */ IR3_REG_SSA = 0x2000, /* 'instr' is ptr to assigning instr */ - IR3_REG_PHI_SRC= 0x4000, /* phi src, regs[0]->instr points to phi */ + IR3_REG_ARRAY = 0x4000, + IR3_REG_PHI_SRC= 0x8000, /* phi src, regs[0]->instr points to phi */ } flags; union { @@ -97,11 +98,18 @@ struct ir3_register { uint32_t uim_val; float fim_val; /* relative: */ - int offset; + struct { + uint16_t id; + int16_t offset; + } array; }; - /* for IR3_REG_SSA, src registers contain ptr back to - * assigning instruction. + /* For IR3_REG_SSA, src registers contain ptr back to assigning + * instruction. 
+ * + * For IR3_REG_ARRAY, the pointer is back to the last dependent + * array access (although the net effect is the same, it points + * back to a previous instruction that we depend on). */ struct ir3_instruction *instr; @@ -221,9 +229,6 @@ struct ir3_instruction { struct { int off; /* component/offset */ } fo; - struct { - int aid; - } fi; struct { /* used to temporarily hold reference to nir_phi_instr * until we resolve the phi srcs @@ -293,19 +298,6 @@ struct ir3_instruction { */ struct ir3_instruction *address; - /* in case of a instruction with relative dst instruction, we need to - * capture the dependency on the fanin for the previous values of - * the array elements. Since we don't know at compile time actually - * which array elements are written, this serves to preserve the - * unconditional write to array elements prior to the conditional - * write. - * - * TODO only cat1 can do indirect write.. we could maybe move this - * into instr->cat1.fanin (but would require the frontend to insert - * the extra mov) - */ - struct ir3_instruction *fanin; - /* Entry in ir3_block's instruction list: */ struct list_head node; @@ -379,10 +371,41 @@ struct ir3 { /* List of blocks: */ struct list_head block_list; + /* List of ir3_array's: */ + struct list_head array_list; + unsigned heap_idx; struct ir3_heap_chunk *chunk; }; +typedef struct nir_variable nir_variable; + +struct ir3_array { + struct list_head node; + unsigned length; + unsigned id; + + nir_variable *var; + + /* We track the last write and last access (read or write) to + * setup dependencies on instructions that read or write the + * array. Reads can be re-ordered wrt. other reads, but should + * not be re-ordered wrt. to writes. Writes cannot be reordered + * wrt. any other access to the array. + * + * So array reads depend on last write, and array writes depend + * on the last access. 
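The last_write/last_access fields declared just below encode exactly the ordering rules this comment describes, and they are what create_var_load()/create_var_store() update later in this series. A minimal, self-contained sketch of that bookkeeping (note_array_read()/note_array_write() and struct instr are hypothetical stand-ins, not names from the patch):

struct instr; /* stand-in for the IR instruction type */

struct array_deps {
	struct instr *last_write;  /* most recent store to the array */
	struct instr *last_access; /* most recent load or store */
};

/* reads depend only on the last write, so loads may still be
 * reordered with respect to other loads: */
static void note_array_read(struct array_deps *a, struct instr *load,
		struct instr **dep_out)
{
	*dep_out = a->last_write;
	a->last_access = load;
}

/* writes depend on the last access of either kind, so a store can
 * never move above an earlier read or write: */
static void note_array_write(struct array_deps *a, struct instr *store,
		struct instr **dep_out)
{
	*dep_out = a->last_access;
	a->last_write = a->last_access = store;
}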
+ */ + struct ir3_instruction *last_write, *last_access; + + /* extra stuff used in RA pass: */ + unsigned base; /* base vreg name */ + unsigned reg; /* base physical reg */ + uint16_t start_ip, end_ip; +}; + +struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id); + typedef struct nir_block nir_block; struct ir3_block { @@ -430,6 +453,8 @@ const char *ir3_instr_name(struct ir3_instruction *instr); struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, int num, int flags); +struct ir3_register * ir3_reg_clone(struct ir3 *shader, + struct ir3_register *reg); void ir3_instr_set_address(struct ir3_instruction *instr, struct ir3_instruction *addr); @@ -510,6 +535,9 @@ static inline bool is_same_type_mov(struct ir3_instruction *instr) if (dst->num == regid(REG_A0, 0)) return false; + if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY)) + return false; + if ((instr->category == 1) && (instr->cat1.src_type == instr->cat1.dst_type)) return true; @@ -623,8 +651,10 @@ static inline bool writes_pred(struct ir3_instruction *instr) /* TODO better name */ static inline struct ir3_instruction *ssa(struct ir3_register *reg) { - if (reg->flags & IR3_REG_SSA) + if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) { + debug_assert(!(reg->instr && (reg->instr->flags & IR3_INSTR_UNUSED))); return reg->instr; + } return NULL; } @@ -813,8 +843,6 @@ static inline unsigned ir3_cat3_absneg(opc_t opc) static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr) { - if (instr->fanin) - return instr->regs_count + 2; if (instr->address) return instr->regs_count + 1; return instr->regs_count; @@ -822,8 +850,6 @@ static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr) static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n) { - if (n == (instr->regs_count + 1)) - return instr->fanin; if (n == (instr->regs_count + 0)) return instr->address; return ssa(instr->regs[n]); @@ -834,8 +860,8 @@ static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr /* iterator for an instruction's SSA sources (instr), also returns src #: */ #define foreach_ssa_src_n(__srcinst, __n, __instr) \ if ((__instr)->regs_count) \ - for (unsigned __cnt = __ssa_src_cnt(__instr) - 1, __n = 0; __n < __cnt; __n++) \ - if ((__srcinst = __ssa_src_n(__instr, __n + 1))) + for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \ + if ((__srcinst = __ssa_src_n(__instr, __n))) /* iterator for an instruction's SSA sources (instr): */ #define foreach_ssa_src(__srcinst, __instr) \ @@ -878,7 +904,15 @@ ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type) struct ir3_instruction *instr = ir3_instr_create(block, 1, 0); ir3_reg_create(instr, 0, 0); /* dst */ - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + if (src->regs[0]->flags & IR3_REG_ARRAY) { + struct ir3_register *src_reg = + ir3_reg_create(instr, 0, IR3_REG_ARRAY); + src_reg->array = src->regs[0]->array; + src_reg->instr = src; + } else { + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + } + debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV)); instr->cat1.src_type = type; instr->cat1.dst_type = type; return instr; @@ -894,6 +928,7 @@ ir3_COV(struct ir3_block *block, struct ir3_instruction *src, ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; instr->cat1.src_type = src_type; instr->cat1.dst_type = dst_type; + debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY)); return instr; } @@ -1083,7 +1118,7 @@ typedef uint8_t regmask_t[2 * MAX_REG / 8]; static inline 
unsigned regmask_idx(struct ir3_register *reg) { - unsigned num = reg->num; + unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num; debug_assert(num < MAX_REG); if (reg->flags & IR3_REG_HALF) num += MAX_REG; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 86afda4ba08..1ea2dd9cbf7 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -46,7 +46,6 @@ struct ir3_compile { struct ir3_compiler *compiler; - const struct tgsi_token *tokens; struct nir_shader *s; struct ir3 *ir; @@ -75,8 +74,6 @@ struct ir3_compile { /* mapping from nir_register to defining instruction: */ struct hash_table *def_ht; - /* mapping from nir_variable to ir3_array: */ - struct hash_table *var_ht; unsigned num_arrays; /* a common pattern for indirect addressing is to request the @@ -143,8 +140,6 @@ compile_init(struct ir3_compiler *compiler, ctx->so = so; ctx->def_ht = _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); - ctx->var_ht = _mesa_hash_table_create(ctx, - _mesa_hash_pointer, _mesa_key_pointer_equal); ctx->block_ht = _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); @@ -221,206 +216,26 @@ compile_free(struct ir3_compile *ctx) ralloc_free(ctx); } -/* global per-array information: */ -struct ir3_array { - unsigned length, aid; -}; - -/* per-block array state: */ -struct ir3_array_value { - /* TODO drop length/aid, and just have ptr back to ir3_array */ - unsigned length, aid; - /* initial array element values are phi's, other than for the - * entry block. The phi src's get added later in a resolve step - * after we have visited all the blocks, to account for back - * edges in the cfg. - */ - struct ir3_instruction **phis; - /* current array element values (as block is processed). When - * the array phi's are resolved, it will contain the array state - * at exit of block, so successor blocks can use it to add their - * phi srcs. - */ - struct ir3_instruction *arr[]; -}; - -/* track array assignments per basic block. When an array is read - * outside of the same basic block, we can use NIR's dominance-frontier - * information to figure out where phi nodes are needed. 
- */ -struct ir3_nir_block_data { - unsigned foo; - /* indexed by array-id (aid): */ - struct ir3_array_value *arrs[]; -}; - -static struct ir3_nir_block_data * -get_block_data(struct ir3_compile *ctx, struct ir3_block *block) -{ - if (!block->data) { - struct ir3_nir_block_data *bd = ralloc_size(ctx, sizeof(*bd) + - ((ctx->num_arrays + 1) * sizeof(bd->arrs[0]))); - block->data = bd; - } - return block->data; -} - static void declare_var(struct ir3_compile *ctx, nir_variable *var) { unsigned length = glsl_get_length(var->type) * 4; /* always vec4, at least with ttn */ struct ir3_array *arr = ralloc(ctx, struct ir3_array); + arr->id = ++ctx->num_arrays; arr->length = length; - arr->aid = ++ctx->num_arrays; - _mesa_hash_table_insert(ctx->var_ht, var, arr); + arr->var = var; + list_addtail(&arr->node, &ctx->ir->array_list); } -static nir_block * -nir_block_pred(nir_block *block) -{ - assert(block->predecessors->entries < 2); - if (block->predecessors->entries == 0) - return NULL; - return (nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key; -} - -static struct ir3_array_value * +static struct ir3_array * get_var(struct ir3_compile *ctx, nir_variable *var) { - struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var); - struct ir3_block *block = ctx->block; - struct ir3_nir_block_data *bd = get_block_data(ctx, block); - struct ir3_array *arr = entry->data; - - if (!bd->arrs[arr->aid]) { - struct ir3_array_value *av = ralloc_size(bd, sizeof(*av) + - (arr->length * sizeof(av->arr[0]))); - struct ir3_array_value *defn = NULL; - nir_block *pred_block; - - av->length = arr->length; - av->aid = arr->aid; - - /* For loops, we have to consider that we have not visited some - * of the blocks who should feed into the phi (ie. back-edges in - * the cfg).. for example: - * - * loop { - * block { load_var; ... } - * if then block {} else block {} - * block { store_var; ... } - * if then block {} else block {} - * block {...} - * } - * - * We can skip the phi if we can chase the block predecessors - * until finding the block previously defining the array without - * crossing a block that has more than one predecessor. - * - * Otherwise create phi's and resolve them as a post-pass after - * all the blocks have been visited (to handle back-edges). - */ - - for (pred_block = block->nblock; - pred_block && (pred_block->predecessors->entries < 2) && !defn; - pred_block = nir_block_pred(pred_block)) { - struct ir3_block *pblock = get_block(ctx, pred_block); - struct ir3_nir_block_data *pbd = pblock->data; - if (!pbd) - continue; - defn = pbd->arrs[arr->aid]; - } - - if (defn) { - /* only one possible definer: */ - for (unsigned i = 0; i < arr->length; i++) - av->arr[i] = defn->arr[i]; - } else if (pred_block) { - /* not the first block, and multiple potential definers: */ - av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0])); - - for (unsigned i = 0; i < arr->length; i++) { - struct ir3_instruction *phi; - - phi = ir3_instr_create2(block, -1, OPC_META_PHI, - 1 + ctx->impl->num_blocks); - ir3_reg_create(phi, 0, 0); /* dst */ - - /* phi's should go at head of block: */ - list_delinit(&phi->node); - list_add(&phi->node, &block->instr_list); - - av->phis[i] = av->arr[i] = phi; - } - } else { - /* Some shaders end up reading array elements without - * first writing.. 
so initialize things to prevent null - * instr ptrs later: - */ - for (unsigned i = 0; i < arr->length; i++) - av->arr[i] = create_immed(block, 0); - } - - bd->arrs[arr->aid] = av; - } - - return bd->arrs[arr->aid]; -} - -static void -add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock, - struct ir3_array_value *av, BITSET_WORD *visited) -{ - struct ir3_block *block; - struct ir3_nir_block_data *bd; - - if (BITSET_TEST(visited, nblock->index)) - return; - - BITSET_SET(visited, nblock->index); - - block = get_block(ctx, nblock); - bd = block->data; - - if (bd && bd->arrs[av->aid]) { - struct ir3_array_value *dav = bd->arrs[av->aid]; - for (unsigned i = 0; i < av->length; i++) { - ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr = - dav->arr[i]; - } - } else { - /* didn't find defn, recurse predecessors: */ - struct set_entry *entry; - set_foreach(nblock->predecessors, entry) { - add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited); - } - } -} - -static void -resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block) -{ - struct ir3_nir_block_data *bd = block->data; - unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks); - - if (!bd) - return; - - /* TODO use nir dom_frontier to help us with this? */ - - for (unsigned i = 1; i <= ctx->num_arrays; i++) { - struct ir3_array_value *av = bd->arrs[i]; - BITSET_WORD visited[bitset_words]; - struct set_entry *entry; - - if (!(av && av->phis)) - continue; - - memset(visited, 0, sizeof(visited)); - set_foreach(block->nblock->predecessors, entry) { - add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited); - } + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + if (arr->var == var) + return arr; } + compile_error(ctx, "bogus var: %s\n", var->name); + return NULL; } /* allocate a n element value array (to be populated by caller) and @@ -438,6 +253,7 @@ __get_dst(struct ir3_compile *ctx, void *key, unsigned n) static struct ir3_instruction ** get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n) { + compile_assert(ctx, dst->is_ssa); if (dst->is_ssa) { return __get_dst(ctx, &dst->ssa, n); } else { @@ -455,6 +271,7 @@ static struct ir3_instruction ** get_src(struct ir3_compile *ctx, nir_src *src) { struct hash_entry *entry; + compile_assert(ctx, src->is_ssa); if (src->is_ssa) { entry = _mesa_hash_table_search(ctx->def_ht, src->ssa); } else { @@ -560,7 +377,7 @@ create_uniform(struct ir3_compile *ctx, unsigned n) } static struct ir3_instruction * -create_uniform_indirect(struct ir3_compile *ctx, unsigned n, +create_uniform_indirect(struct ir3_compile *ctx, int n, struct ir3_instruction *address) { struct ir3_instruction *mov; @@ -569,7 +386,7 @@ create_uniform_indirect(struct ir3_compile *ctx, unsigned n, mov->cat1.src_type = TYPE_U32; mov->cat1.dst_type = TYPE_U32; ir3_reg_create(mov, 0, 0); - ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV); + ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n; ir3_instr_set_address(mov, address); @@ -594,7 +411,7 @@ create_collect(struct ir3_block *block, struct ir3_instruction **arr, } static struct ir3_instruction * -create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n, +create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, int n, struct ir3_instruction *address, struct ir3_instruction *collect) { struct ir3_block *block = ctx->block; @@ -608,17 +425,45 @@ create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n, src = ir3_reg_create(mov, 0, IR3_REG_SSA | 
IR3_REG_RELATIV); src->instr = collect; src->size = arrsz; - src->offset = n; + src->array.offset = n; ir3_instr_set_address(mov, address); return mov; } +/* relative (indirect) if address!=NULL */ static struct ir3_instruction * -create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n, - struct ir3_instruction *src, struct ir3_instruction *address, - struct ir3_instruction *collect) +create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, int n, + struct ir3_instruction *address) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *mov; + struct ir3_register *src; + + mov = ir3_instr_create(block, 1, 0); + mov->cat1.src_type = TYPE_U32; + mov->cat1.dst_type = TYPE_U32; + ir3_reg_create(mov, 0, 0); + src = ir3_reg_create(mov, 0, IR3_REG_ARRAY | + COND(address, IR3_REG_RELATIV)); + src->instr = arr->last_write; + src->size = arr->length; + src->array.id = arr->id; + src->array.offset = n; + + if (address) + ir3_instr_set_address(mov, address); + + arr->last_access = mov; + + return mov; +} + +/* relative (indirect) if address!=NULL */ +static struct ir3_instruction * +create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, int n, + struct ir3_instruction *src, struct ir3_instruction *address) { struct ir3_block *block = ctx->block; struct ir3_instruction *mov; @@ -627,14 +472,18 @@ create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n, mov = ir3_instr_create(block, 1, 0); mov->cat1.src_type = TYPE_U32; mov->cat1.dst_type = TYPE_U32; - dst = ir3_reg_create(mov, 0, IR3_REG_RELATIV); - dst->size = arrsz; - dst->offset = n; + dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY | + COND(address, IR3_REG_RELATIV)); + dst->instr = arr->last_access; + dst->size = arr->length; + dst->array.id = arr->id; + dst->array.offset = n; ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src; - mov->fanin = collect; ir3_instr_set_address(mov, address); + arr->last_write = arr->last_access = mov; + return mov; } @@ -1151,7 +1000,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr, nir_const_value *const_offset; /* UBO addresses are the first driver params: */ unsigned ubo = regid(ctx->so->first_driver_param + IR3_UBOS_OFF, 0); - unsigned off = intr->const_index[0]; + int off = intr->const_index[0]; /* First src is ubo index, which could either be an immed or not: */ src0 = get_src(ctx, &intr->src[0])[0]; @@ -1199,7 +1048,7 @@ emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, { nir_deref_var *dvar = intr->variables[0]; nir_deref_array *darr = nir_deref_as_array(dvar->deref.child); - struct ir3_array_value *arr = get_var(ctx, dvar->var); + struct ir3_array *arr = get_var(ctx, dvar->var); compile_assert(ctx, dvar->deref.child && (dvar->deref.child->deref_type == nir_deref_type_array)); @@ -1210,19 +1059,17 @@ emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, for (int i = 0; i < intr->num_components; i++) { unsigned n = darr->base_offset * 4 + i; compile_assert(ctx, n < arr->length); - dst[i] = arr->arr[n]; + dst[i] = create_var_load(ctx, arr, n, NULL); } break; case nir_deref_array_type_indirect: { /* for indirect, we need to collect all the array elements: */ - struct ir3_instruction *collect = - create_collect(ctx->block, arr->arr, arr->length); struct ir3_instruction *addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]); for (int i = 0; i < intr->num_components; i++) { unsigned n = darr->base_offset * 4 + i; compile_assert(ctx, n < arr->length); - dst[i] = 
create_indirect_load(ctx, arr->length, n, addr, collect); + dst[i] = create_var_load(ctx, arr, n, addr); } break; } @@ -1239,8 +1086,9 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) { nir_deref_var *dvar = intr->variables[0]; nir_deref_array *darr = nir_deref_as_array(dvar->deref.child); - struct ir3_array_value *arr = get_var(ctx, dvar->var); - struct ir3_instruction **src; + struct ir3_array *arr = get_var(ctx, dvar->var); + struct ir3_instruction *addr, **src; + unsigned wrmask = intr->const_index[0]; compile_assert(ctx, dvar->deref.child && (dvar->deref.child->deref_type == nir_deref_type_array)); @@ -1249,66 +1097,24 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) switch (darr->deref_array_type) { case nir_deref_array_type_direct: - /* direct access does not require anything special: */ - for (int i = 0; i < intr->num_components; i++) { - /* ttn doesn't generate partial writemasks */ - assert(intr->const_index[0] == - (1 << intr->num_components) - 1); - - unsigned n = darr->base_offset * 4 + i; - compile_assert(ctx, n < arr->length); - arr->arr[n] = src[i]; - } + addr = NULL; break; - case nir_deref_array_type_indirect: { - /* for indirect, create indirect-store and fan that out: */ - struct ir3_instruction *collect = - create_collect(ctx->block, arr->arr, arr->length); - struct ir3_instruction *addr = - get_addr(ctx, get_src(ctx, &darr->indirect)[0]); - for (int i = 0; i < intr->num_components; i++) { - /* ttn doesn't generate partial writemasks */ - assert(intr->const_index[0] == - (1 << intr->num_components) - 1); - - struct ir3_instruction *store; - unsigned n = darr->base_offset * 4 + i; - compile_assert(ctx, n < arr->length); - - store = create_indirect_store(ctx, arr->length, - n, src[i], addr, collect); - - store->fanin->fi.aid = arr->aid; - - /* TODO: probably split this out to be used for - * store_output_indirect? or move this into - * create_indirect_store()? - */ - for (int j = i; j < arr->length; j += intr->num_components) { - struct ir3_instruction *split; - - split = ir3_instr_create(ctx->block, -1, OPC_META_FO); - split->fo.off = j; - ir3_reg_create(split, 0, 0); - ir3_reg_create(split, 0, IR3_REG_SSA)->instr = store; - - arr->arr[j] = split; - } - } - /* fixup fanout/split neighbors: */ - for (int i = 0; i < arr->length; i++) { - arr->arr[i]->cp.right = (i < (arr->length - 1)) ? - arr->arr[i+1] : NULL; - arr->arr[i]->cp.left = (i > 0) ? 
- arr->arr[i-1] : NULL; - } + case nir_deref_array_type_indirect: + addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]); break; - } default: compile_error(ctx, "Unhandled store deref type: %u\n", darr->deref_array_type); break; } + + for (int i = 0; i < intr->num_components; i++) { + if (!(wrmask & (1 << i))) + continue; + unsigned n = darr->base_offset * 4 + i; + compile_assert(ctx, n < arr->length); + create_var_store(ctx, arr, n, src[i], addr); + } } static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot, @@ -1335,7 +1141,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic]; struct ir3_instruction **dst, **src; struct ir3_block *b = ctx->block; - unsigned idx = intr->const_index[0]; + int idx = intr->const_index[0]; nir_const_value *const_offset; if (info->has_dest) { @@ -1356,7 +1162,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) } else { src = get_src(ctx, &intr->src[0]); for (int i = 0; i < intr->num_components; i++) { - unsigned n = idx * 4 + i; + int n = idx * 4 + i; dst[i] = create_uniform_indirect(ctx, n, get_addr(ctx, src[0])); } @@ -1836,8 +1642,6 @@ resolve_phis(struct ir3_compile *ctx, struct ir3_block *block) ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; } } - - resolve_array_phis(ctx, block); } static void diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c index be4e4e81109..1cc211a7663 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -41,16 +41,22 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags) struct ir3_register *dst = instr->regs[0]; struct ir3_register *src = instr->regs[1]; struct ir3_instruction *src_instr = ssa(src); + + /* only if mov src is SSA (not const/immed): */ + if (!src_instr) + return false; + + /* no indirect: */ if (dst->flags & IR3_REG_RELATIV) return false; if (src->flags & IR3_REG_RELATIV) return false; + if (!allow_flags) if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG | IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT)) return false; - if (!src_instr) - return false; + /* TODO: remove this hack: */ if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO)) return false; @@ -82,10 +88,17 @@ static bool valid_flags(struct ir3_instruction *instr, unsigned n, unsigned valid_flags; flags = cp_flags(flags); + /* If destination is indirect, then source cannot be.. at least + * I don't think so.. + */ + if ((instr->regs[0]->flags & IR3_REG_RELATIV) && + (flags & IR3_REG_RELATIV)) + return false; + /* clear flags that are 'ok' */ switch (instr->category) { case 1: - valid_flags = IR3_REG_IMMED | IR3_REG_RELATIV; + valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV; if (flags & ~valid_flags) return false; break; @@ -183,9 +196,14 @@ static void combine_flags(unsigned *dstflags, unsigned srcflags) *dstflags ^= IR3_REG_SNEG; if (srcflags & IR3_REG_BNOT) *dstflags ^= IR3_REG_BNOT; -} -static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, unsigned *flags); + *dstflags &= ~IR3_REG_SSA; + *dstflags |= srcflags & IR3_REG_SSA; + *dstflags |= srcflags & IR3_REG_CONST; + *dstflags |= srcflags & IR3_REG_IMMED; + *dstflags |= srcflags & IR3_REG_RELATIV; + *dstflags |= srcflags & IR3_REG_ARRAY; +} /* the "plain" MAD's (ie. 
the ones that don't shift first src prior to * multiply) can swap their first two srcs if src[0] is !CONST and @@ -206,52 +224,35 @@ static bool is_valid_mad(struct ir3_instruction *instr) static void reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) { - unsigned src_flags = 0, new_flags; - struct ir3_instruction *src_instr; + struct ir3_instruction *src = ssa(reg); - if (is_meta(instr)) { - /* meta instructions cannot fold up register - * flags.. they are usually src for texture - * fetch, etc, where we cannot specify abs/neg - */ - reg->instr = instr_cp(reg->instr, NULL); - return; - } + if (is_eligible_mov(src, true)) { + /* simple case, no immed/const/relativ, only mov's w/ ssa src: */ + struct ir3_register *src_reg = src->regs[1]; + unsigned new_flags = reg->flags; - src_instr = instr_cp(reg->instr, &src_flags); + combine_flags(&new_flags, src_reg->flags); - new_flags = reg->flags; - combine_flags(&new_flags, src_flags); - - reg->flags = new_flags; - reg->instr = src_instr; - - if (!valid_flags(instr, n, reg->flags)) { - /* insert an absneg.f */ - if (reg->flags & (IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT)) { - debug_assert(!(reg->flags & (IR3_REG_FNEG | IR3_REG_FABS))); - reg->instr = ir3_ABSNEG_S(instr->block, - reg->instr, cp_flags(src_flags)); - } else { - debug_assert(!(reg->flags & (IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))); - reg->instr = ir3_ABSNEG_F(instr->block, - reg->instr, cp_flags(src_flags)); + if (valid_flags(instr, n, new_flags)) { + if (new_flags & IR3_REG_ARRAY) { + debug_assert(!(reg->flags & IR3_REG_ARRAY)); + reg->array = src_reg->array; + } + reg->flags = new_flags; + reg->instr = ssa(src_reg); } - reg->flags &= ~cp_flags(src_flags); - debug_assert(valid_flags(instr, n, reg->flags)); - /* send it through instr_cp() again since - * the absneg src might be a mov from const - * that could be cleaned up: - */ - reg->instr = instr_cp(reg->instr, NULL); - return; - } - if (is_same_type_mov(reg->instr)) { - struct ir3_register *src_reg = reg->instr->regs[1]; - unsigned new_flags = src_reg->flags; + src = ssa(reg); /* could be null for IR3_REG_ARRAY case */ + if (!src) + return; + } else if (is_same_type_mov(src) && + /* cannot collapse const/immed/etc into meta instrs: */ + !is_meta(instr)) { + /* immed/const/etc cases, which require some special handling: */ + struct ir3_register *src_reg = src->regs[1]; + unsigned new_flags = reg->flags; - combine_flags(&new_flags, reg->flags); + combine_flags(&new_flags, src_reg->flags); if (!valid_flags(instr, n, new_flags)) { /* special case for "normal" mad instructions, we can @@ -287,6 +288,16 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) conflicts(instr->address, reg->instr->address)) return; + /* This seems to be a hw bug, or something where the timings + * just somehow don't work out. This restriction may only + * apply if the first src is also CONST. 
+ */ + if ((instr->category == 3) && (n == 2) && + (src_reg->flags & IR3_REG_RELATIV) && + (src_reg->array.offset == 0)) + return; + + src_reg = ir3_reg_clone(instr->block->shader, src_reg); src_reg->flags = new_flags; instr->regs[n+1] = src_reg; @@ -298,6 +309,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) if ((src_reg->flags & IR3_REG_RELATIV) && !conflicts(instr->address, reg->instr->address)) { + src_reg = ir3_reg_clone(instr->block->shader, src_reg); src_reg->flags = new_flags; instr->regs[n+1] = src_reg; ir3_instr_set_address(instr, reg->instr->address); @@ -330,8 +342,10 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) if (new_flags & IR3_REG_BNOT) iim_val = ~iim_val; - if (!(iim_val & ~0x3ff)) { + /* other than category 1 (mov) we can only encode up to 10 bits: */ + if ((instr->category == 1) || !(iim_val & ~0x3ff)) { new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT); + src_reg = ir3_reg_clone(instr->block->shader, src_reg); src_reg->flags = new_flags; src_reg->iim_val = iim_val; instr->regs[n+1] = src_reg; @@ -342,56 +356,68 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) } } -/** - * Given an SSA src (instruction), return the one with extraneous - * mov's removed, ie, for (to copy NIR syntax): - * - * vec1 ssa1 = fadd , - * vec1 ssa2 = fabs ssa1 - * vec1 ssa3 = fneg ssa1 - * - * then calling instr_cp(ssa3, &flags) would return ssa1 with - * (IR3_REG_ABS | IR3_REG_NEGATE) in flags. If flags is NULL, - * then disallow eliminating copies which would require flag - * propagation (for example, we cannot propagate abs/neg into - * an output). +/* Handle special case of eliminating output mov, and similar cases where + * there isn't a normal "consuming" instruction. In this case we cannot + * collapse flags (ie. output mov from const, or w/ abs/neg flags, cannot + * be eliminated) */ static struct ir3_instruction * -instr_cp(struct ir3_instruction *instr, unsigned *flags) +eliminate_output_mov(struct ir3_instruction *instr) +{ + if (is_eligible_mov(instr, false)) { + struct ir3_register *reg = instr->regs[1]; + if (!(reg->flags & IR3_REG_ARRAY)) { + struct ir3_instruction *src_instr = ssa(reg); + debug_assert(src_instr); + return src_instr; + } + } + return instr; +} + +/** + * Find instruction src's which are mov's that can be collapsed, replacing + * the mov dst with the mov src + */ +static void +instr_cp(struct ir3_instruction *instr) { struct ir3_register *reg; - if (is_eligible_mov(instr, !!flags)) { - struct ir3_register *reg = instr->regs[1]; - struct ir3_instruction *src_instr = ssa(reg); - if (flags) - combine_flags(flags, reg->flags); - return instr_cp(src_instr, flags); - } + if (instr->regs_count == 0) + return; - /* Check termination condition before walking children (rather - * than before checking eligible-mov). A mov instruction may - * appear as ssa-src for multiple other instructions, and we - * want to consider it for removal for each, rather than just - * the first one. (But regardless of how many places it shows - * up as a src, we only need to recursively walk the children - * once.) - */ if (ir3_instr_check_mark(instr)) - return instr; + return; /* walk down the graph from each src: */ foreach_src_n(reg, n, instr) { - if (!(reg->flags & IR3_REG_SSA)) + struct ir3_instruction *src = ssa(reg); + + if (!src) + continue; + + instr_cp(src); + + /* TODO non-indirect access we could figure out which register + * we actually want and allow cp.. 
+ */ + if (reg->flags & IR3_REG_ARRAY) continue; reg_cp(instr, reg, n); } - if (instr->address) - ir3_instr_set_address(instr, instr_cp(instr->address, NULL)); + if (instr->regs[0]->flags & IR3_REG_ARRAY) { + struct ir3_instruction *src = ssa(instr->regs[0]); + if (src) + instr_cp(src); + } - return instr; + if (instr->address) { + instr_cp(instr->address); + ir3_instr_set_address(instr, eliminate_output_mov(instr->address)); + } } void @@ -401,19 +427,20 @@ ir3_cp(struct ir3 *ir) for (unsigned i = 0; i < ir->noutputs; i++) { if (ir->outputs[i]) { - struct ir3_instruction *out = - instr_cp(ir->outputs[i], NULL); - - ir->outputs[i] = out; + instr_cp(ir->outputs[i]); + ir->outputs[i] = eliminate_output_mov(ir->outputs[i]); } } for (unsigned i = 0; i < ir->keeps_count; i++) { - ir->keeps[i] = instr_cp(ir->keeps[i], NULL); + instr_cp(ir->keeps[i]); + ir->keeps[i] = eliminate_output_mov(ir->keeps[i]); } list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - if (block->condition) - block->condition = instr_cp(block->condition, NULL); + if (block->condition) { + instr_cp(block->condition); + block->condition = eliminate_output_mov(block->condition); + } } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c index 4bbc0458790..6d294f1a48c 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c @@ -76,7 +76,7 @@ int ir3_delayslots(struct ir3_instruction *assigner, return 6; } else if ((consumer->category == 3) && (is_mad(consumer->opc) || is_madsh(consumer->opc)) && - (n == 2)) { + (n == 3)) { /* special case, 3rd src to cat3 not required on first cycle */ return 1; } else { @@ -118,6 +118,10 @@ ir3_instr_depth(struct ir3_instruction *instr) /* visit child to compute it's depth: */ ir3_instr_depth(src); + /* for array writes, no need to delay on previous write: */ + if (i == 0) + continue; + sd = ir3_delayslots(src, instr, i) + src->depth; instr->depth = MAX2(instr->depth, sd); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c index a84e7989cf8..ba0c4a57aa3 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_print.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c @@ -94,7 +94,7 @@ static void print_instr_name(struct ir3_instruction *instr) } } -static void print_reg_name(struct ir3_register *reg, bool followssa) +static void print_reg_name(struct ir3_register *reg) { if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) && (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))) @@ -106,20 +106,29 @@ static void print_reg_name(struct ir3_register *reg, bool followssa) if (reg->flags & IR3_REG_IMMED) { printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); - } else if (reg->flags & IR3_REG_SSA) { - printf("_"); - if (followssa) { - printf("["); + } else if (reg->flags & IR3_REG_ARRAY) { + printf("arr[id=%u, offset=%d, size=%u", reg->array.id, + reg->array.offset, reg->size); + /* for ARRAY we could have null src, for example first write + * instruction.. 
+ */ + if (reg->instr) { + printf(", _["); print_instr_name(reg->instr); printf("]"); } + printf("]"); + } else if (reg->flags & IR3_REG_SSA) { + printf("_["); + print_instr_name(reg->instr); + printf("]"); } else if (reg->flags & IR3_REG_RELATIV) { if (reg->flags & IR3_REG_HALF) printf("h"); if (reg->flags & IR3_REG_CONST) - printf("c", reg->num); + printf("c", reg->array.offset); else - printf("\x1b[0;31mr\x1b[0m (%u)", reg->num, reg->size); + printf("\x1b[0;31mr\x1b[0m (%u)", reg->array.offset, reg->size); } else { if (reg->flags & IR3_REG_HALF) printf("h"); @@ -158,7 +167,7 @@ print_instr(struct ir3_instruction *instr, int lvl) for (i = 0; i < instr->regs_count; i++) { struct ir3_register *reg = instr->regs[i]; printf(i ? ", " : " "); - print_reg_name(reg, !!i); + print_reg_name(reg); } if (instr->address) { @@ -168,13 +177,6 @@ print_instr(struct ir3_instruction *instr, int lvl) printf("]"); } - if (instr->fanin) { - printf(", fanin=_"); - printf("["); - print_instr_name(instr->fanin); - printf("]"); - } - if (instr->cp.left) { printf(", left=_"); printf("["); @@ -192,8 +194,6 @@ print_instr(struct ir3_instruction *instr, int lvl) if (is_meta(instr)) { if (instr->opc == OPC_META_FO) { printf(", off=%d", instr->fo.off); - } else if ((instr->opc == OPC_META_FI) && instr->fi.aid) { - printf(", aid=%d", instr->fi.aid); } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c index 74755eb3bc0..2ed78818e61 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c @@ -68,25 +68,24 @@ * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after * register assignment. But for us that is horrible from a scheduling * standpoint. Instead what we do is use idea of 'definer' instruction. - * Ie. the first instruction (lowest ip) to write to the array is the + * Ie. the first instruction (lowest ip) to write to the variable is the * one we consider from use/def perspective when building interference - * graph. (Other instructions which write other array elements just - * define the variable some more.) + * graph. (Other instructions which write other variable components + * just define the variable some more.) + * + * Arrays of arbitrary size are handled via pre-coloring a consecutive + * sequence of registers. Additional scalar (single component) reg + * names are allocated starting at ctx->class_base[total_class_count] + * (see arr->base), which are pre-colored. In the use/def graph direct + * access is treated as a single element use/def, and indirect access + * is treated as use or def of all array elements. (Only the first + * def is tracked, in case of multiple indirect writes, etc.) */ static const unsigned class_sizes[] = { 1, 2, 3, 4, 4 + 4, /* txd + 1d/2d */ 4 + 6, /* txd + 3d */ - /* temporary: until we can assign arrays, create classes so we - * can round up array to fit. 
NOTE with tgsi arrays should - * really all be multiples of four: - */ - 4 * 4, - 4 * 8, - 4 * 16, - 4 * 32, - }; #define class_count ARRAY_SIZE(class_sizes) @@ -265,13 +264,21 @@ struct ir3_ra_ctx { struct ir3_ra_reg_set *set; struct ra_graph *g; unsigned alloc_count; - unsigned class_alloc_count[total_class_count]; - unsigned class_base[total_class_count]; + /* one per class, plus one slot for arrays: */ + unsigned class_alloc_count[total_class_count + 1]; + unsigned class_base[total_class_count + 1]; unsigned instr_cnt; unsigned *def, *use; /* def/use table */ struct ir3_ra_instr_data *instrd; }; +/* does it conflict? */ +static inline bool +intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end) +{ + return !((a_start >= b_end) || (b_start >= a_end)); +} + static bool is_half(struct ir3_instruction *instr) { @@ -329,9 +336,6 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; struct ir3_instruction *d = NULL; - if (instr->fanin) - return get_definer(ctx, instr->fanin, sz, off); - if (id->defn) { *sz = id->sz; *off = id->off; @@ -485,10 +489,13 @@ ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block) /* couple special cases: */ if (writes_addr(instr) || writes_pred(instr)) { id->cls = -1; - continue; + } else if (instr->regs[0]->flags & IR3_REG_ARRAY) { + id->cls = total_class_count; + id->defn = instr; + } else { + id->defn = get_definer(ctx, instr, &id->sz, &id->off); + id->cls = size_to_class(id->sz, is_half(id->defn)); } - id->defn = get_definer(ctx, instr, &id->sz, &id->off); - id->cls = size_to_class(id->sz, is_half(id->defn)); } } @@ -518,8 +525,6 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) /* arrays which don't fit in one of the pre-defined class * sizes are pre-colored: - * - * TODO but we still need to allocate names for them, don't we?? */ if (id->cls >= 0) { instr->name = ctx->class_alloc_count[id->cls]++; @@ -531,7 +536,7 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) static void ra_init(struct ir3_ra_ctx *ctx) { - unsigned n; + unsigned n, base; ir3_clear_mark(ctx->ir); n = ir3_count_instructions(ctx->ir); @@ -550,11 +555,20 @@ ra_init(struct ir3_ra_ctx *ctx) * actual ra name is class_base[cls] + instr->name; */ ctx->class_base[0] = 0; - for (unsigned i = 1; i < total_class_count; i++) { + for (unsigned i = 1; i <= total_class_count; i++) { ctx->class_base[i] = ctx->class_base[i-1] + ctx->class_alloc_count[i-1]; } + /* and vreg names for array elements: */ + base = ctx->class_base[total_class_count]; + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + arr->base = base; + ctx->class_alloc_count[total_class_count] += arr->length; + base += arr->length; + } + ctx->alloc_count += ctx->class_alloc_count[total_class_count]; + ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count); ralloc_steal(ctx->g, ctx->instrd); ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); @@ -562,15 +576,23 @@ ra_init(struct ir3_ra_ctx *ctx) } static unsigned -ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn) +__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn) { unsigned name; debug_assert(cls >= 0); + debug_assert(cls < total_class_count); /* we shouldn't get arrays here.. 
*/ name = ctx->class_base[cls] + defn->name; debug_assert(name < ctx->alloc_count); return name; } +static int +ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id) +{ + /* TODO handle name mapping for arrays */ + return __ra_name(ctx, id->cls, id->defn); +} + static void ra_destroy(struct ir3_ra_ctx *ctx) { @@ -583,6 +605,22 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) struct ir3_ra_block_data *bd; unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + void def(unsigned name, struct ir3_instruction *instr) + { + /* defined on first write: */ + if (!ctx->def[name]) + ctx->def[name] = instr->ip; + ctx->use[name] = instr->ip; + BITSET_SET(bd->def, name); + } + + void use(unsigned name, struct ir3_instruction *instr) + { + ctx->use[name] = MAX2(ctx->use[name], instr->ip); + if (!BITSET_TEST(bd->def, name)) + BITSET_SET(bd->use, name); + } + bd = rzalloc(ctx->g, struct ir3_ra_block_data); bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words); @@ -594,6 +632,7 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { struct ir3_instruction *src; + struct ir3_register *reg; if (instr->regs_count == 0) continue; @@ -625,61 +664,101 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) if (writes_gpr(instr)) { struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + struct ir3_register *dst = instr->regs[0]; - if (id->defn == instr) { - /* arrays which don't fit in one of the pre-defined class - * sizes are pre-colored: + if (dst->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, dst->array.id); + unsigned i; + + debug_assert(!(dst->flags & IR3_REG_PHI_SRC)); + + arr->start_ip = MIN2(arr->start_ip, instr->ip); + arr->end_ip = MAX2(arr->end_ip, instr->ip); + + /* set the node class now.. in case we don't encounter + * this array dst again. 
From register_alloc algo's + * perspective, these are all single/scalar regs: */ - if (id->cls >= 0) { - unsigned name = ra_name(ctx, id->cls, id->defn); + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + ra_set_node_class(ctx->g, name, ctx->set->classes[0]); + } - ctx->def[name] = id->defn->ip; - ctx->use[name] = id->defn->ip; - - /* since we are in SSA at this point: */ - debug_assert(!BITSET_TEST(bd->use, name)); - - BITSET_SET(bd->def, name); - - if (is_half(id->defn)) { - ra_set_node_class(ctx->g, name, - ctx->set->half_classes[id->cls - class_count]); - } else { - ra_set_node_class(ctx->g, name, - ctx->set->classes[id->cls]); + /* indirect write is treated like a write to all array + * elements, since we don't know which one is actually + * written: + */ + if (dst->flags & IR3_REG_RELATIV) { + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + def(name, instr); } + } else { + unsigned name = arr->base + dst->array.offset; + def(name, instr); + } - /* extend the live range for phi srcs, which may come - * from the bottom of the loop - */ - if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) { - struct ir3_instruction *phi = id->defn->regs[0]->instr; - foreach_ssa_src(src, phi) { - /* if src is after phi, then we need to extend - * the liverange to the end of src's block: - */ - if (src->ip > phi->ip) { - struct ir3_instruction *last = + } else if (id->defn == instr) { + unsigned name = ra_name(ctx, id); + + /* since we are in SSA at this point: */ + debug_assert(!BITSET_TEST(bd->use, name)); + + def(name, id->defn); + + if (is_half(id->defn)) { + ra_set_node_class(ctx->g, name, + ctx->set->half_classes[id->cls - class_count]); + } else { + ra_set_node_class(ctx->g, name, + ctx->set->classes[id->cls]); + } + + /* extend the live range for phi srcs, which may come + * from the bottom of the loop + */ + if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) { + struct ir3_instruction *phi = id->defn->regs[0]->instr; + foreach_ssa_src(src, phi) { + /* if src is after phi, then we need to extend + * the liverange to the end of src's block: + */ + if (src->ip > phi->ip) { + struct ir3_instruction *last = list_last_entry(&src->block->instr_list, - struct ir3_instruction, node); - ctx->use[name] = MAX2(ctx->use[name], last->ip); - } + struct ir3_instruction, node); + ctx->use[name] = MAX2(ctx->use[name], last->ip); } } } } } - foreach_ssa_src(src, instr) { - if (writes_gpr(src)) { - struct ir3_ra_instr_data *id = &ctx->instrd[src->ip]; - - if (id->cls >= 0) { - unsigned name = ra_name(ctx, id->cls, id->defn); - ctx->use[name] = MAX2(ctx->use[name], instr->ip); - if (!BITSET_TEST(bd->def, name)) - BITSET_SET(bd->use, name); + foreach_src(reg, instr) { + if (reg->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, reg->array.id); + arr->start_ip = MIN2(arr->start_ip, instr->ip); + arr->end_ip = MAX2(arr->end_ip, instr->ip); + /* indirect read is treated like a read from all array + * elements, since we don't know which one is actually + * read: + */ + if (reg->flags & IR3_REG_RELATIV) { + unsigned i; + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + use(name, instr); + } + } else { + unsigned name = arr->base + reg->array.offset; + use(name, instr); + debug_assert(reg->array.offset < arr->length); } + } else if ((src = ssa(reg)) && writes_gpr(src)) { + unsigned name = ra_name(ctx, &ctx->instrd[src->ip]); + use(name, instr); } } } @@ -735,6 +814,12 @@ ra_add_interference(struct ir3_ra_ctx *ctx) { struct ir3 *ir = 
ctx->ir; + /* initialize array live ranges: */ + list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) { + arr->start_ip = ~0; + arr->end_ip = 0; + } + /* compute live ranges (use/def) on a block level, also updating * block's def/use bitmasks (used below to calculate per-block * livein/liveout): @@ -767,18 +852,14 @@ ra_add_interference(struct ir3_ra_ctx *ctx) /* need to fix things up to keep outputs live: */ for (unsigned i = 0; i < ir->noutputs; i++) { struct ir3_instruction *instr = ir->outputs[i]; - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - - if (id->cls >= 0) { - unsigned name = ra_name(ctx, id->cls, id->defn); - ctx->use[name] = ctx->instr_cnt; - } + unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]); + ctx->use[name] = ctx->instr_cnt; } for (unsigned i = 0; i < ctx->alloc_count; i++) { for (unsigned j = 0; j < ctx->alloc_count; j++) { - if (!((ctx->def[i] >= ctx->use[j]) || - (ctx->def[j] >= ctx->use[i]))) { + if (intersects(ctx->def[i], ctx->use[i], + ctx->def[j], ctx->use[j])) { ra_add_node_interference(ctx->g, i, j); } } @@ -836,19 +917,36 @@ static void fixup_half_instr_src(struct ir3_instruction *instr) } } +/* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first + * array access(es) which do not have any previous access to depend + * on from scheduling point of view + */ static void reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, struct ir3_instruction *instr) { - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + struct ir3_ra_instr_data *id; - if (id->cls >= 0) { - unsigned name = ra_name(ctx, id->cls, id->defn); + if (reg->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, reg->array.id); + unsigned name = arr->base + reg->array.offset; + unsigned r = ra_get_node_reg(ctx->g, name); + unsigned num = ctx->set->ra_reg_to_gpr[r]; + + if (reg->flags & IR3_REG_RELATIV) { + reg->array.offset = num; + } else { + reg->num = num; + } + + reg->flags &= ~IR3_REG_ARRAY; + } else if ((id = &ctx->instrd[instr->ip]) && id->defn) { + unsigned name = ra_name(ctx, id); unsigned r = ra_get_node_reg(ctx->g, name); unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off; - if (reg->flags & IR3_REG_RELATIV) - num += reg->offset; + debug_assert(!(reg->flags & IR3_REG_RELATIV)); reg->num = num; reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC); @@ -875,9 +973,9 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) foreach_src_n(reg, n, instr) { struct ir3_instruction *src = reg->instr; - if (!src) + /* Note: reg->instr could be null for IR3_REG_ARRAY */ + if (!(src || (reg->flags & IR3_REG_ARRAY))) continue; - reg_assign(ctx, instr->regs[n+1], src); if (instr->regs[n+1]->flags & IR3_REG_HALF) fixup_half_instr_src(instr); @@ -888,6 +986,8 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) static int ra_alloc(struct ir3_ra_ctx *ctx) { + unsigned n = 0; + /* frag shader inputs get pre-assigned, since we have some * constraints/unknowns about setup for some of these regs: */ @@ -897,7 +997,7 @@ ra_alloc(struct ir3_ra_ctx *ctx) if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) { struct ir3_instruction *instr = ir->inputs[i]; int cls = size_to_class(1, true); - unsigned name = ra_name(ctx, cls, instr); + unsigned name = __ra_name(ctx, cls, instr); unsigned reg = ctx->set->gpr_to_ra_reg[cls][0]; /* if we have frag_face, it gets hr0.x */ @@ -905,7 +1005,8 @@ ra_alloc(struct ir3_ra_ctx *ctx) i += 4; } - for (j = 0; i < ir->ninputs; i++) { + j = 0; + for (; i < ir->ninputs; i++) { struct 
ir3_instruction *instr = ir->inputs[i]; if (instr) { struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; @@ -913,7 +1014,7 @@ ra_alloc(struct ir3_ra_ctx *ctx) if (id->defn == instr) { unsigned name, reg; - name = ra_name(ctx, id->cls, id->defn); + name = ra_name(ctx, id); reg = ctx->set->gpr_to_ra_reg[id->cls][j]; ra_set_node_reg(ctx->g, name, reg); @@ -921,6 +1022,46 @@ ra_alloc(struct ir3_ra_ctx *ctx) } } } + n = j; + } + + /* pre-assign array elements: + */ + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + unsigned base = n; + + if (arr->end_ip == 0) + continue; + + /* figure out what else we conflict with which has already + * been assigned: + */ +retry: + list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) { + if (arr2 == arr) + break; + if (arr2->end_ip == 0) + continue; + /* if it intersects with liverange AND register range.. */ + if (intersects(arr->start_ip, arr->end_ip, + arr2->start_ip, arr2->end_ip) && + intersects(base, base + arr->length, + arr2->reg, arr2->reg + arr2->length)) { + base = MAX2(base, arr2->reg + arr2->length); + goto retry; + } + } + + arr->reg = base; + + for (unsigned i = 0; i < arr->length; i++) { + unsigned name, reg; + + name = arr->base + i; + reg = ctx->set->gpr_to_ra_reg[0][base++]; + + ra_set_node_reg(ctx->g, name, reg); + } } if (!ra_allocate(ctx->g)) diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c index 6aaa16edbfe..8f640febc5d 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -187,6 +187,9 @@ delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) foreach_ssa_src_n(src, i, instr) { unsigned d; + /* for array writes, no need to delay on previous write: */ + if (i == 0) + continue; if (src->block != instr->block) continue; d = delay_calc_srcn(ctx, src, instr, i); diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index ede5558a11e..6b0ab587001 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -261,6 +261,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: diff --git a/src/gallium/drivers/i915/i915_state_derived.c b/src/gallium/drivers/i915/i915_state_derived.c index bd0f448f645..177b8545985 100644 --- a/src/gallium/drivers/i915/i915_state_derived.c +++ b/src/gallium/drivers/i915/i915_state_derived.c @@ -184,7 +184,7 @@ static void calculate_vertex_layout(struct i915_context *i915) struct i915_tracked_state i915_update_vertex_layout = { "vertex_layout", calculate_vertex_layout, - I915_NEW_FS | I915_NEW_VS + I915_NEW_RASTERIZER | I915_NEW_FS | I915_NEW_VS }; diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c index fa327571b9b..5171cca9ea6 100644 --- a/src/gallium/drivers/ilo/ilo_screen.c +++ b/src/gallium/drivers/ilo/ilo_screen.c @@ -485,6 +485,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h index 62d99bbaac8..d4bd02d0225 100644 --- 
a/src/gallium/drivers/llvmpipe/lp_context.h +++ b/src/gallium/drivers/llvmpipe/lp_context.h @@ -82,8 +82,6 @@ struct llvmpipe_context { struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS]; struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; struct pipe_index_buffer index_buffer; - struct pipe_resource *mapped_vs_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS]; - struct pipe_resource *mapped_gs_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS]; unsigned num_samplers[PIPE_SHADER_TYPES]; unsigned num_sampler_views[PIPE_SHADER_TYPES]; diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c index edfb2040969..22ef5fc17f9 100644 --- a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c +++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c @@ -149,9 +149,6 @@ llvmpipe_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) draw_vs_reset_so(lp->vs); } } - - llvmpipe_cleanup_vertex_sampling(lp); - llvmpipe_cleanup_geometry_sampling(lp); /* * TODO: Flush only when a user vertex/index buffer is present diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index fb52f5dc063..879a2e7d2f0 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -310,6 +310,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index bd850519468..34d3c812b60 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -476,27 +476,30 @@ lp_setup_try_clear_zs(struct lp_setup_context *setup, uint64_t zsvalue = 0; uint32_t zmask32; uint8_t smask8; + enum pipe_format format = setup->fb.zsbuf->format; LP_DBG(DEBUG_SETUP, "%s state %d\n", __FUNCTION__, setup->state); zmask32 = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0; smask8 = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0; - zsvalue = util_pack64_z_stencil(setup->fb.zsbuf->format, - depth, - stencil); + zsvalue = util_pack64_z_stencil(format, depth, stencil); - /* - * XXX: should make a full mask here for things like D24X8, - * otherwise we'll do a read-modify-write clear later which - * should be unnecessary. - */ - zsmask = util_pack64_mask_z_stencil(setup->fb.zsbuf->format, - zmask32, - smask8); + zsmask = util_pack64_mask_z_stencil(format, zmask32, smask8); zsvalue &= zsmask; + if (format == PIPE_FORMAT_Z24X8_UNORM || + format == PIPE_FORMAT_X8Z24_UNORM) { + /* + * Make full mask if there's "X" bits so we can do full + * clear (without rmw). + */ + uint32_t zsmask_full = 0; + zsmask_full = util_pack_mask_z_stencil(format, ~0, ~0); + zsmask |= ~zsmask_full; + } + if (setup->state == SETUP_ACTIVE) { struct lp_scene *scene = setup->scene; @@ -796,13 +799,15 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup, unsigned num, struct pipe_sampler_view **views) { - unsigned i; + unsigned i, max_tex_num; LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS); - for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) { + max_tex_num = MAX2(num, setup->fs.current_tex_num); + + for (i = 0; i < max_tex_num; i++) { struct pipe_sampler_view *view = i < num ? 
views[i] : NULL; if (view) { @@ -922,7 +927,11 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup, assert(jit_tex->base); } } + else { + pipe_resource_reference(&setup->fs.current_tex[i], NULL); + } } + setup->fs.current_tex_num = num; setup->dirty |= LP_SETUP_NEW_FS; } diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h index 80acd74bddd..03bb8ce2b6f 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_context.h +++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h @@ -133,6 +133,7 @@ struct lp_setup_context const struct lp_rast_state *stored; /**< what's in the scene */ struct lp_rast_state current; /**< currently set state */ struct pipe_resource *current_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + unsigned current_tex_num; } fs; /** fragment shader constants */ diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index aa241761586..907129dbd1b 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -556,7 +556,7 @@ do_triangle_ccw(struct lp_setup_context *setup, /* Calculate trivial reject values: */ - eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy), + eo = vec_sub_epi32(vec_andnot_si128(dcdy_neg_mask, dcdy), vec_and(dcdx_neg_mask, dcdx)); /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */ diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h index 2da6caaef16..78918cf984d 100644 --- a/src/gallium/drivers/llvmpipe/lp_state.h +++ b/src/gallium/drivers/llvmpipe/lp_state.h @@ -130,16 +130,10 @@ void llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *ctx, unsigned num, struct pipe_sampler_view **views); -void -llvmpipe_cleanup_vertex_sampling(struct llvmpipe_context *ctx); - void llvmpipe_prepare_geometry_sampling(struct llvmpipe_context *ctx, unsigned num, struct pipe_sampler_view **views); -void -llvmpipe_cleanup_geometry_sampling(struct llvmpipe_context *ctx); - #endif diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c index 34961cbbac5..c90f2f270fe 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_derived.c +++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c @@ -190,8 +190,10 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe ) llvmpipe->tex_timestamp = lp_screen->timestamp; llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW; } - - if (llvmpipe->dirty & (LP_NEW_FS | + + /* This needs LP_NEW_RASTERIZER because of draw_prepare_shader_outputs(). */ + if (llvmpipe->dirty & (LP_NEW_RASTERIZER | + LP_NEW_FS | LP_NEW_VS)) compute_vertex_info(llvmpipe); diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c index 1e055878f7c..32bf9fdd25d 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c +++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c @@ -98,8 +98,9 @@ llvmpipe_bind_sampler_states(struct pipe_context *pipe, llvmpipe->samplers[shader], llvmpipe->num_samplers[shader]); } - - llvmpipe->dirty |= LP_NEW_SAMPLER; + else { + llvmpipe->dirty |= LP_NEW_SAMPLER; + } } @@ -128,6 +129,15 @@ llvmpipe_set_sampler_views(struct pipe_context *pipe, */ pipe_sampler_view_release(pipe, &llvmpipe->sampler_views[shader][start + i]); + /* + * Warn if someone tries to set a view created in a different context + * (which is why we need the hack above in the first place). + * An assert would be better but st/mesa relies on it... 
+ */ + if (views[i] && views[i]->context != pipe) { + debug_printf("Illegal setting of sampler_view %d created in another " + "context\n", i); + } pipe_sampler_view_reference(&llvmpipe->sampler_views[shader][start + i], views[i]); } @@ -146,8 +156,9 @@ llvmpipe_set_sampler_views(struct pipe_context *pipe, llvmpipe->sampler_views[shader], llvmpipe->num_sampler_views[shader]); } - - llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW; + else { + llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW; + } } @@ -228,8 +239,7 @@ prepare_shader_sampling( struct llvmpipe_context *lp, unsigned num, struct pipe_sampler_view **views, - unsigned shader_type, - struct pipe_resource *mapped_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS]) + unsigned shader_type) { unsigned i; @@ -242,7 +252,7 @@ prepare_shader_sampling( if (!num) return; - for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) { + for (i = 0; i < num; i++) { struct pipe_sampler_view *view = i < num ? views[i] : NULL; if (view) { @@ -253,11 +263,6 @@ prepare_shader_sampling( unsigned first_level = 0; unsigned last_level = 0; - /* We're referencing the texture's internal data, so save a - * reference to it. - */ - pipe_resource_reference(&mapped_tex[i], tex); - if (!lp_tex->dt) { /* regular texture - setup array of mipmap level offsets */ struct pipe_resource *res = view->texture; @@ -335,47 +340,28 @@ prepare_shader_sampling( /** - * Called during state validation when LP_NEW_SAMPLER_VIEW is set. + * Called whenever we're about to draw (no dirty flag, FIXME?). */ void llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *lp, unsigned num, struct pipe_sampler_view **views) { - prepare_shader_sampling(lp, num, views, PIPE_SHADER_VERTEX, - lp->mapped_vs_tex); -} - -void -llvmpipe_cleanup_vertex_sampling(struct llvmpipe_context *ctx) -{ - unsigned i; - for (i = 0; i < Elements(ctx->mapped_vs_tex); i++) { - pipe_resource_reference(&ctx->mapped_vs_tex[i], NULL); - } + prepare_shader_sampling(lp, num, views, PIPE_SHADER_VERTEX); } /** - * Called during state validation when LP_NEW_SAMPLER_VIEW is set. + * Called whenever we're about to draw (no dirty flag, FIXME?). */ void llvmpipe_prepare_geometry_sampling(struct llvmpipe_context *lp, unsigned num, struct pipe_sampler_view **views) { - prepare_shader_sampling(lp, num, views, PIPE_SHADER_GEOMETRY, - lp->mapped_gs_tex); + prepare_shader_sampling(lp, num, views, PIPE_SHADER_GEOMETRY); } -void -llvmpipe_cleanup_geometry_sampling(struct llvmpipe_context *ctx) -{ - unsigned i; - for (i = 0; i < Elements(ctx->mapped_gs_tex); i++) { - pipe_resource_reference(&ctx->mapped_gs_tex[i], NULL); - } -} void llvmpipe_init_sampler_funcs(struct llvmpipe_context *llvmpipe) diff --git a/src/gallium/drivers/llvmpipe/lp_state_so.c b/src/gallium/drivers/llvmpipe/lp_state_so.c index 2af04cdf1c3..b2afd6fbf70 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_so.c +++ b/src/gallium/drivers/llvmpipe/lp_state_so.c @@ -70,6 +70,15 @@ llvmpipe_set_so_targets(struct pipe_context *pipe, int i; for (i = 0; i < num_targets; i++) { const boolean append = (offsets[i] == (unsigned)-1); + /* + * Warn if the so target was created in another context. + * XXX Not entirely sure if mesa/st may rely on this? + * Otherwise should just assert. 
+ */ + if (targets[i] && targets[i]->context != pipe) { + debug_printf("Illegal setting of so target with target %d created in " + "another context\n", i); + } pipe_so_target_reference((struct pipe_stream_output_target **)&llvmpipe->so_targets[i], targets[i]); /* If we're not appending then lets set the internal offset to what was requested */ diff --git a/src/gallium/drivers/llvmpipe/lp_state_surface.c b/src/gallium/drivers/llvmpipe/lp_state_surface.c index c879ba9751d..b20b9c5cdd5 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_surface.c +++ b/src/gallium/drivers/llvmpipe/lp_state_surface.c @@ -52,6 +52,7 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe, struct llvmpipe_context *lp = llvmpipe_context(pipe); boolean changed = !util_framebuffer_state_equal(&lp->framebuffer, fb); + unsigned i; assert(fb->width <= LP_MAX_WIDTH); assert(fb->height <= LP_MAX_HEIGHT); @@ -66,10 +67,22 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe, const struct util_format_description *depth_desc = util_format_description(depth_format); + if (lp->framebuffer.zsbuf && lp->framebuffer.zsbuf->context != pipe) { + debug_printf("Illegal setting of fb state with zsbuf created in " + "another context\n"); + } + for (i = 0; i < fb->nr_cbufs; i++) { + if (lp->framebuffer.cbufs[i] && + lp->framebuffer.cbufs[i]->context != pipe) { + debug_printf("Illegal setting of fb state with cbuf %d created in " + "another context\n", i); + } + } + util_copy_framebuffer_state(&lp->framebuffer, fb); if (LP_PERF & PERF_NO_DEPTH) { - pipe_surface_reference(&lp->framebuffer.zsbuf, NULL); + pipe_surface_reference(&lp->framebuffer.zsbuf, NULL); } /* diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp index 1bf7240e131..f58cf97646e 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp @@ -615,6 +615,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, case FILE_MEMORY_CONST: case FILE_MEMORY_SHARED: case FILE_SHADER_INPUT: + case FILE_SHADER_OUTPUT: hi->getSrc(s)->reg.data.offset += 4; break; default: @@ -625,7 +626,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, } } if (srcNr == 2) { - lo->setDef(1, carry); + lo->setFlagsDef(1, carry); hi->setFlagsSrc(hi->srcCount(), carry); } return hi; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index b1064bf0a92..17cb484d2ba 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -75,7 +75,8 @@ private: void emitLOAD(const Instruction *); void emitSTORE(const Instruction *); void emitMOV(const Instruction *); - void emitMEMBAR(const Instruction *); + void emitATOM(const Instruction *); + void emitCCTL(const Instruction *); void emitINTERP(const Instruction *); void emitAFETCH(const Instruction *); @@ -123,6 +124,7 @@ private: void emitPIXLD(const Instruction *); void emitBAR(const Instruction *); + void emitMEMBAR(const Instruction *); void emitFlow(const Instruction *); @@ -698,6 +700,10 @@ CodeEmitterGK110::emitIMAD(const Instruction *i) if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) code[1] |= 1 << 25; + + if (i->flagsDef >= 0) code[1] |= 1 << 18; + if (i->flagsSrc >= 0) code[1] |= 1 << 20; + SAT_(35); } @@ -1252,8 +1258,32 @@ CodeEmitterGK110::emitPIXLD(const Instruction *i) void CodeEmitterGK110::emitBAR(const 
Instruction *i) { - /* TODO */ - emitNOP(i); + code[0] = 0x00000002; + code[1] = 0x85400000; + + switch (i->subOp) { + case NV50_IR_SUBOP_BAR_ARRIVE: code[1] |= 0x08; break; + case NV50_IR_SUBOP_BAR_RED_AND: code[1] |= 0x50; break; + case NV50_IR_SUBOP_BAR_RED_OR: code[1] |= 0x90; break; + case NV50_IR_SUBOP_BAR_RED_POPC: code[1] |= 0x10; break; + default: + code[1] |= 0x20; + assert(i->subOp == NV50_IR_SUBOP_BAR_SYNC); + break; + } + + emitPredicate(i); + + srcId(i->src(0), 10); + srcId(i->src(1), 23); +} + +void CodeEmitterGK110::emitMEMBAR(const Instruction *i) +{ + code[0] = 0x00000002 | NV50_IR_SUBOP_MEMBAR_SCOPE(i->subOp) << 8; + code[1] = 0x7cc00000; + + emitPredicate(i); } void @@ -1587,6 +1617,10 @@ CodeEmitterGK110::emitSTORE(const Instruction *i) srcId(i->src(1), 2); srcId(i->src(0).getIndirect(0), 10); + if (i->src(0).getFile() == FILE_MEMORY_GLOBAL && + i->src(0).isIndirect(0) && + i->getIndirect(0, 0)->reg.size == 8) + code[1] |= 1 << 23; } void @@ -1597,7 +1631,7 @@ CodeEmitterGK110::emitLOAD(const Instruction *i) switch (i->src(0).getFile()) { case FILE_MEMORY_GLOBAL: code[1] = 0xc0000000; code[0] = 0x00000000; break; case FILE_MEMORY_LOCAL: code[1] = 0x7a000000; code[0] = 0x00000002; break; - case FILE_MEMORY_SHARED: code[1] = 0x7ac00000; code[0] = 0x00000002; break; + case FILE_MEMORY_SHARED: code[1] = 0x7a400000; code[0] = 0x00000002; break; case FILE_MEMORY_CONST: if (!i->src(0).isIndirect(0) && typeSizeof(i->dType) == 4) { emitMOV(i); @@ -1628,7 +1662,13 @@ CodeEmitterGK110::emitLOAD(const Instruction *i) emitPredicate(i); defId(i->def(0), 2); - srcId(i->src(0).getIndirect(0), 10); + if (i->getIndirect(0, 0)) { + srcId(i->src(0).getIndirect(0), 10); + if (i->getIndirect(0, 0)->reg.size == 8) + code[1] |= 1 << 23; + } else { + code[0] |= 255 << 10; + } } uint8_t @@ -1683,10 +1723,83 @@ CodeEmitterGK110::emitMOV(const Instruction *i) } } -void CodeEmitterGK110::emitMEMBAR(const Instruction *i) +static inline bool +uses64bitAddress(const Instruction *ldst) { - code[0] = 0x00000002 | NV50_IR_SUBOP_MEMBAR_SCOPE(i->subOp) << 8; - code[1] = 0x7cc00000; + return ldst->src(0).getFile() == FILE_MEMORY_GLOBAL && + ldst->src(0).isIndirect(0) && + ldst->getIndirect(0, 0)->reg.size == 8; +} + +void +CodeEmitterGK110::emitATOM(const Instruction *i) +{ + code[0] = 0x00000002; + if (i->subOp == NV50_IR_SUBOP_ATOM_CAS) + code[1] = 0x77800000; + else + code[1] = 0x68000000; + + switch (i->subOp) { + case NV50_IR_SUBOP_ATOM_CAS: break; + case NV50_IR_SUBOP_ATOM_EXCH: code[1] |= 0x04000000; break; + default: code[1] |= i->subOp << 23; break; + } + + switch (i->dType) { + case TYPE_U32: break; + case TYPE_S32: code[1] |= 0x00100000; break; + case TYPE_U64: code[1] |= 0x00200000; break; + case TYPE_F32: code[1] |= 0x00300000; break; + case TYPE_B128: code[1] |= 0x00400000; break; /* TODO: U128 */ + case TYPE_S64: code[1] |= 0x00500000; break; + default: assert(!"unsupported type"); break; + } + + emitPredicate(i); + + /* TODO: cas: check that src regs line up */ + /* TODO: cas: flip bits if $r255 is used */ + srcId(i->src(1), 23); + + if (i->defExists(0)) + defId(i->def(0), 2); + else + code[0] |= 255 << 2; + + const int32_t offset = SDATA(i->src(0)).offset; + assert(offset < 0x80000 && offset >= -0x80000); + code[0] |= (offset & 1) << 31; + code[1] |= (offset & 0xffffe) >> 1; + + if (i->getIndirect(0, 0)) { + srcId(i->getIndirect(0, 0), 10); + if (i->getIndirect(0, 0)->reg.size == 8) + code[1] |= 1 << 19; + } else { + code[0] |= 255 << 10; + } +} + +void +CodeEmitterGK110::emitCCTL(const 
Instruction *i) +{ + int32_t offset = SDATA(i->src(0)).offset; + + code[0] = 0x00000002 | (i->subOp << 2); + + if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) { + code[1] = 0x7b000000; + } else { + code[1] = 0x7c000000; + offset &= 0xffffff; + } + code[0] |= offset << 23; + code[1] |= offset >> 9; + + if (uses64bitAddress(i)) + code[1] |= 1 << 23; + srcId(i->src(0).getIndirect(0), 10); emitPredicate(i); } @@ -1925,6 +2038,12 @@ CodeEmitterGK110::emitInstruction(Instruction *insn) case OP_MEMBAR: emitMEMBAR(insn); break; + case OP_ATOM: + emitATOM(insn); + break; + case OP_CCTL: + emitCCTL(insn); + break; case OP_PHI: case OP_UNION: case OP_CONSTRAINT: diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index ec74e7ac811..1fa0eb6da6d 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -176,6 +176,8 @@ private: void emitISBERD(); void emitAL2P(); void emitIPA(); + void emitATOM(); + void emitCCTL(); void emitPIXLD(); @@ -1552,11 +1554,13 @@ CodeEmitterGM107::emitLOP() break; } emitPRED (0x30); + emitX (0x2b); emitField(0x29, 2, lop); emitINV (0x28, insn->src(1)); emitINV (0x27, insn->src(0)); } else { emitInsn (0x04000000); + emitX (0x39); emitINV (0x38, insn->src(1)); emitINV (0x37, insn->src(0)); emitField(0x35, 2, lop); @@ -1624,9 +1628,11 @@ CodeEmitterGM107::emitIADD() emitNEG(0x31, insn->src(0)); emitNEG(0x30, insn->src(1)); emitCC (0x2f); + emitX (0x2b); } else { emitInsn(0x1c000000); emitSAT (0x36); + emitX (0x35); emitCC (0x34); emitIMMD(0x14, 32, insn->src(1)); } @@ -2146,6 +2152,7 @@ CodeEmitterGM107::emitLD() emitPRED (0x3a); emitLDSTc(0x38); emitLDSTs(0x35, insn->dType); + emitField(0x34, 1, insn->src(0).getIndirect(0)->getSize() == 8); emitADDR (0x08, 0x14, 32, 0, insn->src(0)); emitGPR (0x00, insn->def(0)); } @@ -2176,6 +2183,7 @@ CodeEmitterGM107::emitST() emitPRED (0x3a); emitLDSTc(0x38); emitLDSTs(0x35, insn->dType); + emitField(0x34, 1, insn->src(0).getIndirect(0)->getSize() == 8); emitADDR (0x08, 0x14, 32, 0, insn->src(0)); emitGPR (0x00, insn->src(1)); } @@ -2296,6 +2304,50 @@ CodeEmitterGM107::emitIPA() emitGPR(0x27); } +void +CodeEmitterGM107::emitATOM() +{ + unsigned dType, subOp; + switch (insn->dType) { + case TYPE_U32: dType = 0; break; + case TYPE_S32: dType = 1; break; + case TYPE_U64: dType = 2; break; + case TYPE_F32: dType = 3; break; + case TYPE_B128: dType = 4; break; + case TYPE_S64: dType = 5; break; + default: assert(!"unexpected dType"); dType = 0; break; + } + if (insn->subOp == NV50_IR_SUBOP_ATOM_EXCH) + subOp = 8; + else + subOp = insn->subOp; + assert(insn->subOp != NV50_IR_SUBOP_ATOM_CAS); /* XXX */ + + emitInsn (0xed000000); + emitField(0x34, 4, subOp); + emitField(0x31, 3, dType); + emitField(0x30, 1, insn->src(0).getIndirect(0)->getSize() == 8); + emitGPR (0x14, insn->src(1)); + emitADDR (0x08, 0x1c, 20, 0, insn->src(0)); + emitGPR (0x00, insn->def(0)); +} + +void +CodeEmitterGM107::emitCCTL() +{ + unsigned width; + if (insn->src(0).getFile() == FILE_MEMORY_GLOBAL) { + emitInsn(0xef600000); + width = 30; + } else { + emitInsn(0xef800000); + width = 22; + } + emitField(0x34, 1, insn->src(0).getIndirect(0)->getSize() == 8); + emitADDR (0x08, 0x16, width, 2, insn->src(0)); + emitField(0x00, 4, insn->subOp); +} + /******************************************************************************* * surface ******************************************************************************/ @@ 
-2795,6 +2847,12 @@ CodeEmitterGM107::emitInstruction(Instruction *i) break; } break; + case OP_ATOM: + emitATOM(); + break; + case OP_CCTL: + emitCCTL(); + break; case OP_VFETCH: emitALD(); break; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp index c126c085daf..bc8354deba1 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp @@ -1463,6 +1463,7 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp) if (i->encSize == 4) { assert(i->op == OP_RCP); + assert(!i->saturate); code[0] |= i->src(0).mod.abs() << 15; code[0] |= i->src(0).mod.neg() << 22; emitForm_MUL(i); @@ -1470,6 +1471,10 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp) code[1] = subOp << 29; code[1] |= i->src(0).mod.abs() << 20; code[1] |= i->src(0).mod.neg() << 26; + if (i->saturate) { + assert(subOp == 6 && i->op == OP_EX2); + code[1] |= 1 << 27; + } emitForm_MAD(i); } } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 7b313f3c39c..9c4a38f291b 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -95,6 +95,13 @@ public: return tgsi_util_get_src_register_swizzle(®, chan); } + int getArrayId() const + { + if (isIndirect(0)) + return fsr->Indirect.ArrayID; + return 0; + } + nv50_ir::Modifier getMod(int chan) const; SrcRegister getIndirect(int dim) const @@ -154,6 +161,13 @@ public: return SrcRegister(fdr->Indirect); } + int getArrayId() const + { + if (isIndirect(0)) + return fdr->Indirect.ArrayID; + return 0; + } + private: const struct tgsi_dst_register reg; const struct tgsi_full_dst_register *fdr; @@ -809,7 +823,10 @@ public: // these registers are per-subroutine, cannot be used for parameter passing std::set locals; - bool mainTempsInLMem; + std::set indirectTempArrays; + std::map indirectTempOffsets; + std::map > tempArrayInfo; + std::vector tempArrayId; int clipVertexOutput; @@ -841,8 +858,6 @@ Source::Source(struct nv50_ir_prog_info *prog) : info(prog) if (prog->dbgFlags & NV50_IR_DEBUG_BASIC) tgsi_dump(tokens, 0); - - mainTempsInLMem = false; } Source::~Source() @@ -872,6 +887,7 @@ bool Source::scanSource() textureViews.resize(scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1); //resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1); + tempArrayId.resize(scan.file_max[TGSI_FILE_TEMPORARY] + 1); info->immd.bufSize = 0; @@ -917,8 +933,16 @@ bool Source::scanSource() } tgsi_parse_free(&parse); - if (mainTempsInLMem) - info->bin.tlsSpace += (scan.file_max[TGSI_FILE_TEMPORARY] + 1) * 16; + if (indirectTempArrays.size()) { + int tempBase = 0; + for (std::set::const_iterator it = indirectTempArrays.begin(); + it != indirectTempArrays.end(); ++it) { + std::pair& info = tempArrayInfo[*it]; + indirectTempOffsets.insert(std::make_pair(*it, tempBase - info.first)); + tempBase += info.second; + } + info->bin.tlsSpace += tempBase * 16; + } if (info->io.genUserClip > 0) { info->io.clipDistances = info->io.genUserClip; @@ -1028,6 +1052,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) unsigned sn = TGSI_SEMANTIC_GENERIC; unsigned si = 0; const unsigned first = decl->Range.First, last = decl->Range.Last; + const int arrayId = decl->Array.ArrayID; if (decl->Declaration.Semantic) { sn = decl->Semantic.Name; @@ -1172,8 +1197,14 @@ bool 
Source::scanDeclaration(const struct tgsi_full_declaration *decl) for (i = first; i <= last; ++i) textureViews[i].target = decl->SamplerView.Resource; break; - case TGSI_FILE_NULL: case TGSI_FILE_TEMPORARY: + for (i = first; i <= last; ++i) + tempArrayId[i] = arrayId; + if (arrayId) + tempArrayInfo.insert(std::make_pair(arrayId, std::make_pair( + first, last - first + 1))); + break; + case TGSI_FILE_NULL: case TGSI_FILE_ADDRESS: case TGSI_FILE_CONSTANT: case TGSI_FILE_IMMEDIATE: @@ -1223,7 +1254,7 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst) } else if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) { if (insn.getDst(0).isIndirect(0)) - mainTempsInLMem = true; + indirectTempArrays.insert(insn.getDst(0).getArrayId()); } } @@ -1231,7 +1262,7 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst) Instruction::SrcRegister src = insn.getSrc(s); if (src.getFile() == TGSI_FILE_TEMPORARY) { if (src.isIndirect(0)) - mainTempsInLMem = true; + indirectTempArrays.insert(src.getArrayId()); } else /* if (src.getFile() == TGSI_FILE_RESOURCE) { @@ -1337,6 +1368,7 @@ private: void storeDst(const tgsi::Instruction::DstRegister dst, int c, Value *val, Value *ptr); + void adjustTempIndex(int arrayId, int &idx, int &idx2d) const; Value *applySrcMod(Value *, int s, int c); Symbol *makeSym(uint file, int fileIndex, int idx, int c, uint32_t addr); @@ -1416,6 +1448,7 @@ private: DataType srcTy; DataArray tData; // TGSI_FILE_TEMPORARY + DataArray lData; // TGSI_FILE_TEMPORARY, for indirect arrays DataArray aData; // TGSI_FILE_ADDRESS DataArray pData; // TGSI_FILE_PREDICATE DataArray oData; // TGSI_FILE_OUTPUT (if outputs in registers) @@ -1619,7 +1652,7 @@ Converter::getArrayForFile(unsigned file, int idx) { switch (file) { case TGSI_FILE_TEMPORARY: - return &tData; + return idx == 0 ? &tData : &lData; case TGSI_FILE_PREDICATE: return &pData; case TGSI_FILE_ADDRESS: @@ -1641,11 +1674,23 @@ Converter::shiftAddress(Value *index) return mkOp2v(OP_SHL, TYPE_U32, getSSA(4, FILE_ADDRESS), index, mkImm(4)); } +void +Converter::adjustTempIndex(int arrayId, int &idx, int &idx2d) const +{ + std::map::const_iterator it = + code->indirectTempOffsets.find(arrayId); + if (it == code->indirectTempOffsets.end()) + return; + + idx2d = 1; + idx += it->second; +} + Value * Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr) { - const int idx2d = src.is2D() ? src.getIndex(1) : 0; - const int idx = src.getIndex(0); + int idx2d = src.is2D() ? src.getIndex(1) : 0; + int idx = src.getIndex(0); const int swz = src.getSwizzle(c); Instruction *ld; @@ -1686,6 +1731,13 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr) ld = mkOp1(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c)); ld->perPatch = info->sv[idx].patch; return ld->getDef(0); + case TGSI_FILE_TEMPORARY: { + int arrayid = src.getArrayId(); + if (!arrayid) + arrayid = code->tempArrayId[idx]; + adjustTempIndex(arrayid, idx, idx2d); + } + /* fallthrough */ default: return getArrayForFile(src.getFile(), idx2d)->load( sub.cur->values, idx, swz, shiftAddress(ptr)); @@ -1697,8 +1749,8 @@ Converter::acquireDst(int d, int c) { const tgsi::Instruction::DstRegister dst = tgsi.getDst(d); const unsigned f = dst.getFile(); - const int idx = dst.getIndex(0); - const int idx2d = dst.is2D() ? dst.getIndex(1) : 0; + int idx = dst.getIndex(0); + int idx2d = dst.is2D() ? 
dst.getIndex(1) : 0; if (dst.isMasked(c)/* || f == TGSI_FILE_RESOURCE*/) return NULL; @@ -1708,6 +1760,13 @@ Converter::acquireDst(int d, int c) (f == TGSI_FILE_OUTPUT && prog->getType() != Program::TYPE_FRAGMENT)) return getScratch(); + if (f == TGSI_FILE_TEMPORARY) { + int arrayid = dst.getArrayId(); + if (!arrayid) + arrayid = code->tempArrayId[idx]; + adjustTempIndex(arrayid, idx, idx2d); + } + return getArrayForFile(f, idx2d)-> acquire(sub.cur->values, idx, c); } @@ -1739,8 +1798,8 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c, Value *val, Value *ptr) { const unsigned f = dst.getFile(); - const int idx = dst.getIndex(0); - const int idx2d = dst.is2D() ? dst.getIndex(1) : 0; + int idx = dst.getIndex(0); + int idx2d = dst.is2D() ? dst.getIndex(1) : 0; if (f == TGSI_FILE_SYSTEM_VALUE) { assert(!ptr); @@ -1763,6 +1822,13 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c, f == TGSI_FILE_PREDICATE || f == TGSI_FILE_ADDRESS || f == TGSI_FILE_OUTPUT) { + if (f == TGSI_FILE_TEMPORARY) { + int arrayid = dst.getArrayId(); + if (!arrayid) + arrayid = code->tempArrayId[idx]; + adjustTempIndex(arrayid, idx, idx2d); + } + getArrayForFile(f, idx2d)->store(sub.cur->values, idx, c, ptr, val); } else { assert(!"invalid dst file"); @@ -3326,18 +3392,17 @@ Converter::exportOutputs() Converter::Converter(Program *ir, const tgsi::Source *code) : BuildUtil(ir), code(code), tgsi(NULL), - tData(this), aData(this), pData(this), oData(this) + tData(this), lData(this), aData(this), pData(this), oData(this) { info = code->info; - const DataFile tFile = code->mainTempsInLMem ? FILE_MEMORY_LOCAL : FILE_GPR; - const unsigned tSize = code->fileSize(TGSI_FILE_TEMPORARY); const unsigned pSize = code->fileSize(TGSI_FILE_PREDICATE); const unsigned aSize = code->fileSize(TGSI_FILE_ADDRESS); const unsigned oSize = code->fileSize(TGSI_FILE_OUTPUT); - tData.setup(TGSI_FILE_TEMPORARY, 0, 0, tSize, 4, 4, tFile, 0); + tData.setup(TGSI_FILE_TEMPORARY, 0, 0, tSize, 4, 4, FILE_GPR, 0); + lData.setup(TGSI_FILE_TEMPORARY, 1, 0, tSize, 4, 4, FILE_MEMORY_LOCAL, 0); pData.setup(TGSI_FILE_PREDICATE, 0, 0, pSize, 4, 4, FILE_PREDICATE, 0); aData.setup(TGSI_FILE_ADDRESS, 0, 0, aSize, 4, 4, FILE_GPR, 0); oData.setup(TGSI_FILE_OUTPUT, 0, 0, oSize, 4, 4, FILE_GPR, 0); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 6530078b938..dc1ab769b98 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -540,6 +540,12 @@ NVC0LegalizePostRA::visit(BasicBlock *bb) // It seems like barriers are never required for tessellation since // the warp size is 32, and there are always at most 32 tcs threads. bb->remove(i); + } else + if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) { + int offset = i->src(0).get()->reg.data.offset; + if (abs(offset) > 0x10000) + i->src(0).get()->reg.fileIndex += offset >> 16; + i->src(0).get()->reg.data.offset = (int)(short)offset; } else { // TODO: Move this to before register allocation for operations that // need the $c register ! 
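
(Aside, not part of the patch: the nv50_ir_from_tgsi.cpp hunks above pack each indirectly addressed TGSI temporary array into its own chunk of local memory, keyed by ArrayID, instead of demoting every temporary. The following is a minimal, self-contained C++ sketch of that offset bookkeeping; TempArrayLayout, layout(), adjust() and lmemSlots are illustrative names, not the Mesa classes.)

// Sketch: per-ArrayID base offsets for indirectly addressed temp arrays.
#include <cstdio>
#include <map>
#include <set>
#include <utility>

struct TempArrayLayout {
   // arrayId -> (first temp index, length), gathered from declarations
   std::map<int, std::pair<int, int>> info;
   // arrayIds seen with an indirect (relative) access
   std::set<int> indirect;
   // arrayId -> value added to a temp index to get its local-memory slot
   std::map<int, int> offsets;
   int lmemSlots = 0; // total 16-byte (vec4) slots needed in local memory

   void layout() {
      for (int id : indirect) {
         const std::pair<int, int> &fi = info[id];
         // this array's first temp lands at lmemSlots, so the adjustment
         // is (lmemSlots - first)
         offsets[id] = lmemSlots - fi.first;
         lmemSlots += fi.second;
      }
   }

   // Returns true (and rewrites idx) if this access lives in local memory;
   // directly addressed temps stay plain registers and are left alone.
   bool adjust(int arrayId, int &idx) const {
      std::map<int, int>::const_iterator it = offsets.find(arrayId);
      if (it == offsets.end())
         return false;
      idx += it->second;
      return true;
   }
};

int main() {
   TempArrayLayout l;
   l.info[1] = std::make_pair(0, 4);  // TEMP[0..3],  ArrayID 1
   l.info[2] = std::make_pair(4, 8);  // TEMP[4..11], ArrayID 2
   l.indirect = {2};                  // only ArrayID 2 is indexed indirectly
   l.layout();

   int idx = 6;                       // TEMP[6], part of ArrayID 2
   if (l.adjust(2, idx))
      printf("TEMP[6] -> local-memory slot %d (of %d)\n", idx, l.lmemSlots);
   return 0;
}

(The point of the design is that only arrays which are actually indexed indirectly pay the local-memory cost; everything else keeps being register-allocated as before.)
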
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index f5c590eef10..95e9fdfc57d 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -171,7 +171,10 @@ LoadPropagation::isImmdLoad(Instruction *ld) if (!ld || (ld->op != OP_MOV) || ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8))) return false; - return ld->src(0).getFile() == FILE_IMMEDIATE; + + // A 0 can be replaced with a register, so it doesn't count as an immediate. + ImmediateValue val; + return ld->src(0).getImmediate(val) && !val.isInteger(0); } bool @@ -187,7 +190,8 @@ LoadPropagation::isAttribOrSharedLoad(Instruction *ld) void LoadPropagation::checkSwapSrc01(Instruction *insn) { - if (!prog->getTarget()->getOpInfo(insn).commutative) + const Target *targ = prog->getTarget(); + if (!targ->getOpInfo(insn).commutative) if (insn->op != OP_SET && insn->op != OP_SLCT) return; if (insn->src(1).getFile() != FILE_GPR) @@ -196,14 +200,15 @@ LoadPropagation::checkSwapSrc01(Instruction *insn) Instruction *i0 = insn->getSrc(0)->getInsn(); Instruction *i1 = insn->getSrc(1)->getInsn(); - if (isCSpaceLoad(i0)) { - if (!isCSpaceLoad(i1)) - insn->swapSources(0, 1); - else - return; - } else - if (isImmdLoad(i0)) { - if (!isCSpaceLoad(i1) && !isImmdLoad(i1)) + // Swap sources to inline the less frequently used source. That way, + // optimistically, it will eventually be able to remove the instruction. + int i0refs = insn->getSrc(0)->refCount(); + int i1refs = insn->getSrc(1)->refCount(); + + if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) { + if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) || + !targ->insnCanLoad(insn, 1, i1) || + i0refs < i1refs) insn->swapSources(0, 1); else return; @@ -1224,6 +1229,8 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) adds = 1; else return; + if (si->src(!adds).mod != Modifier(0)) + return; // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z)) // This is more operations, but if one of x, y is an immediate, then diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h index 673f8811ff3..e6e1912adae 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h @@ -192,7 +192,7 @@ public: virtual bool insnCanLoad(const Instruction *insn, int s, const Instruction *ld) const = 0; virtual bool insnCanLoadOffset(const Instruction *insn, int s, - int offset) const { return true; } + int offset) const = 0; virtual bool isOpSupported(operation, DataType) const = 0; virtual bool isAccessSupported(DataFile, DataType) const = 0; virtual bool isModSupported(const Instruction *, diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp index 101082e7491..2c4d7f53d60 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp @@ -99,6 +99,7 @@ static const struct opProperties _initProps[] = { OP_SET, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 }, { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_EX2, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0, 0x0 }, { OP_LG2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, { OP_RCP, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, { OP_RSQ, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 
0x0, 0x0 }, diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp index 014c652eede..a03afa8dc8d 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp @@ -383,6 +383,16 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s, return true; } +bool +TargetNVC0::insnCanLoadOffset(const Instruction *insn, int s, int offset) const +{ + const ValueRef& ref = insn->src(s); + if (ref.getFile() == FILE_MEMORY_CONST && + (insn->op != OP_LOAD || insn->subOp != NV50_IR_SUBOP_LDC_IS)) + return offset >= -0x8000 && offset < 0x8000; + return true; +} + bool TargetNVC0::isAccessSupported(DataFile file, DataType ty) const { diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h index 3c5c7480405..7d11cd96315 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h @@ -48,6 +48,8 @@ public: virtual bool insnCanLoad(const Instruction *insn, int s, const Instruction *ld) const; + virtual bool insnCanLoadOffset(const Instruction *insn, int s, + int offset) const; virtual bool isOpSupported(operation, DataType) const; virtual bool isAccessSupported(DataFile, DataType) const; virtual bool isModSupported(const Instruction *, int s, Modifier) const; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index 933330f107a..61d91fd4cce 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -183,6 +183,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 712835c1ce1..32da60e0a23 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -226,6 +226,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index ccf96fb2815..84dbd69b8a5 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -215,6 +215,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_VENDOR_ID: @@ -295,9 +296,10 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, if (shader == PIPE_SHADER_COMPUTE && class_3d >= NVE4_3D_CLASS) return NVE4_MAX_PIPE_CONSTBUFS_COMPUTE; return NVC0_MAX_PIPE_CONSTBUFS; - case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: return shader != PIPE_SHADER_FRAGMENT; + case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: + return shader != PIPE_SHADER_FRAGMENT || class_3d < GM107_3D_CLASS; case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: case 
PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: return 1; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c index c53f946a762..af072a8acdc 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c @@ -64,7 +64,7 @@ nvc0_decoder_bsp_next(struct nouveau_vp3_decoder *dec, bsp_size += num_bytes[i]; bsp_size += 256; /* the 4 end markers */ - if (!bsp_bo || bsp_size > bsp_bo->size) { + if (bsp_size > bsp_bo->size) { union nouveau_bo_config cfg; struct nouveau_bo *tmp_bo = NULL; diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c index 8823b8d3197..90c4f71a945 100644 --- a/src/gallium/drivers/r300/r300_screen.c +++ b/src/gallium/drivers/r300/r300_screen.c @@ -209,6 +209,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; /* SWTCL-only features. */ diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 08fdd361049..9b0f31270df 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -68,6 +68,7 @@ static const struct debug_named_value r600_debug_options[] = { static void r600_destroy_context(struct pipe_context *context) { struct r600_context *rctx = (struct r600_context *)context; + unsigned sh; r600_isa_destroy(rctx->isa); @@ -76,6 +77,11 @@ static void r600_destroy_context(struct pipe_context *context) pipe_resource_reference((struct pipe_resource**)&rctx->dummy_cmask, NULL); pipe_resource_reference((struct pipe_resource**)&rctx->dummy_fmask, NULL); + for (sh = 0; sh < PIPE_SHADER_TYPES; sh++) { + rctx->b.b.set_constant_buffer(&rctx->b.b, sh, R600_BUFFER_INFO_CONST_BUFFER, NULL); + free(rctx->driver_consts[sh].constants); + } + if (rctx->fixed_func_tcs_shader) rctx->b.b.delete_tcs_state(&rctx->b.b, rctx->fixed_func_tcs_shader); @@ -357,6 +363,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c index 6592c5bdeca..c7984c47304 100644 --- a/src/gallium/drivers/radeon/r600_buffer_common.c +++ b/src/gallium/drivers/radeon/r600_buffer_common.c @@ -210,8 +210,8 @@ static void r600_buffer_destroy(struct pipe_screen *screen, } static bool -r600_do_invalidate_resource(struct r600_common_context *rctx, - struct r600_resource *rbuffer) +r600_invalidate_buffer(struct r600_common_context *rctx, + struct r600_resource *rbuffer) { /* In AMD_pinned_memory, the user pointer association only gets * broken when the buffer is explicitly re-allocated. 
@@ -236,7 +236,9 @@ void r600_invalidate_resource(struct pipe_context *ctx, struct r600_common_context *rctx = (struct r600_common_context*)ctx; struct r600_resource *rbuffer = r600_resource(resource); - (void)r600_do_invalidate_resource(rctx, rbuffer); + /* We currently only do anything here for buffers */ + if (resource->target == PIPE_BUFFER) + (void)r600_invalidate_buffer(rctx, rbuffer); } static void *r600_buffer_get_transfer(struct pipe_context *ctx, @@ -306,7 +308,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { assert(usage & PIPE_TRANSFER_WRITE); - if (r600_do_invalidate_resource(rctx, rbuffer)) { + if (r600_invalidate_buffer(rctx, rbuffer)) { /* At this point, the buffer is always idle. */ usage |= PIPE_TRANSFER_UNSYNCHRONIZED; } } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index f6ff4a81bd4..3e20c3b81fa 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -349,6 +349,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index cc9718e42d3..2de7def8dd2 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -3728,6 +3728,9 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary, case R_0286CC_SPI_PS_INPUT_ENA: conf->spi_ps_input_ena = value; break; + case R_0286D0_SPI_PS_INPUT_ADDR: + /* Not used yet, but will be in the future */ + break; case R_0286E8_SPI_TMPRING_SIZE: case R_00B860_COMPUTE_TMPRING_SIZE: /* WAVESIZE is in units of 256 dwords. 
*/ @@ -3735,8 +3738,15 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary, G_00B860_WAVESIZE(value) * 256 * 4 * 1; break; default: - fprintf(stderr, "Warning: Compiler emitted unknown " - "config register: 0x%x\n", reg); + { + static bool printed; + + if (!printed) { + fprintf(stderr, "Warning: LLVM emitted unknown " + "config register: 0x%x\n", reg); + printed = true; + } + } break; } } diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index 143702a5650..3bc580899d4 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -260,6 +260,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index b21634f3d73..8d04222a0cd 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -357,6 +357,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TGSI_PACK_HALF_FLOAT: case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: + case PIPE_CAP_STRING_MARKER: return 0; case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: return 64; diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index fb41877017d..08c2dad8406 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -198,6 +198,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_STRING_MARKER: return 0; /* Stream output. */ diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h index 4b551ed0b41..f69a75be50e 100644 --- a/src/gallium/include/pipe/p_context.h +++ b/src/gallium/include/pipe/p_context.h @@ -678,6 +678,13 @@ struct pipe_context { void (*dump_debug_state)(struct pipe_context *ctx, FILE *stream, unsigned flags); + /** + * Emit string marker in cmdstream + */ + void (*emit_string_marker)(struct pipe_context *ctx, + const char *string, + int len); + /** * Generate mipmap. 
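For illustration only (not part of the series): the hunks above add PIPE_CAP_STRING_MARKER and a pipe_context::emit_string_marker hook. A state tracker might use the pair roughly as sketched below; the helper name and structure are assumptions, while the cap and the hook come from the patches.

   #include <string.h>
   #include "pipe/p_context.h"
   #include "pipe/p_defines.h"
   #include "pipe/p_screen.h"

   /* Hypothetical helper: emit a debug marker into the command stream when
    * the driver advertises support for it. */
   static void
   example_emit_marker(struct pipe_screen *screen, struct pipe_context *pipe,
                       const char *msg)
   {
      /* Drivers that do not implement the hook report the cap as 0. */
      if (screen->get_param(screen, PIPE_CAP_STRING_MARKER) &&
          pipe->emit_string_marker)
         pipe->emit_string_marker(pipe, msg, strlen(msg));
   }

All of the get_param changes in this series simply return 0 for the new cap, so existing drivers keep the old behavior until they implement the hook.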
* \return TRUE if mipmap generation succeeds, FALSE otherwise diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index cb837cd2597..b46187bc8a1 100644 --- a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -644,6 +644,7 @@ enum pipe_cap PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT, PIPE_CAP_INVALIDATE_BUFFER, PIPE_CAP_GENERATE_MIPMAP, + PIPE_CAP_STRING_MARKER, }; #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0) diff --git a/src/gallium/state_trackers/dri/dri_drawable.c b/src/gallium/state_trackers/dri/dri_drawable.c index f0cc4a2a3ef..adc51284767 100644 --- a/src/gallium/state_trackers/dri/dri_drawable.c +++ b/src/gallium/state_trackers/dri/dri_drawable.c @@ -492,8 +492,10 @@ dri_flush(__DRIcontext *cPriv, if (pipe->invalidate_resource && (flags & __DRI2_FLUSH_INVALIDATE_ANCILLARY)) { - pipe->invalidate_resource(pipe, drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]); - pipe->invalidate_resource(pipe, drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL]); + if (drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]) + pipe->invalidate_resource(pipe, drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]); + if (drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL]) + pipe->invalidate_resource(pipe, drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL]); } } diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c index 37a011799e2..b25c381d968 100644 --- a/src/gallium/state_trackers/va/context.c +++ b/src/gallium/state_trackers/va/context.c @@ -31,6 +31,7 @@ #include "util/u_memory.h" #include "util/u_handle_table.h" #include "util/u_video.h" +#include "vl/vl_deint_filter.h" #include "vl/vl_winsys.h" #include "va_private.h" @@ -296,6 +297,10 @@ vlVaDestroyContext(VADriverContextP ctx, VAContextID context_id) } context->decoder->destroy(context->decoder); } + if (context->deint) { + vl_deint_filter_cleanup(context->deint); + FREE(context->deint); + } FREE(context); handle_table_remove(drv->htab, context_id); pipe_mutex_unlock(drv->mutex); diff --git a/src/gallium/state_trackers/va/postproc.c b/src/gallium/state_trackers/va/postproc.c index 0cec0c88124..d06f01617df 100644 --- a/src/gallium/state_trackers/va/postproc.c +++ b/src/gallium/state_trackers/va/postproc.c @@ -29,6 +29,7 @@ #include "vl/vl_defines.h" #include "vl/vl_video_buffer.h" +#include "vl/vl_deint_filter.h" #include "va_private.h" @@ -174,6 +175,51 @@ static VAStatus vlVaPostProcBlit(vlVaDriver *drv, vlVaContext *context, return VA_STATUS_SUCCESS; } +static struct pipe_video_buffer * +vlVaApplyDeint(vlVaDriver *drv, vlVaContext *context, + VAProcPipelineParameterBuffer *param, + struct pipe_video_buffer *current, + unsigned field) +{ + vlVaSurface *prevprev, *prev, *next; + + if (param->num_forward_references < 1 || + param->num_backward_references < 2) + return current; + + prevprev = handle_table_get(drv->htab, param->backward_references[1]); + prev = handle_table_get(drv->htab, param->backward_references[0]); + next = handle_table_get(drv->htab, param->forward_references[0]); + + if (!prevprev || !prev || !next) + return current; + + if (context->deint && (context->deint->video_width != current->width || + context->deint->video_height != current->height)) { + vl_deint_filter_cleanup(context->deint); + FREE(context->deint); + context->deint = NULL; + } + + if (!context->deint) { + context->deint = MALLOC(sizeof(struct vl_deint_filter)); + if (!vl_deint_filter_init(context->deint, drv->pipe, current->width, + current->height, 
false, false)) { + FREE(context->deint); + context->deint = NULL; + return current; + } + } + + if (!vl_deint_filter_check_buffers(context->deint, prevprev->buffer, + prev->buffer, current, next->buffer)) + return current; + + vl_deint_filter_render(context->deint, prevprev->buffer, prev->buffer, + current, next->buffer, field); + return context->deint->video_buffer; +} + VAStatus vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf) { @@ -181,6 +227,7 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex VARectangle def_src_region, def_dst_region; const VARectangle *src_region, *dst_region; VAProcPipelineParameterBuffer *param; + struct pipe_video_buffer *src; vlVaSurface *src_surface; unsigned i; @@ -199,6 +246,8 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex if (!src_surface || !src_surface->buffer) return VA_STATUS_ERROR_INVALID_SURFACE; + src = src_surface->buffer; + for (i = 0; i < param->num_filters; i++) { vlVaBuffer *buf = handle_table_get(drv->htab, param->filters[i]); VAProcFilterParameterBufferBase *filter; @@ -222,6 +271,11 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex deinterlace = VL_COMPOSITOR_WEAVE; break; + case VAProcDeinterlacingMotionAdaptive: + src = vlVaApplyDeint(drv, context, param, src, + !!(deint->flags & VA_DEINTERLACING_BOTTOM_FIELD)); + break; + default: return VA_STATUS_ERROR_UNIMPLEMENTED; } @@ -239,10 +293,8 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex if (context->target->buffer_format != PIPE_FORMAT_NV12) return vlVaPostProcCompositor(drv, context, src_region, dst_region, - src_surface->buffer, context->target, - deinterlace); + src, context->target, deinterlace); else return vlVaPostProcBlit(drv, context, src_region, dst_region, - src_surface->buffer, context->target, - deinterlace); + src, context->target, deinterlace); } diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c index f23a88901f5..84a94949c47 100644 --- a/src/gallium/state_trackers/va/surface.c +++ b/src/gallium/state_trackers/va/surface.c @@ -691,13 +691,14 @@ vlVaQueryVideoProcFilterCaps(VADriverContextP ctx, VAContextID context, case VAProcFilterDeinterlacing: { VAProcFilterCapDeinterlacing *deint = filter_caps; - if (*num_filter_caps < 2) { - *num_filter_caps = 2; + if (*num_filter_caps < 3) { + *num_filter_caps = 3; return VA_STATUS_ERROR_MAX_NUM_EXCEEDED; } deint[i++].type = VAProcDeinterlacingBob; deint[i++].type = VAProcDeinterlacingWeave; + deint[i++].type = VAProcDeinterlacingMotionAdaptive; break; } @@ -750,9 +751,24 @@ vlVaQueryVideoProcPipelineCaps(VADriverContextP ctx, VAContextID context, for (i = 0; i < num_filters; i++) { vlVaBuffer *buf = handle_table_get(VL_VA_DRIVER(ctx)->htab, filters[i]); + VAProcFilterParameterBufferBase *filter; - if (!buf || buf->type >= VABufferTypeMax) + if (!buf || buf->type != VAProcFilterParameterBufferType) return VA_STATUS_ERROR_INVALID_BUFFER; + + filter = buf->data; + switch (filter->type) { + case VAProcFilterDeinterlacing: { + VAProcFilterParameterBufferDeinterlacing *deint = buf->data; + if (deint->algorithm == VAProcDeinterlacingMotionAdaptive) { + pipeline_cap->num_forward_references = 1; + pipeline_cap->num_backward_references = 2; + } + break; + } + default: + return VA_STATUS_ERROR_UNIMPLEMENTED; + } } return VA_STATUS_SUCCESS; diff --git a/src/gallium/state_trackers/va/va_private.h 
b/src/gallium/state_trackers/va/va_private.h index 7afd81a196d..614fa98fef7 100644 --- a/src/gallium/state_trackers/va/va_private.h +++ b/src/gallium/state_trackers/va/va_private.h @@ -236,6 +236,8 @@ typedef struct { VAPictureParameterBufferMPEG4 pps; uint8_t start_code[32]; } mpeg4; + + struct vl_deint_filter *deint; } vlVaContext; typedef struct { diff --git a/src/glsl/ast.h b/src/glsl/ast.h index f8ab0b71b7b..03df6c08b2b 100644 --- a/src/glsl/ast.h +++ b/src/glsl/ast.h @@ -699,17 +699,18 @@ struct ast_type_qualifier { bool merge_qualifier(YYLTYPE *loc, _mesa_glsl_parse_state *state, - const ast_type_qualifier &q); + const ast_type_qualifier &q, + bool is_single_layout_merge); bool merge_out_qualifier(YYLTYPE *loc, _mesa_glsl_parse_state *state, const ast_type_qualifier &q, - ast_node* &node); + ast_node* &node, bool create_node); bool merge_in_qualifier(YYLTYPE *loc, _mesa_glsl_parse_state *state, const ast_type_qualifier &q, - ast_node* &node); + ast_node* &node, bool create_node); ast_subroutine_list *subroutine_list; }; diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index 02e6e2d3d45..8d66131b2ca 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -487,15 +487,17 @@ unary_arithmetic_result_type(const struct glsl_type *type, * If the given types to the bit-logic operator are invalid, return * glsl_type::error_type. * - * \param type_a Type of LHS of bit-logic op - * \param type_b Type of RHS of bit-logic op + * \param value_a LHS of bit-logic op + * \param value_b RHS of bit-logic op */ static const struct glsl_type * -bit_logic_result_type(const struct glsl_type *type_a, - const struct glsl_type *type_b, +bit_logic_result_type(ir_rvalue * &value_a, ir_rvalue * &value_b, ast_operators op, struct _mesa_glsl_parse_state *state, YYLTYPE *loc) { + const glsl_type *type_a = value_a->type; + const glsl_type *type_b = value_b->type; + if (!state->check_bitwise_operations_allowed(loc)) { return glsl_type::error_type; } @@ -517,6 +519,36 @@ bit_logic_result_type(const struct glsl_type *type_a, return glsl_type::error_type; } + /* Prior to GLSL 4.0 / GL_ARB_gpu_shader5, implicit conversions didn't + * make sense for bitwise operations, as they don't operate on floats. + * + * GLSL 4.0 added implicit int -> uint conversions, which are relevant + * here. It wasn't clear whether or not we should apply them to bitwise + * operations. However, Khronos has decided that they should in future + * language revisions. Applications also rely on this behavior. We opt + * to apply them in general, but issue a portability warning. 
+ * + * See https://www.khronos.org/bugzilla/show_bug.cgi?id=1405 + */ + if (type_a->base_type != type_b->base_type) { + if (!apply_implicit_conversion(type_a, value_b, state) + && !apply_implicit_conversion(type_b, value_a, state)) { + _mesa_glsl_error(loc, state, + "could not implicitly convert operands to " + "`%s` operator", + ast_expression::operator_string(op)); + return glsl_type::error_type; + } else { + _mesa_glsl_warning(loc, state, + "some implementations may not support implicit " + "int -> uint conversions for `%s' operators; " + "consider casting explicitly for portability", + ast_expression::operator_string(op)); + } + type_a = value_a->type; + type_b = value_b->type; + } + /* "The fundamental types of the operands (signed or unsigned) must * match," */ @@ -1435,8 +1467,7 @@ ast_expression::do_hir(exec_list *instructions, case ast_bit_or: op[0] = this->subexpressions[0]->hir(instructions, state); op[1] = this->subexpressions[1]->hir(instructions, state); - type = bit_logic_result_type(op[0]->type, op[1]->type, this->oper, - state, &loc); + type = bit_logic_result_type(op[0], op[1], this->oper, state, &loc); result = new(ctx) ir_expression(operations[this->oper], type, op[0], op[1]); error_emitted = op[0]->type->is_error() || op[1]->type->is_error(); @@ -1626,8 +1657,7 @@ ast_expression::do_hir(exec_list *instructions, case ast_or_assign: { op[0] = this->subexpressions[0]->hir(instructions, state); op[1] = this->subexpressions[1]->hir(instructions, state); - type = bit_logic_result_type(op[0]->type, op[1]->type, this->oper, - state, &loc); + type = bit_logic_result_type(op[0], op[1], this->oper, state, &loc); ir_rvalue *temp_rhs = new(ctx) ir_expression(operations[this->oper], type, op[0], op[1]); error_emitted = @@ -6329,7 +6359,7 @@ ast_process_struct_or_iface_block_members(exec_list *instructions, qual_stream != block_stream) { _mesa_glsl_error(&loc, state, "stream layout qualifier on " "interface block member does not match " - "the interface block (%u vs %u)", qual_stream, + block_stream); } } diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp index 8643b7bfb76..e0e331152dd 100644 --- a/src/glsl/ast_type.cpp +++ b/src/glsl/ast_type.cpp @@ -74,9 +74,11 @@ ast_type_qualifier::has_layout() const || this->flags.q.row_major || this->flags.q.packed || this->flags.q.explicit_location + || this->flags.q.explicit_image_format || this->flags.q.explicit_index || this->flags.q.explicit_binding - || this->flags.q.explicit_offset; + || this->flags.q.explicit_offset + || this->flags.q.explicit_stream; } bool @@ -113,10 +115,16 @@ ast_type_qualifier::interpolation_string() const return NULL; } +/** + * This function merges both duplicate identifiers within a single layout and + * multiple layout qualifiers on a single variable declaration. The + * is_single_layout_merge param is used to differentiate between the two. 
+ */ bool ast_type_qualifier::merge_qualifier(YYLTYPE *loc, _mesa_glsl_parse_state *state, - const ast_type_qualifier &q) + const ast_type_qualifier &q, + bool is_single_layout_merge) { ast_type_qualifier ubo_mat_mask; ubo_mat_mask.flags.i = 0; @@ -156,7 +164,8 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc, allowed_duplicates_mask.flags.i |= stream_layout_mask.flags.i; - if ((this->flags.i & q.flags.i & ~allowed_duplicates_mask.flags.i) != 0) { + if (is_single_layout_merge && !state->has_enhanced_layouts() && + (this->flags.i & q.flags.i & ~allowed_duplicates_mask.flags.i) != 0) { _mesa_glsl_error(loc, state, "duplicate layout qualifiers used"); return false; @@ -207,11 +216,6 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc, this->flags.q.stream = 1; this->stream = state->out_qualifier->stream; } - } else { - if (q.flags.q.explicit_stream) { - _mesa_glsl_error(loc, state, - "duplicate layout `stream' qualifier"); - } } } @@ -294,13 +298,35 @@ bool ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc, _mesa_glsl_parse_state *state, const ast_type_qualifier &q, - ast_node* &node) + ast_node* &node, bool create_node) { void *mem_ctx = state; - const bool r = this->merge_qualifier(loc, state, q); + const bool r = this->merge_qualifier(loc, state, q, false); - if (state->stage == MESA_SHADER_TESS_CTRL) { - node = new(mem_ctx) ast_tcs_output_layout(*loc); + if (state->stage == MESA_SHADER_GEOMETRY) { + if (q.flags.q.prim_type) { + /* Make sure this is a valid output primitive type. */ + switch (q.prim_type) { + case GL_POINTS: + case GL_LINE_STRIP: + case GL_TRIANGLE_STRIP: + break; + default: + _mesa_glsl_error(loc, state, "invalid geometry shader output " + "primitive type"); + break; + } + } + + /* Allow future assigments of global out's stream id value */ + this->flags.q.explicit_stream = 0; + } else if (state->stage == MESA_SHADER_TESS_CTRL) { + if (create_node) { + node = new(mem_ctx) ast_tcs_output_layout(*loc); + } + } else { + _mesa_glsl_error(loc, state, "out layout qualifiers only valid in " + "tessellation control or geometry shaders"); } return r; @@ -310,7 +336,7 @@ bool ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc, _mesa_glsl_parse_state *state, const ast_type_qualifier &q, - ast_node* &node) + ast_node* &node, bool create_node) { void *mem_ctx = state; bool create_gs_ast = false; @@ -450,10 +476,12 @@ ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc, this->point_mode = q.point_mode; } - if (create_gs_ast) { - node = new(mem_ctx) ast_gs_input_layout(*loc, q.prim_type); - } else if (create_cs_ast) { - node = new(mem_ctx) ast_cs_input_layout(*loc, q.local_size); + if (create_node) { + if (create_gs_ast) { + node = new(mem_ctx) ast_gs_input_layout(*loc, q.prim_type); + } else if (create_cs_ast) { + node = new(mem_ctx) ast_cs_input_layout(*loc, q.local_size); + } } return true; diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy index 51796a65df9..10198758944 100644 --- a/src/glsl/glsl_parser.yy +++ b/src/glsl/glsl_parser.yy @@ -299,6 +299,10 @@ static bool match_layout_qualifier(const char *s1, const char *s2, %type for_init_statement %type for_rest_statement %type layout_defaults +%type layout_uniform_defaults +%type layout_buffer_defaults +%type layout_in_defaults +%type layout_out_defaults %right THEN ELSE %% @@ -953,7 +957,7 @@ parameter_qualifier: "or precise"); $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } | precision_qualifier parameter_qualifier { @@ -970,7 +974,7 @@ parameter_qualifier: | 
memory_qualifier parameter_qualifier { $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } parameter_direction_qualifier: @@ -1149,7 +1153,7 @@ layout_qualifier_id_list: | layout_qualifier_id_list ',' layout_qualifier_id { $$ = $1; - if (!$$.merge_qualifier(& @3, state, $3)) { + if (!$$.merge_qualifier(& @3, state, $3, true)) { YYERROR; } } @@ -1758,7 +1762,7 @@ type_qualifier: } $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } | layout_qualifier type_qualifier { @@ -1775,12 +1779,12 @@ type_qualifier: _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers"); $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } | subroutine_qualifier type_qualifier { $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } | auxiliary_storage_qualifier type_qualifier { @@ -1796,7 +1800,7 @@ type_qualifier: "just before storage qualifiers"); } $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } | storage_qualifier type_qualifier { @@ -1816,7 +1820,7 @@ type_qualifier: } $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } | precision_qualifier type_qualifier { @@ -1833,7 +1837,7 @@ type_qualifier: | memory_qualifier type_qualifier { $$ = $1; - $$.merge_qualifier(&@1, state, $2); + $$.merge_qualifier(&@1, state, $2, false); } ; @@ -2585,7 +2589,7 @@ interface_block: YYERROR; } - if (!block->layout.merge_qualifier(& @1, state, $1)) { + if (!block->layout.merge_qualifier(& @1, state, $1, false)) { YYERROR; } @@ -2602,7 +2606,7 @@ interface_block: "memory qualifiers can only be used in the " "declaration of shader storage blocks"); } - if (!block->layout.merge_qualifier(& @1, state, $1)) { + if (!block->layout.merge_qualifier(& @1, state, $1, false)) { YYERROR; } $$ = block; @@ -2737,18 +2741,48 @@ member_declaration: } ; -layout_defaults: - layout_qualifier UNIFORM ';' +layout_uniform_defaults: + layout_qualifier layout_uniform_defaults { - if (!state->default_uniform_qualifier->merge_qualifier(& @1, state, $1)) { + $$ = NULL; + if (!state->has_420pack_or_es31()) { + _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers"); + YYERROR; + } else { + if (!state->default_uniform_qualifier-> + merge_qualifier(& @1, state, $1, false)) { + YYERROR; + } + } + } + | layout_qualifier UNIFORM ';' + { + if (!state->default_uniform_qualifier-> + merge_qualifier(& @1, state, $1, false)) { YYERROR; } $$ = NULL; } + ; +layout_buffer_defaults: + layout_qualifier layout_buffer_defaults + { + $$ = NULL; + if (!state->has_420pack_or_es31()) { + _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers"); + YYERROR; + } else { + if (!state->default_shader_storage_qualifier-> + merge_qualifier(& @1, state, $1, false)) { + YYERROR; + } + } + } | layout_qualifier BUFFER ';' { - if (!state->default_shader_storage_qualifier->merge_qualifier(& @1, state, $1)) { + if (!state->default_shader_storage_qualifier-> + merge_qualifier(& @1, state, $1, false)) { YYERROR; } @@ -2764,43 +2798,58 @@ layout_defaults: $$ = NULL; } + ; +layout_in_defaults: + layout_qualifier layout_in_defaults + { + $$ = NULL; + if (!state->has_420pack_or_es31()) { + _mesa_glsl_error(&@1, state, "duplicate layout(...) 
qualifiers"); + YYERROR; + } else { + if (!state->in_qualifier-> + merge_in_qualifier(& @1, state, $1, $$, false)) { + YYERROR; + } + } + } | layout_qualifier IN_TOK ';' { $$ = NULL; - if (!state->in_qualifier->merge_in_qualifier(& @1, state, $1, $$)) { + if (!state->in_qualifier-> + merge_in_qualifier(& @1, state, $1, $$, true)) { YYERROR; } } + ; +layout_out_defaults: + layout_qualifier layout_out_defaults + { + $$ = NULL; + if (!state->has_420pack_or_es31()) { + _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers"); + YYERROR; + } else { + if (!state->out_qualifier-> + merge_out_qualifier(& @1, state, $1, $$, false)) { + YYERROR; + } + } + } | layout_qualifier OUT_TOK ';' { $$ = NULL; - if (state->stage == MESA_SHADER_GEOMETRY) { - if ($1.flags.q.prim_type) { - /* Make sure this is a valid output primitive type. */ - switch ($1.prim_type) { - case GL_POINTS: - case GL_LINE_STRIP: - case GL_TRIANGLE_STRIP: - break; - default: - _mesa_glsl_error(&@1, state, "invalid geometry shader output " - "primitive type"); - break; - } - } - if (!state->out_qualifier->merge_qualifier(& @1, state, $1)) - YYERROR; - - /* Allow future assigments of global out's stream id value */ - state->out_qualifier->flags.q.explicit_stream = 0; - } else if (state->stage == MESA_SHADER_TESS_CTRL) { - if (!state->out_qualifier->merge_out_qualifier(& @1, state, $1, $$)) - YYERROR; - } else { - _mesa_glsl_error(& @1, state, - "out layout qualifiers only valid in " - "tessellation control or geometry shaders"); - } + if (!state->out_qualifier-> + merge_out_qualifier(& @1, state, $1, $$, true)) + YYERROR; } + ; + +layout_defaults: + layout_uniform_defaults + | layout_buffer_defaults + | layout_in_defaults + | layout_out_defaults + ; diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp index b424edd8e96..db1947453ea 100644 --- a/src/glsl/ir.cpp +++ b/src/glsl/ir.cpp @@ -298,8 +298,6 @@ ir_expression::ir_expression(int op, ir_rvalue *op0) break; case ir_unop_noise: - case ir_unop_unpack_half_2x16_split_x: - case ir_unop_unpack_half_2x16_split_y: this->type = glsl_type::float_type; break; @@ -422,10 +420,6 @@ ir_expression::ir_expression(int op, ir_rvalue *op0, ir_rvalue *op1) this->type = op0->type->get_base_type(); break; - case ir_binop_pack_half_2x16_split: - this->type = glsl_type::uint_type; - break; - case ir_binop_imul_high: case ir_binop_carry: case ir_binop_borrow: @@ -555,8 +549,6 @@ static const char *const operator_strs[] = { "unpackUnorm2x16", "unpackUnorm4x8", "unpackHalf2x16", - "unpackHalf2x16_split_x", - "unpackHalf2x16_split_y", "bitfield_reverse", "bit_count", "find_msb", @@ -599,7 +591,6 @@ static const char *const operator_strs[] = { "min", "max", "pow", - "packHalf2x16_split", "ubo_load", "ldexp", "vector_extract", diff --git a/src/glsl/ir.h b/src/glsl/ir.h index 5b845c6e856..b453187c32a 100644 --- a/src/glsl/ir.h +++ b/src/glsl/ir.h @@ -1401,16 +1401,6 @@ enum ir_expression_operation { ir_unop_unpack_half_2x16, /*@}*/ - /** - * \name Lowered floating point unpacking operations. - * - * \see lower_packing_builtins_visitor::split_unpack_half_2x16 - */ - /*@{*/ - ir_unop_unpack_half_2x16_split_x, - ir_unop_unpack_half_2x16_split_y, - /*@}*/ - /** * \name Bit operations, part of ARB_gpu_shader5. */ @@ -1541,15 +1531,6 @@ enum ir_expression_operation { ir_binop_pow, - /** - * \name Lowered floating point packing operations. 
- * - * \see lower_packing_builtins_visitor::split_pack_half_2x16 - */ - /*@{*/ - ir_binop_pack_half_2x16_split, - /*@}*/ - /** * Load a value the size of a given GLSL type from a uniform block. * diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h index be86f547f77..b56413a1500 100644 --- a/src/glsl/ir_optimization.h +++ b/src/glsl/ir_optimization.h @@ -58,17 +58,14 @@ enum lower_packing_builtins_op { LOWER_PACK_HALF_2x16 = 0x0010, LOWER_UNPACK_HALF_2x16 = 0x0020, - LOWER_PACK_HALF_2x16_TO_SPLIT = 0x0040, - LOWER_UNPACK_HALF_2x16_TO_SPLIT = 0x0080, + LOWER_PACK_SNORM_4x8 = 0x0040, + LOWER_UNPACK_SNORM_4x8 = 0x0080, - LOWER_PACK_SNORM_4x8 = 0x0100, - LOWER_UNPACK_SNORM_4x8 = 0x0200, + LOWER_PACK_UNORM_4x8 = 0x0100, + LOWER_UNPACK_UNORM_4x8 = 0x0200, - LOWER_PACK_UNORM_4x8 = 0x0400, - LOWER_UNPACK_UNORM_4x8 = 0x0800, - - LOWER_PACK_USE_BFI = 0x1000, - LOWER_PACK_USE_BFE = 0x2000, + LOWER_PACK_USE_BFI = 0x0400, + LOWER_PACK_USE_BFE = 0x0800, }; bool do_common_optimization(exec_list *ir, bool linked, diff --git a/src/glsl/ir_validate.cpp b/src/glsl/ir_validate.cpp index 94814799b9b..12928836597 100644 --- a/src/glsl/ir_validate.cpp +++ b/src/glsl/ir_validate.cpp @@ -372,12 +372,6 @@ ir_validate::visit_leave(ir_expression *ir) assert(ir->operands[0]->type == glsl_type::uint_type); break; - case ir_unop_unpack_half_2x16_split_x: - case ir_unop_unpack_half_2x16_split_y: - assert(ir->type == glsl_type::float_type); - assert(ir->operands[0]->type == glsl_type::uint_type); - break; - case ir_unop_unpack_double_2x32: assert(ir->type == glsl_type::uvec2_type); assert(ir->operands[0]->type == glsl_type::double_type); @@ -567,12 +561,6 @@ ir_validate::visit_leave(ir_expression *ir) assert(ir->operands[0]->type == ir->operands[1]->type); break; - case ir_binop_pack_half_2x16_split: - assert(ir->type == glsl_type::uint_type); - assert(ir->operands[0]->type == glsl_type::float_type); - assert(ir->operands[1]->type == glsl_type::float_type); - break; - case ir_binop_ubo_load: assert(ir->operands[0]->type == glsl_type::uint_type); diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp index 7cc58800765..09f80d0f39d 100644 --- a/src/glsl/link_varyings.cpp +++ b/src/glsl/link_varyings.cpp @@ -968,10 +968,12 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var) } if ((consumer_var == NULL && producer_var->type->contains_integer()) || - consumer_stage != MESA_SHADER_FRAGMENT) { + (consumer_stage != -1 && consumer_stage != MESA_SHADER_FRAGMENT)) { /* Since this varying is not being consumed by the fragment shader, its - * interpolation type varying cannot possibly affect rendering. Also, - * this variable is non-flat and is (or contains) an integer. + * interpolation type varying cannot possibly affect rendering. + * Also, this variable is non-flat and is (or contains) an integer. + * If the consumer stage is unknown, don't modify the interpolation + * type as it could affect rendering later with separate shaders. * * lower_packed_varyings requires all integer varyings to flat, * regardless of where they appear. We can trivially satisfy that diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index 564c4712871..6657777d74c 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -992,7 +992,17 @@ cross_validate_globals(struct gl_shader_program *prog, existing->data.location = var->data.location; existing->data.explicit_location = true; - } + } else { + /* Check if uniform with implicit location was marked explicit + * by earlier shader stage. 
If so, mark it explicit in this stage + * too to make sure later processing does not treat it as + * implicit one. + */ + if (existing->data.explicit_location) { + var->data.location = existing->data.location; + var->data.explicit_location = true; + } + } /* From the GLSL 4.20 specification: * "A link error will result if two compilation units in a program @@ -3152,7 +3162,7 @@ check_explicit_uniform_locations(struct gl_context *ctx, if (var->data.explicit_location) { bool ret; - if (var->type->is_subroutine()) + if (var->type->without_array()->is_subroutine()) ret = reserve_subroutine_explicit_locations(prog, sh, var); else ret = reserve_explicit_locations(prog, uniform_map, var); diff --git a/src/glsl/lower_packing_builtins.cpp b/src/glsl/lower_packing_builtins.cpp index 7f18238bc6e..a41627bd561 100644 --- a/src/glsl/lower_packing_builtins.cpp +++ b/src/glsl/lower_packing_builtins.cpp @@ -43,13 +43,6 @@ public: : op_mask(op_mask), progress(false) { - /* Mutually exclusive options. */ - assert(!((op_mask & LOWER_PACK_HALF_2x16) && - (op_mask & LOWER_PACK_HALF_2x16_TO_SPLIT))); - - assert(!((op_mask & LOWER_UNPACK_HALF_2x16) && - (op_mask & LOWER_UNPACK_HALF_2x16_TO_SPLIT))); - factory.instructions = &factory_instructions; } @@ -96,9 +89,6 @@ public: case LOWER_PACK_HALF_2x16: *rvalue = lower_pack_half_2x16(op0); break; - case LOWER_PACK_HALF_2x16_TO_SPLIT: - *rvalue = split_pack_half_2x16(op0); - break; case LOWER_UNPACK_SNORM_2x16: *rvalue = lower_unpack_snorm_2x16(op0); break; @@ -114,9 +104,6 @@ public: case LOWER_UNPACK_HALF_2x16: *rvalue = lower_unpack_half_2x16(op0); break; - case LOWER_UNPACK_HALF_2x16_TO_SPLIT: - *rvalue = split_unpack_half_2x16(op0); - break; case LOWER_PACK_UNPACK_NONE: case LOWER_PACK_USE_BFI: case LOWER_PACK_USE_BFE: @@ -161,7 +148,7 @@ private: result = op_mask & LOWER_PACK_UNORM_4x8; break; case ir_unop_pack_half_2x16: - result = op_mask & (LOWER_PACK_HALF_2x16 | LOWER_PACK_HALF_2x16_TO_SPLIT); + result = op_mask & LOWER_PACK_HALF_2x16; break; case ir_unop_unpack_snorm_2x16: result = op_mask & LOWER_UNPACK_SNORM_2x16; @@ -176,7 +163,7 @@ private: result = op_mask & LOWER_UNPACK_UNORM_4x8; break; case ir_unop_unpack_half_2x16: - result = op_mask & (LOWER_UNPACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16_TO_SPLIT); + result = op_mask & LOWER_UNPACK_HALF_2x16; break; default: result = LOWER_PACK_UNPACK_NONE; @@ -1092,41 +1079,6 @@ private: return result; } - /** - * \brief Split packHalf2x16's vec2 operand into two floats. - * - * \param vec2_rval is packHalf2x16's input - * \return a uint rvalue - * - * Some code generators, such as the i965 fragment shader, require that all - * vector expressions be lowered to a sequence of scalar expressions. - * However, packHalf2x16 cannot be scalarized by the same mechanism as - * a true vector operation because its input and output have a differing - * number of vector components. - * - * This method scalarizes packHalf2x16 by transforming it from an unary - * operation having vector input to a binary operation having scalar input. 
- * That is, it transforms - * - * packHalf2x16(VEC2_RVAL); - * - * into - * - * vec2 v = VEC2_RVAL; - * return packHalf2x16_split(v.x, v.y); - */ - ir_rvalue* - split_pack_half_2x16(ir_rvalue *vec2_rval) - { - assert(vec2_rval->type == glsl_type::vec2_type); - - ir_variable *v = factory.make_temp(glsl_type::vec2_type, - "tmp_split_pack_half_2x16_v"); - factory.emit(assign(v, vec2_rval)); - - return expr(ir_binop_pack_half_2x16_split, swizzle_x(v), swizzle_y(v)); - } - /** * \brief Lower the component-wise calculation of unpackHalf2x16. * @@ -1341,59 +1293,6 @@ private: assert(result->type == glsl_type::vec2_type); return result; } - - /** - * \brief Split unpackHalf2x16 into two operations. - * - * \param uint_rval is unpackHalf2x16's input - * \return a vec2 rvalue - * - * Some code generators, such as the i965 fragment shader, require that all - * vector expressions be lowered to a sequence of scalar expressions. - * However, unpackHalf2x16 cannot be scalarized by the same method as - * a true vector operation because the number of components of its input - * and output differ. - * - * This method scalarizes unpackHalf2x16 by transforming it from a single - * operation having vec2 output to a pair of operations each having float - * output. That is, it transforms - * - * unpackHalf2x16(UINT_RVAL) - * - * into - * - * uint u = UINT_RVAL; - * vec2 v; - * - * v.x = unpackHalf2x16_split_x(u); - * v.y = unpackHalf2x16_split_y(u); - * - * return v; - */ - ir_rvalue* - split_unpack_half_2x16(ir_rvalue *uint_rval) - { - assert(uint_rval->type == glsl_type::uint_type); - - /* uint u = uint_rval; */ - ir_variable *u = factory.make_temp(glsl_type::uint_type, - "tmp_split_unpack_half_2x16_u"); - factory.emit(assign(u, uint_rval)); - - /* vec2 v; */ - ir_variable *v = factory.make_temp(glsl_type::vec2_type, - "tmp_split_unpack_half_2x16_v"); - - /* v.x = unpack_half_2x16_split_x(u); */ - factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_x, u), - WRITEMASK_X)); - - /* v.y = unpack_half_2x16_split_y(u); */ - factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_y, u), - WRITEMASK_Y)); - - return deref(v).val; - } }; } // namespace anonymous diff --git a/src/glsl/lower_subroutine.cpp b/src/glsl/lower_subroutine.cpp index a0df5e1df81..ac8ade13d99 100644 --- a/src/glsl/lower_subroutine.cpp +++ b/src/glsl/lower_subroutine.cpp @@ -44,6 +44,7 @@ public: } ir_visitor_status visit_leave(ir_call *); + ir_call *call_clone(ir_call *call, ir_function_signature *callee); bool progress; struct _mesa_glsl_parse_state *state; }; @@ -58,6 +59,23 @@ lower_subroutine(exec_list *instructions, struct _mesa_glsl_parse_state *state) return v.progress; } +ir_call * +lower_subroutine_visitor::call_clone(ir_call *call, ir_function_signature *callee) +{ + void *mem_ctx = ralloc_parent(call); + ir_dereference_variable *new_return_ref = NULL; + if (call->return_deref != NULL) + new_return_ref = call->return_deref->clone(mem_ctx, NULL); + + exec_list new_parameters; + + foreach_in_list(ir_instruction, ir, &call->actual_parameters) { + new_parameters.push_tail(ir->clone(mem_ctx, NULL)); + } + + return new(mem_ctx) ir_call(callee, new_return_ref, &new_parameters); +} + ir_visitor_status lower_subroutine_visitor::visit_leave(ir_call *ir) { @@ -66,7 +84,6 @@ lower_subroutine_visitor::visit_leave(ir_call *ir) void *mem_ctx = ralloc_parent(ir); ir_if *last_branch = NULL; - ir_dereference_variable *return_deref = ir->return_deref; for (int s = this->state->num_subroutines - 1; s >= 0; s--) { ir_rvalue *var; @@ -92,14 
+109,11 @@ lower_subroutine_visitor::visit_leave(ir_call *ir) fn->exact_matching_signature(this->state, &ir->actual_parameters); - ir_call *new_call = new(mem_ctx) ir_call(sub_sig, return_deref, &ir->actual_parameters); + ir_call *new_call = call_clone(ir, sub_sig); if (!last_branch) last_branch = if_tree(equal(subr_to_int(var), lc), new_call); else last_branch = if_tree(equal(subr_to_int(var), lc), new_call, last_branch); - - if (return_deref && s > 0) - return_deref = return_deref->clone(mem_ctx, NULL); } if (last_branch) ir->insert_before(last_branch); diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp index 5a1bbc43243..f29377cc260 100644 --- a/src/glsl/nir/glsl_to_nir.cpp +++ b/src/glsl/nir/glsl_to_nir.cpp @@ -1442,12 +1442,6 @@ nir_visitor::visit(ir_expression *ir) case ir_unop_unpack_half_2x16: result = nir_unpack_half_2x16(&b, srcs[0]); break; - case ir_unop_unpack_half_2x16_split_x: - result = nir_unpack_half_2x16_split_x(&b, srcs[0]); - break; - case ir_unop_unpack_half_2x16_split_y: - result = nir_unpack_half_2x16_split_y(&b, srcs[0]); - break; case ir_unop_bitfield_reverse: result = nir_bitfield_reverse(&b, srcs[0]); break; @@ -1731,9 +1725,6 @@ nir_visitor::visit(ir_expression *ir) } break; - case ir_binop_pack_half_2x16_split: - result = nir_pack_half_2x16_split(&b, srcs[0], srcs[1]); - break; case ir_binop_ldexp: result = nir_ldexp(&b, srcs[0], srcs[1]); break; case ir_triop_fma: result = nir_ffma(&b, srcs[0], srcs[1], srcs[2]); diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h index 4e3533189e4..ec6595b091d 100644 --- a/src/glsl/nir/nir.h +++ b/src/glsl/nir/nir.h @@ -140,7 +140,7 @@ typedef enum { * ir_variable - it should be easy to translate between the two. */ -typedef struct { +typedef struct nir_variable { struct exec_node node; /** @@ -383,7 +383,7 @@ nir_variable_get_io_mask(nir_variable *var, gl_shader_stage stage) return ((1ull << slots) - 1) << var->data.location; } -typedef struct { +typedef struct nir_register { struct exec_node node; unsigned num_components; /** < number of vector components */ @@ -477,7 +477,7 @@ nir_instr_is_last(nir_instr *instr) return exec_node_is_tail_sentinel(exec_node_get_next(&instr->node)); } -typedef struct { +typedef struct nir_ssa_def { /** for debugging only, can be NULL */ const char* name; @@ -1530,6 +1530,20 @@ typedef struct nir_shader_compiler_options { /** lowers ffract to fsub+ffloor: */ bool lower_ffract; + bool lower_pack_half_2x16; + bool lower_pack_unorm_2x16; + bool lower_pack_snorm_2x16; + bool lower_pack_unorm_4x8; + bool lower_pack_snorm_4x8; + bool lower_unpack_half_2x16; + bool lower_unpack_unorm_2x16; + bool lower_unpack_snorm_2x16; + bool lower_unpack_unorm_4x8; + bool lower_unpack_snorm_4x8; + + bool lower_extract_byte; + bool lower_extract_word; + /** * Does the driver support real 32-bit integers? (Otherwise, integers * are simulated by floats.) 
diff --git a/src/glsl/nir/nir_builder.h b/src/glsl/nir/nir_builder.h index e842b2252ff..1c7c78acae8 100644 --- a/src/glsl/nir/nir_builder.h +++ b/src/glsl/nir/nir_builder.h @@ -134,6 +134,20 @@ nir_imm_int(nir_builder *build, int x) return nir_build_imm(build, 1, v); } +static inline nir_ssa_def * +nir_imm_ivec4(nir_builder *build, int x, int y, int z, int w) +{ + nir_const_value v; + + memset(&v, 0, sizeof(v)); + v.i[0] = x; + v.i[1] = y; + v.i[2] = z; + v.i[3] = w; + + return nir_build_imm(build, 4, v); +} + static inline nir_ssa_def * nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0, nir_ssa_def *src1, nir_ssa_def *src2, nir_ssa_def *src3) diff --git a/src/glsl/nir/nir_lower_alu_to_scalar.c b/src/glsl/nir/nir_lower_alu_to_scalar.c index 0a27e66cf0f..37cb0221e0b 100644 --- a/src/glsl/nir/nir_lower_alu_to_scalar.c +++ b/src/glsl/nir/nir_lower_alu_to_scalar.c @@ -97,6 +97,20 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b) */ return; + case nir_op_pack_half_2x16: + if (!b->shader->options->lower_pack_half_2x16) + return; + + nir_ssa_def *val = + nir_pack_half_2x16_split(b, nir_channel(b, instr->src[0].src.ssa, + instr->src[0].swizzle[0]), + nir_channel(b, instr->src[0].src.ssa, + instr->src[0].swizzle[1])); + + nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(val)); + nir_instr_remove(&instr->instr); + return; + case nir_op_unpack_unorm_4x8: case nir_op_unpack_snorm_4x8: case nir_op_unpack_unorm_2x16: @@ -106,11 +120,51 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b) */ return; - case nir_op_unpack_half_2x16: - /* We could split this into unpack_half_2x16_split_[xy], but should - * we? - */ + case nir_op_unpack_half_2x16: { + if (!b->shader->options->lower_unpack_half_2x16) + return; + + nir_ssa_def *comps[2]; + comps[0] = nir_unpack_half_2x16_split_x(b, instr->src[0].src.ssa); + comps[1] = nir_unpack_half_2x16_split_y(b, instr->src[0].src.ssa); + nir_ssa_def *vec = nir_vec(b, comps, 2); + + nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(vec)); + nir_instr_remove(&instr->instr); return; + } + + case nir_op_pack_uvec2_to_uint: { + assert(b->shader->options->lower_pack_snorm_2x16 || + b->shader->options->lower_pack_unorm_2x16); + + nir_ssa_def *word = + nir_extract_uword(b, instr->src[0].src.ssa, nir_imm_int(b, 0)); + nir_ssa_def *val = + nir_ior(b, nir_ishl(b, nir_channel(b, word, 1), nir_imm_int(b, 16)), + nir_channel(b, word, 0)); + + nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(val)); + nir_instr_remove(&instr->instr); + break; + } + + case nir_op_pack_uvec4_to_uint: { + assert(b->shader->options->lower_pack_snorm_4x8 || + b->shader->options->lower_pack_unorm_4x8); + + nir_ssa_def *byte = + nir_extract_ubyte(b, instr->src[0].src.ssa, nir_imm_int(b, 0)); + nir_ssa_def *val = + nir_ior(b, nir_ior(b, nir_ishl(b, nir_channel(b, byte, 3), nir_imm_int(b, 24)), + nir_ishl(b, nir_channel(b, byte, 2), nir_imm_int(b, 16))), + nir_ior(b, nir_ishl(b, nir_channel(b, byte, 1), nir_imm_int(b, 8)), + nir_channel(b, byte, 0))); + + nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(val)); + nir_instr_remove(&instr->instr); + break; + } case nir_op_fdph: { nir_ssa_def *sum[4]; diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py index c5fb0420bb6..0eff89783dd 100644 --- a/src/glsl/nir/nir_opcodes.py +++ b/src/glsl/nir/nir_opcodes.py @@ -105,7 +105,7 @@ def opcode(name, output_size, output_type, input_sizes, input_types, opcodes[name] = Opcode(name, output_size, output_type, 
input_sizes, input_types, algebraic_properties, const_expr) -def unop_convert(name, in_type, out_type, const_expr): +def unop_convert(name, out_type, in_type, const_expr): opcode(name, 0, out_type, [0], [in_type], "", const_expr) def unop(name, ty, const_expr): @@ -155,17 +155,17 @@ unop("frsq", tfloat, "1.0f / sqrtf(src0)") unop("fsqrt", tfloat, "sqrtf(src0)") unop("fexp2", tfloat, "exp2f(src0)") unop("flog2", tfloat, "log2f(src0)") -unop_convert("f2i", tfloat, tint, "src0") # Float-to-integer conversion. -unop_convert("f2u", tfloat, tuint, "src0") # Float-to-unsigned conversion -unop_convert("i2f", tint, tfloat, "src0") # Integer-to-float conversion. +unop_convert("f2i", tint, tfloat, "src0") # Float-to-integer conversion. +unop_convert("f2u", tuint, tfloat, "src0") # Float-to-unsigned conversion +unop_convert("i2f", tfloat, tint, "src0") # Integer-to-float conversion. # Float-to-boolean conversion -unop_convert("f2b", tfloat, tbool, "src0 != 0.0f") +unop_convert("f2b", tbool, tfloat, "src0 != 0.0f") # Boolean-to-float conversion -unop_convert("b2f", tbool, tfloat, "src0 ? 1.0f : 0.0f") +unop_convert("b2f", tfloat, tbool, "src0 ? 1.0f : 0.0f") # Int-to-boolean conversion -unop_convert("i2b", tint, tbool, "src0 != 0") -unop_convert("b2i", tbool, tint, "src0 ? 1 : 0") # Boolean-to-int conversion -unop_convert("u2f", tuint, tfloat, "src0") # Unsigned-to-float conversion. +unop_convert("i2b", tbool, tint, "src0 != 0") +unop_convert("b2i", tint, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion +unop_convert("u2f", tfloat, tuint, "src0") # Unsigned-to-float conversion. # Unary floating-point rounding operations. @@ -238,6 +238,16 @@ unpack_2x16("unorm") unpack_4x8("unorm") unpack_2x16("half") +unop_horiz("pack_uvec2_to_uint", 0, tuint, 2, tuint, """ +dst = (src0.x & 0xffff) | (src0.y >> 16); +""") + +unop_horiz("pack_uvec4_to_uint", 0, tuint, 4, tuint, """ +dst = (src0.x << 0) | + (src0.y << 8) | + (src0.z << 16) | + (src0.w << 24); +""") # Lowered floating point unpacking operations. 
@@ -265,7 +275,7 @@ for (unsigned bit = 0; bit < 32; bit++) { } """) -unop_convert("ufind_msb", tuint, tint, """ +unop_convert("ufind_msb", tint, tuint, """ dst = -1; for (int bit = 31; bit > 0; bit--) { if ((src0 >> bit) & 1) { @@ -551,6 +561,15 @@ dst.x = src0.x; dst.y = src1.x; """) +# Byte extraction +binop("extract_ubyte", tuint, "", "(uint8_t)(src0 >> (src1 * 8))") +binop("extract_ibyte", tint, "", "(int8_t)(src0 >> (src1 * 8))") + +# Word extraction +binop("extract_uword", tuint, "", "(uint16_t)(src0 >> (src1 * 16))") +binop("extract_iword", tint, "", "(int16_t)(src0 >> (src1 * 16))") + + def triop(name, ty, const_expr): opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr) def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr): diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py index a46cbf711ac..190e4b7b43b 100644 --- a/src/glsl/nir/nir_opt_algebraic.py +++ b/src/glsl/nir/nir_opt_algebraic.py @@ -245,6 +245,70 @@ optimizations = [ ('bcsel', ('ult', 31, 'bits'), 'value', ('ubfe', 'value', 'offset', 'bits')), 'options->lower_bitfield_extract'), + + (('extract_ibyte', a, b), + ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 8), + 'options->lower_extract_byte'), + + (('extract_ubyte', a, b), + ('iand', ('ushr', a, ('imul', b, 8)), 0xff), + 'options->lower_extract_byte'), + + (('extract_iword', a, b), + ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16), + 'options->lower_extract_word'), + + (('extract_uword', a, b), + ('iand', ('ushr', a, ('imul', b, 16)), 0xffff), + 'options->lower_extract_word'), + + (('pack_unorm_2x16', 'v'), + ('pack_uvec2_to_uint', + ('f2u', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))), + 'options->lower_pack_unorm_2x16'), + + (('pack_unorm_4x8', 'v'), + ('pack_uvec4_to_uint', + ('f2u', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))), + 'options->lower_pack_unorm_4x8'), + + (('pack_snorm_2x16', 'v'), + ('pack_uvec2_to_uint', + ('f2i', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))), + 'options->lower_pack_snorm_2x16'), + + (('pack_snorm_4x8', 'v'), + ('pack_uvec4_to_uint', + ('f2i', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))), + 'options->lower_pack_snorm_4x8'), + + (('unpack_unorm_2x16', 'v'), + ('fdiv', ('u2f', ('vec4', ('extract_uword', 'v', 0), + ('extract_uword', 'v', 1), 0, 0)), + 65535.0), + 'options->lower_unpack_unorm_2x16'), + + (('unpack_unorm_4x8', 'v'), + ('fdiv', ('u2f', ('vec4', ('extract_ubyte', 'v', 0), + ('extract_ubyte', 'v', 1), + ('extract_ubyte', 'v', 2), + ('extract_ubyte', 'v', 3))), + 255.0), + 'options->lower_unpack_unorm_4x8'), + + (('unpack_snorm_2x16', 'v'), + ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_iword', 'v', 0), + ('extract_iword', 'v', 1), 0, 0)), + 32767.0))), + 'options->lower_unpack_snorm_2x16'), + + (('unpack_snorm_4x8', 'v'), + ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_ibyte', 'v', 0), + ('extract_ibyte', 'v', 1), + ('extract_ibyte', 'v', 2), + ('extract_ibyte', 'v', 3))), + 127.0))), + 'options->lower_unpack_snorm_4x8'), ] # Add optimizations to handle the case where the result of a ternary is diff --git a/src/glsl/nir/nir_print.c b/src/glsl/nir/nir_print.c index 850774b1099..a137706b15b 100644 --- a/src/glsl/nir/nir_print.c +++ b/src/glsl/nir/nir_print.c @@ -487,7 +487,7 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state) if (i != 0) fprintf(fp, ", "); - fprintf(fp, "%u", instr->const_index[i]); + fprintf(fp, "%d", 
instr->const_index[i]); } fprintf(fp, ")"); diff --git a/src/glsl/nir/shader_enums.c b/src/glsl/nir/shader_enums.c index 1410a504484..41da4a7b9ea 100644 --- a/src/glsl/nir/shader_enums.c +++ b/src/glsl/nir/shader_enums.c @@ -33,7 +33,8 @@ #define ENUM(x) [x] = #x #define NAME(val) ((((val) < ARRAY_SIZE(names)) && names[(val)]) ? names[(val)] : "UNKNOWN") -const char * gl_shader_stage_name(gl_shader_stage stage) +const char * +gl_shader_stage_name(gl_shader_stage stage) { static const char *names[] = { ENUM(MESA_SHADER_VERTEX), @@ -51,15 +52,16 @@ const char * gl_shader_stage_name(gl_shader_stage stage) * Translate a gl_shader_stage to a short shader stage name for debug * printouts and error messages. */ -const char * _mesa_shader_stage_to_string(unsigned stage) +const char * +_mesa_shader_stage_to_string(unsigned stage) { switch (stage) { case MESA_SHADER_VERTEX: return "vertex"; case MESA_SHADER_FRAGMENT: return "fragment"; case MESA_SHADER_GEOMETRY: return "geometry"; case MESA_SHADER_COMPUTE: return "compute"; - case MESA_SHADER_TESS_CTRL: return "tess ctrl"; - case MESA_SHADER_TESS_EVAL: return "tess eval"; + case MESA_SHADER_TESS_CTRL: return "tessellation control"; + case MESA_SHADER_TESS_EVAL: return "tessellation evaluation"; } unreachable("Unknown shader stage."); @@ -69,7 +71,8 @@ const char * _mesa_shader_stage_to_string(unsigned stage) * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS) * for debug printouts and error messages. */ -const char * _mesa_shader_stage_to_abbrev(unsigned stage) +const char * +_mesa_shader_stage_to_abbrev(unsigned stage) { switch (stage) { case MESA_SHADER_VERTEX: return "VS"; @@ -83,7 +86,8 @@ const char * _mesa_shader_stage_to_abbrev(unsigned stage) unreachable("Unknown shader stage."); } -const char * gl_vert_attrib_name(gl_vert_attrib attrib) +const char * +gl_vert_attrib_name(gl_vert_attrib attrib) { static const char *names[] = { ENUM(VERT_ATTRIB_POS), @@ -124,7 +128,8 @@ const char * gl_vert_attrib_name(gl_vert_attrib attrib) return NAME(attrib); } -const char * gl_varying_slot_name(gl_varying_slot slot) +const char * +gl_varying_slot_name(gl_varying_slot slot) { static const char *names[] = { ENUM(VARYING_SLOT_POS), @@ -190,7 +195,8 @@ const char * gl_varying_slot_name(gl_varying_slot slot) return NAME(slot); } -const char * gl_system_value_name(gl_system_value sysval) +const char * +gl_system_value_name(gl_system_value sysval) { static const char *names[] = { ENUM(SYSTEM_VALUE_VERTEX_ID), @@ -218,7 +224,8 @@ const char * gl_system_value_name(gl_system_value sysval) return NAME(sysval); } -const char * glsl_interp_qualifier_name(enum glsl_interp_qualifier qual) +const char * +glsl_interp_qualifier_name(enum glsl_interp_qualifier qual) { static const char *names[] = { ENUM(INTERP_QUALIFIER_NONE), @@ -230,7 +237,8 @@ const char * glsl_interp_qualifier_name(enum glsl_interp_qualifier qual) return NAME(qual); } -const char * gl_frag_result_name(gl_frag_result result) +const char * +gl_frag_result_name(gl_frag_result result) { static const char *names[] = { ENUM(FRAG_RESULT_DEPTH), diff --git a/src/glsl/nir/shader_enums.h b/src/glsl/nir/shader_enums.h index bc6ea3844b6..3a06b14a46b 100644 --- a/src/glsl/nir/shader_enums.h +++ b/src/glsl/nir/shader_enums.h @@ -47,19 +47,19 @@ typedef enum MESA_SHADER_COMPUTE = 5, } gl_shader_stage; -const char * gl_shader_stage_name(gl_shader_stage stage); +const char *gl_shader_stage_name(gl_shader_stage stage); /** * Translate a gl_shader_stage to a short shader stage name for debug * 
printouts and error messages. */ -const char * _mesa_shader_stage_to_string(unsigned stage); +const char *_mesa_shader_stage_to_string(unsigned stage); /** * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS) * for debug printouts and error messages. */ -const char * _mesa_shader_stage_to_abbrev(unsigned stage); +const char *_mesa_shader_stage_to_abbrev(unsigned stage); #define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1) @@ -109,7 +109,7 @@ typedef enum VERT_ATTRIB_MAX = 33 } gl_vert_attrib; -const char * gl_vert_attrib_name(gl_vert_attrib attrib); +const char *gl_vert_attrib_name(gl_vert_attrib attrib); /** * Symbolic constats to help iterating over @@ -254,7 +254,7 @@ typedef enum #define VARYING_SLOT_PATCH0 (VARYING_SLOT_MAX) #define VARYING_SLOT_TESS_MAX (VARYING_SLOT_PATCH0 + MAX_VARYING) -const char * gl_varying_slot_name(gl_varying_slot slot); +const char *gl_varying_slot_name(gl_varying_slot slot); /** * Bitflags for varying slots. @@ -467,7 +467,7 @@ typedef enum SYSTEM_VALUE_MAX /**< Number of values */ } gl_system_value; -const char * gl_system_value_name(gl_system_value sysval); +const char *gl_system_value_name(gl_system_value sysval); /** * The possible interpolation qualifiers that can be applied to a fragment @@ -485,7 +485,7 @@ enum glsl_interp_qualifier INTERP_QUALIFIER_COUNT /**< Number of interpolation qualifiers */ }; -const char * glsl_interp_qualifier_name(enum glsl_interp_qualifier qual); +const char *glsl_interp_qualifier_name(enum glsl_interp_qualifier qual); /** * Fragment program results @@ -516,7 +516,7 @@ typedef enum FRAG_RESULT_DATA7, } gl_frag_result; -const char * gl_frag_result_name(gl_frag_result result); +const char *gl_frag_result_name(gl_frag_result result); #define FRAG_RESULT_MAX (FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS) diff --git a/src/mapi/Makefile.am b/src/mapi/Makefile.am index 307e05d503f..68a28a2283c 100644 --- a/src/mapi/Makefile.am +++ b/src/mapi/Makefile.am @@ -35,6 +35,7 @@ EXTRA_DIST = \ es2api/ABI-check \ mapi_abi.py \ glapi/SConscript \ + glapi/registry/gl.xml \ shared-glapi/SConscript AM_CFLAGS = \ @@ -106,12 +107,16 @@ if HAVE_SPARC_ASM GLAPI_ASM_SOURCES = glapi/glapi_sparc.S endif -glapi_libglapi_la_SOURCES = glapi/glapi_gentable.c +glapi_libglapi_la_SOURCES = glapi_libglapi_la_CPPFLAGS = \ $(AM_CPPFLAGS) \ -I$(top_srcdir)/src/mapi/glapi \ -I$(top_srcdir)/src/mesa +if HAVE_APPLEDRI +glapi_libglapi_la_SOURCES += glapi/glapi_gentable.c +endif + if HAVE_SHARED_GLAPI glapi_libglapi_la_SOURCES += $(MAPI_BRIDGE_FILES) glapi/glapi_mapi_tmp.h glapi_libglapi_la_CPPFLAGS += \ diff --git a/src/mapi/glapi/gen/GREMEDY_string_marker.xml b/src/mapi/glapi/gen/GREMEDY_string_marker.xml new file mode 100644 index 00000000000..ffa3eac5898 --- /dev/null +++ b/src/mapi/glapi/gen/GREMEDY_string_marker.xml @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am index 900b61a5d45..cd7feabba24 100644 --- a/src/mapi/glapi/gen/Makefile.am +++ b/src/mapi/glapi/gen/Makefile.am @@ -27,8 +27,11 @@ MESA_GLAPI_OUTPUTS = \ $(MESA_GLAPI_DIR)/glapi_mapi_tmp.h \ $(MESA_GLAPI_DIR)/glprocs.h \ $(MESA_GLAPI_DIR)/glapitemp.h \ - $(MESA_GLAPI_DIR)/glapitable.h \ - $(MESA_GLAPI_DIR)/glapi_gentable.c + $(MESA_GLAPI_DIR)/glapitable.h + +if HAVE_APPLEDRI +MESA_GLAPI_OUTPUTS += $(MESA_GLAPI_DIR)/glapi_gentable.c +endif MESA_GLAPI_ASM_OUTPUTS = if HAVE_X86_ASM @@ -57,6 +60,7 @@ BUILT_SOURCES = \ $(MESA_GLX_DIR)/indirect_size.c EXTRA_DIST= \ $(BUILT_SOURCES) \ + 
$(MESA_GLAPI_DIR)/glapi_gentable.c \ $(MESA_GLAPI_DIR)/glapi_x86.S \ $(MESA_GLAPI_DIR)/glapi_x86-64.S \ $(MESA_GLAPI_DIR)/glapi_sparc.S \ @@ -88,8 +92,12 @@ XORG_GLAPI_DIR = $(XORG_BASE)/glx XORG_GLAPI_OUTPUTS = \ $(XORG_GLAPI_DIR)/glprocs.h \ $(XORG_GLAPI_DIR)/glapitable.h \ - $(XORG_GLAPI_DIR)/dispatch.h \ + $(XORG_GLAPI_DIR)/dispatch.h + +if HAVE_APPLEDRI +XORG_GLAPI_OUTPUTS += \ $(XORG_GLAPI_DIR)/glapi_gentable.c +endif XORG_OUTPUTS = \ $(XORG_GLAPI_OUTPUTS) \ @@ -188,6 +196,7 @@ API_XML = \ EXT_texture_array.xml \ EXT_texture_integer.xml \ EXT_transform_feedback.xml \ + GREMEDY_string_marker.xml \ INTEL_performance_query.xml \ KHR_debug.xml \ KHR_context_flush_control.xml \ diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml index 593ace49563..d7ab3bff4df 100644 --- a/src/mapi/glapi/gen/gl_API.xml +++ b/src/mapi/glapi/gen/gl_API.xml @@ -12620,6 +12620,8 @@ + + diff --git a/src/mapi/glapi/gen/gl_gentable.py b/src/mapi/glapi/gen/gl_gentable.py index 1b3eb72470d..7cd475aa2b8 100644 --- a/src/mapi/glapi/gen/gl_gentable.py +++ b/src/mapi/glapi/gen/gl_gentable.py @@ -113,6 +113,9 @@ __glapi_gentable_set_remaining_noop(struct _glapi_table *disp) { dispatch[i] = p.v; } +""" + +footer = """ struct _glapi_table * _glapi_create_table_from_handle(void *handle, const char *symbol_prefix) { struct _glapi_table *disp = calloc(_glapi_get_dispatch_table_size(), sizeof(_glapi_proc)); @@ -123,27 +126,28 @@ _glapi_create_table_from_handle(void *handle, const char *symbol_prefix) { if(symbol_prefix == NULL) symbol_prefix = ""; -""" -footer = """ - __glapi_gentable_set_remaining_noop(disp); + /* Note: This code relies on _glapi_table_func_names being sorted by the + * entry point index of each function. + */ + for (int func_index = 0; func_index < GLAPI_TABLE_COUNT; ++func_index) { + const char *name = _glapi_table_func_names[func_index]; + void ** procp = &((void **)disp)[func_index]; - return disp; -} -""" - -body_template = """ - if(!disp->%(name)s) { - void ** procp = (void **) &disp->%(name)s; - snprintf(symboln, sizeof(symboln), "%%s%(entry_point)s", symbol_prefix); + snprintf(symboln, sizeof(symboln), \"%s%s\", symbol_prefix, name); #ifdef _WIN32 *procp = GetProcAddress(handle, symboln); #else *procp = dlsym(handle, symboln); #endif } + __glapi_gentable_set_remaining_noop(disp); + + return disp; +} """ + class PrintCode(gl_XML.gl_print_base): def __init__(self): @@ -180,12 +184,33 @@ class PrintCode(gl_XML.gl_print_base): def printBody(self, api): - for f in api.functionIterateByOffset(): - for entry_point in f.entry_points: - vars = { 'entry_point' : entry_point, - 'name' : f.name } - print body_template % vars + # Determine how many functions have a defined offset. + func_count = 0 + for f in api.functions_by_name.itervalues(): + if f.offset != -1: + func_count += 1 + + # Build the mapping from offset to function name. + funcnames = [None] * func_count + for f in api.functions_by_name.itervalues(): + if f.offset != -1: + if not (funcnames[f.offset] is None): + raise Exception("Function table has more than one function with same offset (offset %d, func %s)" % (f.offset, f.name)) + funcnames[f.offset] = f.name + + # Check that the table has no gaps. We expect a function at every offset, + # and the code which generates the table relies on this. 
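
As a rough illustration of the pattern the generated code now follows (a dense, offset-indexed name table resolved in one loop, rather than one if/dlsym block per entry point), here is a minimal, self-contained C sketch. The names, table size and helper are invented for this example and are not the generated Mesa symbols.

#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for the generated _glapi_table_func_names[]:
 * index == dispatch offset, no gaps, no duplicate offsets. */
#define DEMO_TABLE_COUNT 3
static const char *const demo_func_names[DEMO_TABLE_COUNT] = {
   /* 0 */ "Foo",
   /* 1 */ "Bar",
   /* 2 */ "Baz",
};

/* Resolve every entry against a shared object in one loop, the same shape as
 * the new footer template; unresolved entries would then be filled with no-op
 * stubs, as __glapi_gentable_set_remaining_noop() does in the generated code. */
static void **
demo_create_table(void *handle, const char *symbol_prefix)
{
   void **disp = calloc(DEMO_TABLE_COUNT, sizeof(void *));
   char symboln[512];

   if (!disp)
      return NULL;

   for (int i = 0; i < DEMO_TABLE_COUNT; i++) {
      snprintf(symboln, sizeof(symboln), "%s%s", symbol_prefix,
               demo_func_names[i]);
      disp[i] = dlsym(handle, symboln);
   }
   return disp;
}

The duplicate-offset check above and the gap check in the loop that follows are what make indexing the table directly by dispatch offset safe.
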
+ for i in xrange(0, func_count): + if funcnames[i] is None: + raise Exception("Function table has no function at offset %d" % (i)) + + print "#define GLAPI_TABLE_COUNT %d" % func_count + print "static const char * const _glapi_table_func_names[GLAPI_TABLE_COUNT] = {" + for i in xrange(0, func_count): + print " /* %5d */ \"%s\"," % (i, funcnames[i]) + print "};" + return diff --git a/src/mapi/glapi/glapi.h b/src/mapi/glapi/glapi.h index f269b1701bc..3593c88bbc1 100644 --- a/src/mapi/glapi/glapi.h +++ b/src/mapi/glapi/glapi.h @@ -158,8 +158,10 @@ _GLAPI_EXPORT const char * _glapi_get_proc_name(unsigned int offset); +#ifdef GLX_USE_APPLEGL _GLAPI_EXPORT struct _glapi_table * _glapi_create_table_from_handle(void *handle, const char *symbol_prefix); +#endif _GLAPI_EXPORT void diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index 86777430a2e..5d69039d1af 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -1,6 +1,7 @@ i965_compiler_FILES = \ brw_cfg.cpp \ brw_cfg.h \ + brw_compiler.c \ brw_compiler.h \ brw_dead_control_flow.cpp \ brw_dead_control_flow.h \ @@ -72,7 +73,9 @@ i965_compiler_FILES = \ brw_vec4_surface_builder.cpp \ brw_vec4_surface_builder.h \ brw_vec4_tcs.cpp \ + brw_vec4_tcs.h \ brw_vec4_tes.cpp \ + brw_vec4_tes.h \ brw_vec4_visitor.cpp \ brw_vec4_vs_visitor.cpp \ brw_vue_map.c \ diff --git a/src/mesa/drivers/dri/i965/brw_binding_tables.c b/src/mesa/drivers/dri/i965/brw_binding_tables.c index 7fa5d602b96..f3a0310861c 100644 --- a/src/mesa/drivers/dri/i965/brw_binding_tables.c +++ b/src/mesa/drivers/dri/i965/brw_binding_tables.c @@ -365,7 +365,7 @@ gen7_disable_hw_binding_tables(struct brw_context *brw) /** * Enable hardware binding tables and set up the binding table pool. */ -static void +void gen7_enable_hw_binding_tables(struct brw_context *brw) { if (!brw->use_resource_streamer) diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c new file mode 100644 index 00000000000..f9e22d1d6b5 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_compiler.c @@ -0,0 +1,179 @@ +/* + * Copyright © 2015-2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_compiler.h" +#include "brw_context.h" +#include "glsl/nir/nir.h" +#include "main/errors.h" +#include "util/debug.h" + +static void +shader_debug_log_mesa(void *data, const char *fmt, ...) 
+{ + struct brw_context *brw = (struct brw_context *)data; + va_list args; + + va_start(args, fmt); + GLuint msg_id = 0; + _mesa_gl_vdebug(&brw->ctx, &msg_id, + MESA_DEBUG_SOURCE_SHADER_COMPILER, + MESA_DEBUG_TYPE_OTHER, + MESA_DEBUG_SEVERITY_NOTIFICATION, fmt, args); + va_end(args); +} + +static void +shader_perf_log_mesa(void *data, const char *fmt, ...) +{ + struct brw_context *brw = (struct brw_context *)data; + + va_list args; + va_start(args, fmt); + + if (unlikely(INTEL_DEBUG & DEBUG_PERF)) { + va_list args_copy; + va_copy(args_copy, args); + vfprintf(stderr, fmt, args_copy); + va_end(args_copy); + } + + if (brw->perf_debug) { + GLuint msg_id = 0; + _mesa_gl_vdebug(&brw->ctx, &msg_id, + MESA_DEBUG_SOURCE_SHADER_COMPILER, + MESA_DEBUG_TYPE_PERFORMANCE, + MESA_DEBUG_SEVERITY_MEDIUM, fmt, args); + } + va_end(args); +} + +#define COMMON_OPTIONS \ + /* In order to help allow for better CSE at the NIR level we tell NIR to \ + * split all ffma instructions during opt_algebraic and we then re-combine \ + * them as a later step. \ + */ \ + .lower_ffma = true, \ + .lower_sub = true, \ + .lower_fdiv = true, \ + .lower_scmp = true, \ + .lower_fmod = true, \ + .lower_bitfield_extract = true, \ + .lower_bitfield_insert = true, \ + .lower_uadd_carry = true, \ + .lower_usub_borrow = true, \ + .lower_fdiv = true, \ + .native_integers = true + +static const struct nir_shader_compiler_options scalar_nir_options = { + COMMON_OPTIONS, + .lower_pack_half_2x16 = true, + .lower_pack_snorm_2x16 = true, + .lower_pack_snorm_4x8 = true, + .lower_pack_unorm_2x16 = true, + .lower_pack_unorm_4x8 = true, + .lower_unpack_half_2x16 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_snorm_4x8 = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, +}; + +static const struct nir_shader_compiler_options vector_nir_options = { + COMMON_OPTIONS, + + /* In the vec4 backend, our dpN instruction replicates its result to all the + * components of a vec4. We would like NIR to give us replicated fdot + * instructions because it can optimize better for us. + */ + .fdot_replicates = true, + + .lower_pack_snorm_2x16 = true, + .lower_pack_unorm_2x16 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_unorm_2x16 = true, + .lower_extract_byte = true, + .lower_extract_word = true, +}; + +struct brw_compiler * +brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo) +{ + struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler); + + compiler->devinfo = devinfo; + compiler->shader_debug_log = shader_debug_log_mesa; + compiler->shader_perf_log = shader_perf_log_mesa; + + brw_fs_alloc_reg_sets(compiler); + brw_vec4_alloc_reg_set(compiler); + + compiler->scalar_stage[MESA_SHADER_VERTEX] = + devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS); + compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false; + compiler->scalar_stage[MESA_SHADER_TESS_EVAL] = + devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true); + compiler->scalar_stage[MESA_SHADER_GEOMETRY] = + devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false); + compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true; + compiler->scalar_stage[MESA_SHADER_COMPUTE] = true; + + /* We want the GLSL compiler to emit code that uses condition codes */ + for (int i = 0; i < MESA_SHADER_STAGES; i++) { + compiler->glsl_compiler_options[i].MaxUnrollIterations = 32; + compiler->glsl_compiler_options[i].MaxIfDepth = + devinfo->gen < 6 ? 
16 : UINT_MAX; + + compiler->glsl_compiler_options[i].EmitCondCodes = true; + compiler->glsl_compiler_options[i].EmitNoNoise = true; + compiler->glsl_compiler_options[i].EmitNoMainReturn = true; + compiler->glsl_compiler_options[i].EmitNoIndirectInput = true; + compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false; + compiler->glsl_compiler_options[i].LowerClipDistance = true; + + bool is_scalar = compiler->scalar_stage[i]; + + compiler->glsl_compiler_options[i].EmitNoIndirectOutput = is_scalar; + compiler->glsl_compiler_options[i].EmitNoIndirectTemp = is_scalar; + compiler->glsl_compiler_options[i].OptimizeForAOS = !is_scalar; + + /* !ARB_gpu_shader5 */ + if (devinfo->gen < 7) + compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true; + + compiler->glsl_compiler_options[i].NirOptions = + is_scalar ? &scalar_nir_options : &vector_nir_options; + + compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true; + } + + compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false; + compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false; + + if (compiler->scalar_stage[MESA_SHADER_GEOMETRY]) + compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false; + + compiler->glsl_compiler_options[MESA_SHADER_COMPUTE] + .LowerShaderSharedVariables = true; + + return compiler; +} diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index b66869b8a78..62dcb4dad84 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -689,6 +689,9 @@ struct brw_gs_prog_data /** @} */ +struct brw_compiler * +brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo); + /** * Compile a vertex shader. * diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 7b0340fc2ab..2a29dfe5eec 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -221,6 +221,7 @@ enum brw_state_id { BRW_STATE_COMPUTE_PROGRAM, BRW_STATE_CS_WORK_GROUPS, BRW_STATE_URB_SIZE, + BRW_STATE_CC_STATE, BRW_NUM_STATE_BITS }; @@ -309,6 +310,7 @@ enum brw_state_id { #define BRW_NEW_COMPUTE_PROGRAM (1ull << BRW_STATE_COMPUTE_PROGRAM) #define BRW_NEW_CS_WORK_GROUPS (1ull << BRW_STATE_CS_WORK_GROUPS) #define BRW_NEW_URB_SIZE (1ull << BRW_STATE_URB_SIZE) +#define BRW_NEW_CC_STATE (1ull << BRW_STATE_CC_STATE) struct brw_state_flags { /** State update flags signalled by mesa internals */ @@ -1262,7 +1264,7 @@ struct brw_context int num_atoms[BRW_NUM_PIPELINES]; const struct brw_tracked_state render_atoms[76]; - const struct brw_tracked_state compute_atoms[10]; + const struct brw_tracked_state compute_atoms[11]; /* If (INTEL_DEBUG & DEBUG_BATCH) */ struct { diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index df7a79fcd89..9edb6f54204 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -1087,6 +1087,18 @@ enum opcode { */ SHADER_OPCODE_BROADCAST, + /** + * Pick the byte from its first source register given by the index + * specified as second source. + */ + SHADER_OPCODE_EXTRACT_BYTE, + + /** + * Pick the word from its first source register given by the index + * specified as second source. 
+ */ + SHADER_OPCODE_EXTRACT_WORD, + VEC4_OPCODE_MOV_BYTES, VEC4_OPCODE_PACK_BYTES, VEC4_OPCODE_UNPACK_UNIFORM, diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp index 21f0b703d00..cbad47ee40a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp @@ -72,6 +72,13 @@ channel_expressions_predicate(ir_instruction *ir) return false; switch (expr->operation) { + case ir_unop_pack_half_2x16: + case ir_unop_pack_snorm_2x16: + case ir_unop_pack_snorm_4x8: + case ir_unop_pack_unorm_2x16: + case ir_unop_pack_unorm_4x8: + return false; + /* these opcodes need to act on the whole vector, * just like texturing. */ @@ -162,6 +169,11 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir) return visit_continue; switch (expr->operation) { + case ir_unop_pack_half_2x16: + case ir_unop_pack_snorm_2x16: + case ir_unop_pack_snorm_4x8: + case ir_unop_pack_unorm_2x16: + case ir_unop_pack_unorm_4x8: case ir_unop_interpolate_at_centroid: case ir_binop_interpolate_at_offset: case ir_binop_interpolate_at_sample: @@ -399,9 +411,6 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir) case ir_unop_ssbo_unsized_array_length: unreachable("should have been lowered"); - case ir_unop_unpack_half_2x16_split_x: - case ir_unop_unpack_half_2x16_split_y: - case ir_binop_pack_half_2x16_split: case ir_unop_interpolate_at_centroid: case ir_binop_interpolate_at_offset: case ir_binop_interpolate_at_sample: diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp index 3b65a382dc8..cde6566c05c 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp @@ -78,6 +78,8 @@ is_expression(const fs_visitor *v, const fs_inst *const inst) case FS_OPCODE_LINTERP: case SHADER_OPCODE_FIND_LIVE_CHANNEL: case SHADER_OPCODE_BROADCAST: + case SHADER_OPCODE_EXTRACT_BYTE: + case SHADER_OPCODE_EXTRACT_WORD: case SHADER_OPCODE_MOV_INDIRECT: return true; case SHADER_OPCODE_RCP: diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index b1134cff3c8..cac92b37bd5 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -2233,6 +2233,28 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) brw_broadcast(p, dst, src[0], src[1]); break; + case SHADER_OPCODE_EXTRACT_BYTE: { + assert(src[0].type == BRW_REGISTER_TYPE_D || + src[0].type == BRW_REGISTER_TYPE_UD); + + enum brw_reg_type type = + src[0].type == BRW_REGISTER_TYPE_D ? BRW_REGISTER_TYPE_B + : BRW_REGISTER_TYPE_UB; + brw_MOV(p, dst, spread(suboffset(retype(src[0], type), src[1].ud), 4)); + break; + } + + case SHADER_OPCODE_EXTRACT_WORD: { + assert(src[0].type == BRW_REGISTER_TYPE_D || + src[0].type == BRW_REGISTER_TYPE_UD); + + enum brw_reg_type type = + src[0].type == BRW_REGISTER_TYPE_D ? 
BRW_REGISTER_TYPE_W + : BRW_REGISTER_TYPE_UW; + brw_MOV(p, dst, spread(suboffset(retype(src[0], type), src[1].ud), 2)); + break; + } + case FS_OPCODE_SET_SAMPLE_ID: generate_set_sample_id(inst, dst, src[0], src[1]); break; diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 65a0ffc4d8d..f41854c2c09 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -1126,6 +1126,22 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) inst->predicate = BRW_PREDICATE_NORMAL; break; + case nir_op_extract_ubyte: + case nir_op_extract_ibyte: { + nir_const_value *byte = nir_src_as_const_value(instr->src[1].src); + bld.emit(SHADER_OPCODE_EXTRACT_BYTE, + result, op[0], brw_imm_ud(byte->u[0])); + break; + } + + case nir_op_extract_uword: + case nir_op_extract_iword: { + nir_const_value *word = nir_src_as_const_value(instr->src[1].src); + bld.emit(SHADER_OPCODE_EXTRACT_WORD, + result, op[0], brw_imm_ud(word->u[0])); + break; + } + default: unreachable("unhandled instruction"); } diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp index 234afd554df..ab9d7929c05 100644 --- a/src/mesa/drivers/dri/i965/brw_link.cpp +++ b/src/mesa/drivers/dri/i965/brw_link.cpp @@ -73,36 +73,13 @@ brw_lower_packing_builtins(struct brw_context *brw, gl_shader_stage shader_type, exec_list *ir) { - const struct brw_compiler *compiler = brw->intelScreen->compiler; + /* Gens < 7 don't have instructions to convert to or from half-precision, + * and Gens < 6 don't expose that functionality. + */ + if (brw->gen != 6) + return; - int ops = LOWER_PACK_SNORM_2x16 - | LOWER_UNPACK_SNORM_2x16 - | LOWER_PACK_UNORM_2x16 - | LOWER_UNPACK_UNORM_2x16; - - if (compiler->scalar_stage[shader_type]) { - ops |= LOWER_UNPACK_UNORM_4x8 - | LOWER_UNPACK_SNORM_4x8 - | LOWER_PACK_UNORM_4x8 - | LOWER_PACK_SNORM_4x8; - } - - if (brw->gen >= 7) { - /* Gen7 introduced the f32to16 and f16to32 instructions, which can be - * used to execute packHalf2x16 and unpackHalf2x16. For AOS code, no - * lowering is needed. For SOA code, the Half2x16 ops must be - * scalarized. - */ - if (compiler->scalar_stage[shader_type]) { - ops |= LOWER_PACK_HALF_2x16_TO_SPLIT - | LOWER_UNPACK_HALF_2x16_TO_SPLIT; - } - } else { - ops |= LOWER_PACK_HALF_2x16 - | LOWER_UNPACK_HALF_2x16; - } - - lower_packing_builtins(ir, ops); + lower_packing_builtins(ir, LOWER_PACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16); } static void diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c index cf6ba5b4aeb..319c2a5669f 100644 --- a/src/mesa/drivers/dri/i965/brw_misc_state.c +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c @@ -868,12 +868,146 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline) const uint32_t _3DSTATE_PIPELINE_SELECT = is_965 ? CMD_PIPELINE_SELECT_965 : CMD_PIPELINE_SELECT_GM45; + if (brw->use_resource_streamer && pipeline != BRW_RENDER_PIPELINE) { + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] + * PIPELINE_SELECT [DevBWR+]": + * + * Project: HSW, BDW, CHV, SKL, BXT + * + * Hardware Binding Tables are only supported for 3D + * workloads. Resource streamer must be enabled only for 3D + * workloads. Resource streamer must be disabled for Media and GPGPU + * workloads. + */ + BEGIN_BATCH(1); + OUT_BATCH(MI_RS_CONTROL | 0); + ADVANCE_BATCH(); + + gen7_disable_hw_binding_tables(brw); + + /* XXX - Disable gather constant pool too when we start using it. 
*/ + } + + if (brw->gen >= 8 && brw->gen < 10) { + /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT: + * + * Software must clear the COLOR_CALC_STATE Valid field in + * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT + * with Pipeline Select set to GPGPU. + * + * The internal hardware docs recommend the same workaround for Gen9 + * hardware too. + */ + if (pipeline == BRW_COMPUTE_PIPELINE) { + BEGIN_BATCH(2); + OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2)); + OUT_BATCH(0); + ADVANCE_BATCH(); + + brw->ctx.NewDriverState |= BRW_NEW_CC_STATE; + } + + } else if (brw->gen >= 6) { + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] + * PIPELINE_SELECT [DevBWR+]": + * + * Project: DEVSNB+ + * + * Software must ensure all the write caches are flushed through a + * stalling PIPE_CONTROL command followed by another PIPE_CONTROL + * command to invalidate read only caches prior to programming + * MI_PIPELINE_SELECT command to change the Pipeline Select Mode. + */ + const unsigned dc_flush = + brw->gen >= 7 ? PIPE_CONTROL_DATA_CACHE_INVALIDATE : 0; + + if (brw->gen == 6) { + /* Hardware workaround: SNB B-Spec says: + * + * Before a PIPE_CONTROL with Write Cache Flush Enable = 1, a + * PIPE_CONTROL with any non-zero post-sync-op is required. + */ + brw_emit_post_sync_nonzero_flush(brw); + } + + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + dc_flush | + PIPE_CONTROL_NO_WRITE | + PIPE_CONTROL_CS_STALL); + + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE | + PIPE_CONTROL_STATE_CACHE_INVALIDATE | + PIPE_CONTROL_INSTRUCTION_INVALIDATE | + PIPE_CONTROL_NO_WRITE); + + } else { + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] + * PIPELINE_SELECT [DevBWR+]": + * + * Project: PRE-DEVSNB + * + * Software must ensure the current pipeline is flushed via an + * MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT. + */ + BEGIN_BATCH(1); + OUT_BATCH(MI_FLUSH); + ADVANCE_BATCH(); + } + /* Select the pipeline */ BEGIN_BATCH(1); OUT_BATCH(_3DSTATE_PIPELINE_SELECT << 16 | (brw->gen >= 9 ? (3 << 8) : 0) | (pipeline == BRW_COMPUTE_PIPELINE ? 2 : 0)); ADVANCE_BATCH(); + + if (brw->gen == 7 && !brw->is_haswell && + pipeline == BRW_RENDER_PIPELINE) { + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] + * PIPELINE_SELECT [DevBWR+]": + * + * Project: DEVIVB, DEVHSW:GT3:A0 + * + * Software must send a pipe_control with a CS stall and a post sync + * operation and then a dummy DRAW after every MI_SET_CONTEXT and + * after any PIPELINE_SELECT that is enabling 3D mode. + */ + gen7_emit_cs_stall_flush(brw); + + BEGIN_BATCH(7); + OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2)); + OUT_BATCH(_3DPRIM_POINTLIST); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + ADVANCE_BATCH(); + } + + if (brw->use_resource_streamer && pipeline == BRW_RENDER_PIPELINE) { + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] + * PIPELINE_SELECT [DevBWR+]": + * + * Project: HSW, BDW, CHV, SKL, BXT + * + * Hardware Binding Tables are only supported for 3D + * workloads. Resource streamer must be enabled only for 3D + * workloads. Resource streamer must be disabled for Media and GPGPU + * workloads. + */ + BEGIN_BATCH(1); + OUT_BATCH(MI_RS_CONTROL | 1); + ADVANCE_BATCH(); + + gen7_enable_hw_binding_tables(brw); + + /* XXX - Re-enable gather constant pool here. 
*/ + } } /** diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index d983f58765e..d6987c80ed6 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -60,7 +60,7 @@ struct add_const_offset_to_base_params { }; static bool -add_const_offset_to_base(nir_block *block, void *closure) +add_const_offset_to_base_block(nir_block *block, void *closure) { struct add_const_offset_to_base_params *params = closure; nir_builder *b = &params->b; @@ -85,7 +85,19 @@ add_const_offset_to_base(nir_block *block, void *closure) } } return true; +} +static void +add_const_offset_to_base(nir_shader *nir, nir_variable_mode mode) +{ + struct add_const_offset_to_base_params params = { .mode = mode }; + + nir_foreach_function(nir, f) { + if (f->impl) { + nir_builder_init(&params.b, f->impl); + nir_foreach_block(f->impl, add_const_offset_to_base_block, &params); + } + } } static bool @@ -195,10 +207,6 @@ brw_nir_lower_inputs(nir_shader *nir, const struct brw_device_info *devinfo, bool is_scalar) { - struct add_const_offset_to_base_params params = { - .mode = nir_var_shader_in - }; - switch (nir->stage) { case MESA_SHADER_VERTEX: /* Start with the location of the variable's base. */ @@ -212,6 +220,11 @@ brw_nir_lower_inputs(nir_shader *nir, */ nir_lower_io(nir, nir_var_shader_in, type_size_vec4); + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + add_const_offset_to_base(nir, nir_var_shader_in); + if (is_scalar) { /* Finally, translate VERT_ATTRIB_* values into the actual registers. * @@ -221,13 +234,8 @@ brw_nir_lower_inputs(nir_shader *nir, */ GLbitfield64 inputs_read = nir->info.inputs_read; - /* This pass needs actual constants */ - nir_opt_constant_folding(nir); - nir_foreach_function(nir, function) { if (function->impl) { - nir_builder_init(&params.b, function->impl); - nir_foreach_block(function->impl, add_const_offset_to_base, &params); nir_foreach_block(function->impl, remap_vs_attrs, &inputs_read); } } @@ -270,10 +278,10 @@ brw_nir_lower_inputs(nir_shader *nir, /* This pass needs actual constants */ nir_opt_constant_folding(nir); + add_const_offset_to_base(nir, nir_var_shader_in); + nir_foreach_function(nir, function) { if (function->impl) { - nir_builder_init(&params.b, function->impl); - nir_foreach_block(function->impl, add_const_offset_to_base, &params); nir_foreach_block(function->impl, remap_inputs_with_vue_map, &input_vue_map); } @@ -296,10 +304,10 @@ brw_nir_lower_inputs(nir_shader *nir, /* This pass needs actual constants */ nir_opt_constant_folding(nir); + add_const_offset_to_base(nir, nir_var_shader_in); + nir_foreach_function(nir, function) { if (function->impl) { - nir_builder_init(&params.b, function->impl); - nir_foreach_block(function->impl, add_const_offset_to_base, &params); nir_builder_init(&state.b, function->impl); nir_foreach_block(function->impl, remap_patch_urb_offsets, &state); } @@ -339,10 +347,6 @@ brw_nir_lower_outputs(nir_shader *nir, } break; case MESA_SHADER_TESS_CTRL: { - struct add_const_offset_to_base_params params = { - .mode = nir_var_shader_out - }; - struct remap_patch_urb_offsets_state state; brw_compute_tess_vue_map(&state.vue_map, nir->info.outputs_written, nir->info.patch_outputs_written); @@ -356,10 +360,10 @@ brw_nir_lower_outputs(nir_shader *nir, /* This pass needs actual constants */ nir_opt_constant_folding(nir); + add_const_offset_to_base(nir, nir_var_shader_out); + nir_foreach_function(nir, function) { if (function->impl) { - nir_builder_init(&params.b, function->impl); - 
nir_foreach_block(function->impl, add_const_offset_to_base, &params); nir_builder_init(&state.b, function->impl); nir_foreach_block(function->impl, remap_patch_urb_offsets, &state); } diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c index d181468f5cb..c20a02817f9 100644 --- a/src/mesa/drivers/dri/i965/brw_sampler_state.c +++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c @@ -582,7 +582,7 @@ brw_upload_sampler_state_table(struct brw_context *brw, batch_offset_for_sampler_state += size_in_bytes; } - if (brw->gen >= 7) { + if (brw->gen >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) { /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */ gen7_emit_sampler_state_pointers_xs(brw, stage_state); } else { @@ -693,3 +693,23 @@ const struct brw_tracked_state brw_tes_samplers = { }, .emit = brw_upload_tes_samplers, }; + +static void +brw_upload_cs_samplers(struct brw_context *brw) +{ + /* BRW_NEW_COMPUTE_PROGRAM */ + struct gl_program *cs = (struct gl_program *) brw->compute_program; + if (!cs) + return; + + brw_upload_sampler_state_table(brw, cs, &brw->cs.base); +} + +const struct brw_tracked_state brw_cs_samplers = { + .dirty = { + .mesa = _NEW_TEXTURE, + .brw = BRW_NEW_BATCH | + BRW_NEW_COMPUTE_PROGRAM, + }, + .emit = brw_upload_cs_samplers, +}; diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index fec96bac923..e4ce8cbf748 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -29,137 +29,6 @@ #include "brw_vec4_tes.h" #include "main/shaderobj.h" #include "main/uniforms.h" -#include "util/debug.h" - -static void -shader_debug_log_mesa(void *data, const char *fmt, ...) -{ - struct brw_context *brw = (struct brw_context *)data; - va_list args; - - va_start(args, fmt); - GLuint msg_id = 0; - _mesa_gl_vdebug(&brw->ctx, &msg_id, - MESA_DEBUG_SOURCE_SHADER_COMPILER, - MESA_DEBUG_TYPE_OTHER, - MESA_DEBUG_SEVERITY_NOTIFICATION, fmt, args); - va_end(args); -} - -static void -shader_perf_log_mesa(void *data, const char *fmt, ...) 
-{ - struct brw_context *brw = (struct brw_context *)data; - - va_list args; - va_start(args, fmt); - - if (unlikely(INTEL_DEBUG & DEBUG_PERF)) { - va_list args_copy; - va_copy(args_copy, args); - vfprintf(stderr, fmt, args_copy); - va_end(args_copy); - } - - if (brw->perf_debug) { - GLuint msg_id = 0; - _mesa_gl_vdebug(&brw->ctx, &msg_id, - MESA_DEBUG_SOURCE_SHADER_COMPILER, - MESA_DEBUG_TYPE_PERFORMANCE, - MESA_DEBUG_SEVERITY_MEDIUM, fmt, args); - } - va_end(args); -} - -struct brw_compiler * -brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo) -{ - struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler); - - compiler->devinfo = devinfo; - compiler->shader_debug_log = shader_debug_log_mesa; - compiler->shader_perf_log = shader_perf_log_mesa; - - brw_fs_alloc_reg_sets(compiler); - brw_vec4_alloc_reg_set(compiler); - - compiler->scalar_stage[MESA_SHADER_VERTEX] = - devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS); - compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false; - compiler->scalar_stage[MESA_SHADER_TESS_EVAL] = - devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true); - compiler->scalar_stage[MESA_SHADER_GEOMETRY] = - devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false); - compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true; - compiler->scalar_stage[MESA_SHADER_COMPUTE] = true; - - nir_shader_compiler_options *nir_options = - rzalloc(compiler, nir_shader_compiler_options); - nir_options->native_integers = true; - nir_options->vertex_id_zero_based = true; - nir_options->lower_fdiv = true; - /* In order to help allow for better CSE at the NIR level we tell NIR - * to split all ffma instructions during opt_algebraic and we then - * re-combine them as a later step. - */ - nir_options->lower_ffma = true; - nir_options->lower_sub = true; - nir_options->lower_fdiv = true; - nir_options->lower_scmp = true; - nir_options->lower_fmod = true; - nir_options->lower_bitfield_extract = true; - nir_options->lower_bitfield_insert = true; - nir_options->lower_uadd_carry = true; - nir_options->lower_usub_borrow = true; - - /* In the vec4 backend, our dpN instruction replicates its result to all - * the components of a vec4. We would like NIR to give us replicated fdot - * instructions because it can optimize better for us. - * - * For the FS backend, it should be lowered away by the scalarizing pass so - * we should never see fdot anyway. - */ - nir_options->fdot_replicates = true; - - /* We want the GLSL compiler to emit code that uses condition codes */ - for (int i = 0; i < MESA_SHADER_STAGES; i++) { - compiler->glsl_compiler_options[i].MaxUnrollIterations = 32; - compiler->glsl_compiler_options[i].MaxIfDepth = - devinfo->gen < 6 ? 
16 : UINT_MAX; - - compiler->glsl_compiler_options[i].EmitCondCodes = true; - compiler->glsl_compiler_options[i].EmitNoNoise = true; - compiler->glsl_compiler_options[i].EmitNoMainReturn = true; - compiler->glsl_compiler_options[i].EmitNoIndirectInput = true; - compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false; - compiler->glsl_compiler_options[i].LowerClipDistance = true; - - bool is_scalar = compiler->scalar_stage[i]; - - compiler->glsl_compiler_options[i].EmitNoIndirectOutput = is_scalar; - compiler->glsl_compiler_options[i].EmitNoIndirectTemp = is_scalar; - compiler->glsl_compiler_options[i].OptimizeForAOS = !is_scalar; - - /* !ARB_gpu_shader5 */ - if (devinfo->gen < 7) - compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true; - - compiler->glsl_compiler_options[i].NirOptions = nir_options; - - compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true; - } - - compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false; - compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false; - - if (compiler->scalar_stage[MESA_SHADER_GEOMETRY]) - compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false; - - compiler->glsl_compiler_options[MESA_SHADER_COMPUTE] - .LowerShaderSharedVariables = true; - - return compiler; -} extern "C" struct gl_shader * brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type) @@ -444,6 +313,10 @@ brw_instruction_name(enum opcode op) case SHADER_OPCODE_BROADCAST: return "broadcast"; + case SHADER_OPCODE_EXTRACT_BYTE: + return "extract_byte"; + case SHADER_OPCODE_EXTRACT_WORD: + return "extract_word"; case VEC4_OPCODE_MOV_BYTES: return "mov_bytes"; case VEC4_OPCODE_PACK_BYTES: diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h index d29b997b963..f44ccd6e071 100644 --- a/src/mesa/drivers/dri/i965/brw_state.h +++ b/src/mesa/drivers/dri/i965/brw_state.h @@ -75,6 +75,7 @@ extern const struct brw_tracked_state brw_vs_samplers; extern const struct brw_tracked_state brw_tcs_samplers; extern const struct brw_tracked_state brw_tes_samplers; extern const struct brw_tracked_state brw_gs_samplers; +extern const struct brw_tracked_state brw_cs_samplers; extern const struct brw_tracked_state brw_vs_ubo_surfaces; extern const struct brw_tracked_state brw_vs_abo_surfaces; extern const struct brw_tracked_state brw_vs_image_surfaces; @@ -396,6 +397,7 @@ void gen7_update_binding_table_from_array(struct brw_context *brw, gl_shader_stage stage, const uint32_t* binding_table, int num_surfaces); +void gen7_enable_hw_binding_tables(struct brw_context *brw); void gen7_disable_hw_binding_tables(struct brw_context *brw); void gen7_reset_hw_bt_pool_offsets(struct brw_context *brw); diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c index 2a671a58d8c..ee75ca88549 100644 --- a/src/mesa/drivers/dri/i965/brw_state_upload.c +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c @@ -282,6 +282,7 @@ static const struct brw_tracked_state *gen7_compute_atoms[] = &brw_cs_abo_surfaces, &brw_texture_surfaces, &brw_cs_work_groups_surface, + &brw_cs_samplers, &brw_cs_state, }; @@ -396,6 +397,7 @@ static const struct brw_tracked_state *gen8_compute_atoms[] = &brw_cs_abo_surfaces, &brw_texture_surfaces, &brw_cs_work_groups_surface, + &brw_cs_samplers, &brw_cs_state, }; @@ -664,6 +666,7 @@ static struct dirty_bit_map brw_bits[] = { DEFINE_BIT(BRW_NEW_COMPUTE_PROGRAM), DEFINE_BIT(BRW_NEW_CS_WORK_GROUPS), 
DEFINE_BIT(BRW_NEW_URB_SIZE), + DEFINE_BIT(BRW_NEW_CC_STATE), {0, 0, 0} }; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 358a71041fc..394e32169d9 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -1980,11 +1980,11 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, unsigned *final_assembly_size, char **error_str) { + const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX]; nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex, - compiler->scalar_stage[MESA_SHADER_VERTEX]); - shader = brw_postprocess_nir(shader, compiler->devinfo, - compiler->scalar_stage[MESA_SHADER_VERTEX]); + is_scalar); + shader = brw_postprocess_nir(shader, compiler->devinfo, is_scalar); const unsigned *assembly = NULL; @@ -2010,7 +2010,7 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in * vec4 mode, the hardware appears to wedge unless we read something. */ - if (compiler->scalar_stage[MESA_SHADER_VERTEX]) + if (is_scalar) prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2); else prog_data->base.urb_read_length = DIV_ROUND_UP(MAX2(nr_attributes, 1), 2); @@ -2029,7 +2029,7 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, else prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); - if (compiler->scalar_stage[MESA_SHADER_VERTEX]) { + if (is_scalar) { prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base, diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index 531113a9df5..a608dca03ff 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -1062,7 +1062,7 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) case nir_op_umul_high: { struct brw_reg acc = retype(brw_acc_reg(8), dst.type); - if (devinfo->gen >=8) + if (devinfo->gen >= 8) emit(MUL(acc, op[0], retype(op[1], BRW_REGISTER_TYPE_UW))); else emit(MUL(acc, op[0], op[1])); @@ -1376,6 +1376,24 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) case nir_op_pack_unorm_2x16: unreachable("not reached: should be handled by lower_packing_builtins"); + case nir_op_pack_uvec4_to_uint: + unreachable("not reached"); + + case nir_op_pack_uvec2_to_uint: { + dst_reg tmp1 = dst_reg(this, glsl_type::uint_type); + tmp1.writemask = WRITEMASK_X; + op[0].swizzle = BRW_SWIZZLE_YYYY; + emit(SHL(tmp1, op[0], src_reg(brw_imm_ud(16u)))); + + dst_reg tmp2 = dst_reg(this, glsl_type::uint_type); + tmp2.writemask = WRITEMASK_X; + op[0].swizzle = BRW_SWIZZLE_XXXX; + emit(AND(tmp2, op[0], src_reg(brw_imm_ud(0xffffu)))); + + emit(OR(dst, src_reg(tmp1), src_reg(tmp2))); + break; + } + case nir_op_unpack_half_2x16: /* As NIR does not guarantee that we have a correct swizzle outside the * boundaries of a vector, and the implementation of emit_unpack_half_2x16 diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c index fea24368e8c..b66c209b24d 100644 --- a/src/mesa/drivers/dri/i965/brw_vue_map.c +++ b/src/mesa/drivers/dri/i965/brw_vue_map.c @@ -248,6 +248,8 @@ brw_compute_tess_vue_map(struct brw_vue_map *vue_map, static const char * varying_name(brw_varying_slot slot) { + assume(slot < BRW_VARYING_SLOT_COUNT); + if (slot < VARYING_SLOT_MAX) return 
gl_varying_slot_name(slot); @@ -257,7 +259,6 @@ varying_name(brw_varying_slot slot) [BRW_VARYING_SLOT_PNTC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PNTC", }; - assert(slot < BRW_VARYING_SLOT_COUNT); return brw_names[slot - VARYING_SLOT_MAX]; } diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c index 3bab8f46ae8..cee139b7fd4 100644 --- a/src/mesa/drivers/dri/i965/gen6_cc.c +++ b/src/mesa/drivers/dri/i965/gen6_cc.c @@ -298,6 +298,7 @@ const struct brw_tracked_state gen6_color_calc_state = { .mesa = _NEW_COLOR | _NEW_STENCIL, .brw = BRW_NEW_BATCH | + BRW_NEW_CC_STATE | BRW_NEW_STATE_BASE_ADDRESS, }, .emit = gen6_upload_color_calc_state, diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c index a025bb9dd66..6d6988c6a41 100644 --- a/src/mesa/drivers/dri/i965/gen7_cs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c @@ -196,6 +196,7 @@ const struct brw_tracked_state brw_cs_state = { .brw = BRW_NEW_BATCH | BRW_NEW_CS_PROG_DATA | BRW_NEW_PUSH_CONSTANT_ALLOCATION | + BRW_NEW_SAMPLER_STATE_TABLE | BRW_NEW_SURFACES, }, .emit = brw_upload_cs_state diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c index 26f873bc9a9..8ede1f06e4e 100644 --- a/src/mesa/main/bufferobj.c +++ b/src/mesa/main/bufferobj.c @@ -953,7 +953,7 @@ _mesa_handle_bind_buffer_gen(struct gl_context *ctx, { struct gl_buffer_object *buf = *buf_handle; - if (!buf && (ctx->API == API_OPENGL_CORE || _mesa_is_gles31(ctx))) { + if (!buf && (ctx->API == API_OPENGL_CORE)) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(non-gen name)", caller); return false; } diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c index be983d4c86a..f3fd01f395e 100644 --- a/src/mesa/main/context.c +++ b/src/mesa/main/context.c @@ -1930,31 +1930,6 @@ _mesa_check_blend_func_error(struct gl_context *ctx) return GL_TRUE; } -static bool -shader_linked_or_absent(struct gl_context *ctx, - const struct gl_shader_program *shProg, - bool *shader_present, const char *where) -{ - if (shProg) { - *shader_present = true; - - if (!shProg->LinkStatus) { - _mesa_error(ctx, GL_INVALID_OPERATION, "%s(shader not linked)", where); - return false; - } -#if 0 /* not normally enabled */ - { - char errMsg[100]; - if (!_mesa_validate_shader_program(ctx, shProg, errMsg)) { - _mesa_warning(ctx, "Shader program %u is invalid: %s", - shProg->Name, errMsg); - } - } -#endif - } - - return true; -} /** * Prior to drawing anything with glBegin, glDrawArrays, etc. this function @@ -1967,54 +1942,22 @@ shader_linked_or_absent(struct gl_context *ctx, GLboolean _mesa_valid_to_render(struct gl_context *ctx, const char *where) { - unsigned i; - /* This depends on having up to date derived state (shaders) */ if (ctx->NewState) _mesa_update_state(ctx); - if (ctx->API == API_OPENGL_CORE || ctx->API == API_OPENGLES2) { - bool from_glsl_shader[MESA_SHADER_COMPUTE] = { false }; - - for (i = 0; i < MESA_SHADER_COMPUTE; i++) { - if (!shader_linked_or_absent(ctx, ctx->_Shader->CurrentProgram[i], - &from_glsl_shader[i], where)) - return GL_FALSE; - } - - /* In OpenGL Core Profile and OpenGL ES 2.0 / 3.0, there are no assembly - * shaders. Don't check state related to those. - */ - } else { - bool has_vertex_shader = false; - bool has_fragment_shader = false; - - /* In OpenGL Compatibility Profile, there is only vertex shader and - * fragment shader. We take this path also for API_OPENGLES because - * optimizing that path would make the other (more common) paths - * slightly slower. 
- */ - if (!shader_linked_or_absent(ctx, - ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX], - &has_vertex_shader, where)) - return GL_FALSE; - - if (!shader_linked_or_absent(ctx, - ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT], - &has_fragment_shader, where)) - return GL_FALSE; - + if (ctx->API == API_OPENGL_COMPAT) { /* Any shader stages that are not supplied by the GLSL shader and have * assembly shaders enabled must now be validated. */ - if (!has_vertex_shader + if (!ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX] && ctx->VertexProgram.Enabled && !ctx->VertexProgram._Enabled) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(vertex program not valid)", where); return GL_FALSE; } - if (!has_fragment_shader) { + if (!ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT]) { if (ctx->FragmentProgram.Enabled && !ctx->FragmentProgram._Enabled) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(fragment program not valid)", where); diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h index 70ed5633f7b..d4378e51159 100644 --- a/src/mesa/main/dd.h +++ b/src/mesa/main/dd.h @@ -762,6 +762,12 @@ struct dd_function_table { void (*UseProgram)(struct gl_context *ctx, struct gl_shader_program *shProg); /*@}*/ + /** + * \name GREMEDY debug/marker functions + */ + /*@{*/ + void (*EmitStringMarker)(struct gl_context *ctx, const GLchar *string, GLsizei len); + /*@}*/ /** * \name Support for multiple T&L engines diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c index ba2e670eb9a..cd8e3b6a2f2 100644 --- a/src/mesa/main/dlist.c +++ b/src/mesa/main/dlist.c @@ -5982,9 +5982,8 @@ save_DrawTransformFeedbackStreamInstanced(GLenum mode, GLuint name, } } -/* aka UseProgram() */ static void GLAPIENTRY -save_UseProgramObjectARB(GLhandleARB program) +save_UseProgram(GLuint program) { GET_CURRENT_CONTEXT(ctx); Node *n; @@ -9454,7 +9453,7 @@ _mesa_initialize_save_table(const struct gl_context *ctx) SET_BlitFramebuffer(table, save_BlitFramebufferEXT); - SET_UseProgram(table, save_UseProgramObjectARB); + SET_UseProgram(table, save_UseProgram); SET_Uniform1f(table, save_Uniform1fARB); SET_Uniform2f(table, save_Uniform2fARB); SET_Uniform3f(table, save_Uniform3fARB); diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c index 9e6610918c4..674364c7b0c 100644 --- a/src/mesa/main/errors.c +++ b/src/mesa/main/errors.c @@ -1018,6 +1018,13 @@ _mesa_DebugMessageInsert(GLenum source, GLenum type, GLuint id, gl_enum_to_debug_type(type), id, gl_enum_to_debug_severity(severity), length, buf); + + if (type == GL_DEBUG_TYPE_MARKER && ctx->Driver.EmitStringMarker) { + /* if length not specified, string will be null terminated: */ + if (length < 0) + length = strlen(buf); + ctx->Driver.EmitStringMarker(ctx, buf, length); + } } @@ -1276,6 +1283,19 @@ _mesa_free_errors_data(struct gl_context *ctx) mtx_destroy(&ctx->DebugMutex); } +void GLAPIENTRY +_mesa_StringMarkerGREMEDY(GLsizei len, const GLvoid *string) +{ + GET_CURRENT_CONTEXT(ctx); + if (ctx->Extensions.GREMEDY_string_marker) { + /* if length not specified, string will be null terminated: */ + if (len <= 0) + len = strlen(string); + ctx->Driver.EmitStringMarker(ctx, string, len); + } else { + _mesa_error(ctx, GL_INVALID_OPERATION, "StringMarkerGREMEDY"); + } +} /**********************************************************************/ /** \name Diagnostics */ diff --git a/src/mesa/main/errors.h b/src/mesa/main/errors.h index f2919765488..92df2ac868a 100644 --- a/src/mesa/main/errors.h +++ b/src/mesa/main/errors.h @@ -138,6 +138,9 @@ _mesa_PushDebugGroup(GLenum source, GLuint id, 
GLsizei length, void GLAPIENTRY _mesa_PopDebugGroup(void); +void GLAPIENTRY +_mesa_StringMarkerGREMEDY(GLsizei len, const GLvoid *string); + #ifdef __cplusplus } #endif diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h index aeccb017423..9cec1762dbe 100644 --- a/src/mesa/main/extensions_table.h +++ b/src/mesa/main/extensions_table.h @@ -251,6 +251,8 @@ EXT(EXT_unpack_subimage , dummy_true EXT(EXT_vertex_array , dummy_true , GLL, x , x , x , 1995) EXT(EXT_vertex_array_bgra , EXT_vertex_array_bgra , GLL, GLC, x , x , 2008) +EXT(GREMEDY_string_marker , GREMEDY_string_marker , GLL, GLC, x , x , 2007) + EXT(IBM_multimode_draw_arrays , dummy_true , GLL, GLC, x , x , 1998) EXT(IBM_rasterpos_clip , dummy_true , GLL, x , x , x , 1996) EXT(IBM_texture_mirrored_repeat , dummy_true , GLL, x , x , x , 1998) diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 1c717feabc2..3a0b89f4572 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -3885,6 +3885,7 @@ struct gl_extensions GLboolean ATI_texture_env_combine3; GLboolean ATI_fragment_shader; GLboolean ATI_separate_stencil; + GLboolean GREMEDY_string_marker; GLboolean INTEL_performance_query; GLboolean KHR_texture_compression_astc_hdr; GLboolean KHR_texture_compression_astc_ldr; diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index a18b860022d..e902585924a 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -64,8 +64,8 @@ DECL_RESOURCE_FUNC(XFB, gl_transform_feedback_varying_info); DECL_RESOURCE_FUNC(SUB, gl_subroutine_function); void GLAPIENTRY -_mesa_BindAttribLocation(GLhandleARB program, GLuint index, - const GLcharARB *name) +_mesa_BindAttribLocation(GLuint program, GLuint index, + const GLchar *name) { GET_CURRENT_CONTEXT(ctx); @@ -126,9 +126,9 @@ is_active_attrib(const gl_shader_variable *var) } void GLAPIENTRY -_mesa_GetActiveAttrib(GLhandleARB program, GLuint desired_index, - GLsizei maxLength, GLsizei * length, GLint * size, - GLenum * type, GLcharARB * name) +_mesa_GetActiveAttrib(GLuint program, GLuint desired_index, + GLsizei maxLength, GLsizei * length, GLint * size, + GLenum * type, GLchar * name) { GET_CURRENT_CONTEXT(ctx); struct gl_shader_program *shProg; @@ -191,7 +191,7 @@ _mesa_GetActiveAttrib(GLhandleARB program, GLuint desired_index, } GLint GLAPIENTRY -_mesa_GetAttribLocation(GLhandleARB program, const GLcharARB * name) +_mesa_GetAttribLocation(GLuint program, const GLchar * name) { GET_CURRENT_CONTEXT(ctx); struct gl_shader_program *const shProg = diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c index cdc85f3413b..5854369a28c 100644 --- a/src/mesa/main/shaderapi.c +++ b/src/mesa/main/shaderapi.c @@ -1265,7 +1265,7 @@ _mesa_AttachShader(GLuint program, GLuint shader) void GLAPIENTRY -_mesa_CompileShader(GLhandleARB shaderObj) +_mesa_CompileShader(GLuint shaderObj) { GET_CURRENT_CONTEXT(ctx); if (MESA_VERBOSE & VERBOSE_API) @@ -1315,7 +1315,7 @@ _mesa_DeleteObjectARB(GLhandleARB obj) { if (MESA_VERBOSE & VERBOSE_API) { GET_CURRENT_CONTEXT(ctx); - _mesa_debug(ctx, "glDeleteObjectARB(%u)\n", obj); + _mesa_debug(ctx, "glDeleteObjectARB(%lu)\n", (unsigned long)obj); } if (obj) { @@ -1374,10 +1374,26 @@ _mesa_DetachShader(GLuint program, GLuint shader) void GLAPIENTRY _mesa_GetAttachedObjectsARB(GLhandleARB container, GLsizei maxCount, - GLsizei * count, GLhandleARB * obj) + GLsizei * count, GLhandleARB * objARB) { + int i; + GLuint *obj; + GET_CURRENT_CONTEXT(ctx); + + obj = calloc(maxCount, 
sizeof(GLuint)); + if (!obj) { + _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetAttachedObjectsARB"); + return; + } + get_attached_shaders(ctx, container, maxCount, count, obj); + + for (i = 0 ; i < *count; i++) { + objARB[i] = (GLhandleARB)obj[i]; + } + + free(obj); } @@ -1479,8 +1495,8 @@ _mesa_GetShaderInfoLog(GLuint shader, GLsizei bufSize, void GLAPIENTRY -_mesa_GetShaderSource(GLhandleARB shader, GLsizei maxLength, - GLsizei *length, GLcharARB *sourceOut) +_mesa_GetShaderSource(GLuint shader, GLsizei maxLength, + GLsizei *length, GLchar *sourceOut) { GET_CURRENT_CONTEXT(ctx); get_shader_source(ctx, shader, maxLength, length, sourceOut); @@ -1512,7 +1528,7 @@ _mesa_IsShader(GLuint name) void GLAPIENTRY -_mesa_LinkProgram(GLhandleARB programObj) +_mesa_LinkProgram(GLuint programObj) { GET_CURRENT_CONTEXT(ctx); if (MESA_VERBOSE & VERBOSE_API) @@ -1641,8 +1657,8 @@ read_shader(const gl_shader_stage stage, const char *source) * and pass it to _mesa_shader_source(). */ void GLAPIENTRY -_mesa_ShaderSource(GLhandleARB shaderObj, GLsizei count, - const GLcharARB * const * string, const GLint * length) +_mesa_ShaderSource(GLuint shaderObj, GLsizei count, + const GLchar * const * string, const GLint * length) { GET_CURRENT_CONTEXT(ctx); GLint *offsets; @@ -1729,7 +1745,7 @@ _mesa_ShaderSource(GLhandleARB shaderObj, GLsizei count, void GLAPIENTRY -_mesa_UseProgram(GLhandleARB program) +_mesa_UseProgram(GLuint program) { GET_CURRENT_CONTEXT(ctx); struct gl_shader_program *shProg; @@ -1791,7 +1807,7 @@ _mesa_UseProgram(GLhandleARB program) void GLAPIENTRY -_mesa_ValidateProgram(GLhandleARB program) +_mesa_ValidateProgram(GLuint program) { GET_CURRENT_CONTEXT(ctx); validate_program(ctx, program); @@ -2530,6 +2546,11 @@ _mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count, i = 0; do { struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[i]; + if (uni == NULL) { + i++; + continue; + } + int uni_count = uni->array_elements ? uni->array_elements : 1; int j, k; @@ -2557,6 +2578,11 @@ _mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count, i = 0; do { struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[i]; + if (uni == NULL) { + i++; + continue; + } + int uni_count = uni->array_elements ? 
uni->array_elements : 1; memcpy(&uni->storage[0], &indices[i], diff --git a/src/mesa/main/shaderapi.h b/src/mesa/main/shaderapi.h index fba767bf4c1..8922c4d0640 100644 --- a/src/mesa/main/shaderapi.h +++ b/src/mesa/main/shaderapi.h @@ -64,7 +64,7 @@ extern void GLAPIENTRY _mesa_AttachObjectARB(GLhandleARB, GLhandleARB); extern void GLAPIENTRY -_mesa_CompileShader(GLhandleARB); +_mesa_CompileShader(GLuint); extern GLhandleARB GLAPIENTRY _mesa_CreateProgramObjectARB(void); @@ -100,7 +100,7 @@ extern void GLAPIENTRY _mesa_GetObjectParameterivARB(GLhandleARB, GLenum, GLint *); extern void GLAPIENTRY -_mesa_GetShaderSource(GLhandleARB, GLsizei, GLsizei *, GLcharARB *); +_mesa_GetShaderSource(GLuint, GLsizei, GLsizei *, GLchar *); extern GLboolean GLAPIENTRY _mesa_IsProgram(GLuint name); @@ -109,20 +109,20 @@ extern GLboolean GLAPIENTRY _mesa_IsShader(GLuint name); extern void GLAPIENTRY -_mesa_LinkProgram(GLhandleARB programObj); +_mesa_LinkProgram(GLuint programObj); extern void GLAPIENTRY -_mesa_ShaderSource(GLhandleARB, GLsizei, const GLcharARB* const *, const GLint *); +_mesa_ShaderSource(GLuint, GLsizei, const GLchar* const *, const GLint *); extern void GLAPIENTRY -_mesa_UseProgram(GLhandleARB); +_mesa_UseProgram(GLuint); extern void GLAPIENTRY -_mesa_ValidateProgram(GLhandleARB); +_mesa_ValidateProgram(GLuint); extern void GLAPIENTRY -_mesa_BindAttribLocation(GLhandleARB, GLuint, const GLcharARB *); +_mesa_BindAttribLocation(GLuint program, GLuint, const GLchar *); extern void GLAPIENTRY _mesa_BindFragDataLocation(GLuint program, GLuint colorNumber, @@ -133,11 +133,11 @@ _mesa_BindFragDataLocationIndexed(GLuint program, GLuint colorNumber, GLuint index, const GLchar *name); extern void GLAPIENTRY -_mesa_GetActiveAttrib(GLhandleARB, GLuint, GLsizei, GLsizei *, GLint *, - GLenum *, GLcharARB *); +_mesa_GetActiveAttrib(GLuint, GLuint, GLsizei, GLsizei *, GLint *, + GLenum *, GLchar *); extern GLint GLAPIENTRY -_mesa_GetAttribLocation(GLhandleARB, const GLcharARB *); +_mesa_GetAttribLocation(GLuint, const GLchar *); diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp index 7610bcbd701..eb1108124e9 100644 --- a/src/mesa/main/tests/dispatch_sanity.cpp +++ b/src/mesa/main/tests/dispatch_sanity.cpp @@ -940,6 +940,9 @@ const struct function common_desktop_functions_possible[] = { { "glGetTextureSubImage", 20, -1 }, { "glGetCompressedTextureSubImage", 20, -1 }, + /* GL_GREMEDY_string_marker */ + { "glStringMarkerGREMEDY", 15, -1 }, + { NULL, 0, -1 } }; diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c index b107a8f8678..e926c7b6cd2 100644 --- a/src/mesa/main/texobj.c +++ b/src/mesa/main/texobj.c @@ -769,7 +769,8 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx, } if (t->Target == GL_TEXTURE_CUBE_MAP_ARB) { - /* Make sure that all six cube map level 0 images are the same size. + /* Make sure that all six cube map level 0 images are the same size and + * format. * Note: we know that the image's width==height (we enforce that * at glTexImage time) so we only need to test the width here. 
*/ @@ -784,6 +785,15 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx, incomplete(t, BASE, "Cube face missing or mismatched size"); return; } + if (t->Image[face][baseLevel]->InternalFormat != + baseImage->InternalFormat) { + incomplete(t, BASE, "Cube face format mismatch"); + return; + } + if (t->Image[face][baseLevel]->Border != baseImage->Border) { + incomplete(t, BASE, "Cube face border size mismatch"); + return; + } } } @@ -858,16 +868,6 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx, img->Depth2); return; } - - /* Extra checks for cube textures */ - if (face > 0) { - /* check that cube faces are the same size */ - if (img->Width2 != t->Image[0][i]->Width2 || - img->Height2 != t->Image[0][i]->Height2) { - incomplete(t, MIPMAP, "CubeMap Image[n][i] bad size"); - return; - } - } } } diff --git a/src/mesa/main/varray.c b/src/mesa/main/varray.c index c71e16a1e56..c2bf2951687 100644 --- a/src/mesa/main/varray.c +++ b/src/mesa/main/varray.c @@ -1744,6 +1744,10 @@ vertex_array_vertex_buffer(struct gl_context *ctx, } else if (buffer != 0) { vbo = _mesa_lookup_bufferobj(ctx, buffer); + if (!vbo && _mesa_is_gles31(ctx)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s(non-gen name)", func); + return; + } /* From the GL_ARB_vertex_attrib_array spec: * * "[Core profile only:] diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp index 9da9733438d..88d8337bb3e 100644 --- a/src/mesa/program/ir_to_mesa.cpp +++ b/src/mesa/program/ir_to_mesa.cpp @@ -1245,10 +1245,7 @@ ir_to_mesa_visitor::visit(ir_expression *ir) case ir_unop_unpack_unorm_2x16: case ir_unop_unpack_unorm_4x8: case ir_unop_unpack_half_2x16: - case ir_unop_unpack_half_2x16_split_x: - case ir_unop_unpack_half_2x16_split_y: case ir_unop_unpack_double_2x32: - case ir_binop_pack_half_2x16_split: case ir_unop_bitfield_reverse: case ir_unop_bit_count: case ir_unop_find_msb: diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c index f8b367989e7..0ceb37027e1 100644 --- a/src/mesa/state_tracker/st_cb_texture.c +++ b/src/mesa/state_tracker/st_cb_texture.c @@ -1670,6 +1670,12 @@ st_finalize_texture(struct gl_context *ctx, width = stObj->width0; height = stObj->height0; depth = stObj->depth0; + } else { + /* The width/height/depth may have been previously reset in + * guess_and_alloc_texture. 
*/ + stObj->width0 = width; + stObj->height0 = height; + stObj->depth0 = depth; } /* convert GL dims to Gallium dims */ st_gl_texture_dims_to_pipe_dims(stObj->base.Target, width, height, depth, diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c index 4add50e3ed9..ce1e97aacb5 100644 --- a/src/mesa/state_tracker/st_context.c +++ b/src/mesa/state_tracker/st_context.c @@ -438,6 +438,12 @@ void st_destroy_context( struct st_context *st ) free(ctx); } +static void +st_emit_string_marker(struct gl_context *ctx, const GLchar *string, GLsizei len) +{ + struct st_context *st = ctx->st; + st->pipe->emit_string_marker(st->pipe, string, len); +} void st_init_driver_functions(struct pipe_screen *screen, struct dd_function_table *functions) @@ -476,6 +482,9 @@ void st_init_driver_functions(struct pipe_screen *screen, st_init_vdpau_functions(functions); + if (screen->get_param(screen, PIPE_CAP_STRING_MARKER)) + functions->EmitStringMarker = st_emit_string_marker; + functions->Enable = st_Enable; functions->UpdateState = st_invalidate_state; } diff --git a/src/mesa/state_tracker/st_debug.c b/src/mesa/state_tracker/st_debug.c index 134366db09d..9eb3b53b230 100644 --- a/src/mesa/state_tracker/st_debug.c +++ b/src/mesa/state_tracker/st_debug.c @@ -57,6 +57,7 @@ static const struct debug_named_value st_debug_flags[] = { { "buffer", DEBUG_BUFFER, NULL }, { "wf", DEBUG_WIREFRAME, NULL }, { "precompile", DEBUG_PRECOMPILE, NULL }, + { "gremedy", DEBUG_GREMEDY, "Enable GREMEDY debug extensions" }, DEBUG_NAMED_VALUE_END }; diff --git a/src/mesa/state_tracker/st_debug.h b/src/mesa/state_tracker/st_debug.h index ed3ead82914..a094fdc2bfa 100644 --- a/src/mesa/state_tracker/st_debug.h +++ b/src/mesa/state_tracker/st_debug.h @@ -50,6 +50,7 @@ st_print_current(void); #define DEBUG_BUFFER 0x200 #define DEBUG_WIREFRAME 0x400 #define DEBUG_PRECOMPILE 0x800 +#define DEBUG_GREMEDY 0x1000 #ifdef DEBUG extern int ST_DEBUG; diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c index 2a3e52362e4..53ea6767395 100644 --- a/src/mesa/state_tracker/st_extensions.c +++ b/src/mesa/state_tracker/st_extensions.c @@ -37,6 +37,7 @@ #include "util/u_math.h" #include "st_context.h" +#include "st_debug.h" #include "st_extensions.h" #include "st_format.h" @@ -973,4 +974,8 @@ void st_init_extensions(struct pipe_screen *screen, extensions->ARB_gpu_shader_fp64 = GL_TRUE; extensions->ARB_vertex_attrib_64bit = GL_TRUE; } + + if ((ST_DEBUG & DEBUG_GREMEDY) && + screen->get_param(screen, PIPE_CAP_STRING_MARKER)) + extensions->GREMEDY_string_marker = GL_TRUE; } diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index d424e3b335f..a06683f31c8 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -2177,12 +2177,9 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir) case ir_unop_unpack_snorm_2x16: case ir_unop_unpack_unorm_2x16: - case ir_unop_unpack_half_2x16_split_x: - case ir_unop_unpack_half_2x16_split_y: case ir_unop_unpack_snorm_4x8: case ir_unop_unpack_unorm_4x8: - case ir_binop_pack_half_2x16_split: case ir_quadop_vector: case ir_binop_vector_extract: case ir_triop_vector_insert:
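
To show how the GREMEDY_string_marker plumbing added in this series fits together from the application side, here is a rough usage sketch. It is an illustration only: the get_proc_address() loader stands in for glXGetProcAddress or eglGetProcAddress, and the extension check uses the compatibility-profile glGetString path, none of which is part of this patch.

#include <string.h>
#include <GL/gl.h>
#include <GL/glext.h>   /* PFNGLSTRINGMARKERGREMEDYPROC */

/* Loader supplied by the window-system layer (assumed, not part of the patch). */
typedef void (*gl_generic_func)(void);
extern gl_generic_func get_proc_address(const char *name);

static PFNGLSTRINGMARKERGREMEDYPROC string_marker;

static void
init_string_marker(void)
{
   const char *ext = (const char *) glGetString(GL_EXTENSIONS);

   if (ext && strstr(ext, "GL_GREMEDY_string_marker"))
      string_marker = (PFNGLSTRINGMARKERGREMEDYPROC)
         get_proc_address("glStringMarkerGREMEDY");
}

static void
draw_frame(void)
{
   /* A len of 0 means the string is null-terminated, matching the
    * handling in _mesa_StringMarkerGREMEDY() above. */
   if (string_marker)
      string_marker(0, "frame start");

   /* ... regular GL draw calls ... */

   if (string_marker)
      string_marker(0, "frame end");
}

With the gallium state tracker, the marker only reaches the driver when it reports PIPE_CAP_STRING_MARKER and the extension has been enabled via ST_DEBUG=gremedy, as set up in st_extensions.c and st_context.c above.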