diff --git a/configure.ac b/configure.ac
index f236dad6441..0c88db9f66f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -108,6 +108,8 @@ AC_SYS_LARGEFILE
 LT_PREREQ([2.2])
 LT_INIT([disable-static])
 
+AC_CHECK_PROG(RM, rm, [rm -f])
+
 AX_PROG_BISON([],
               AS_IF([test ! -f "$srcdir/src/glsl/glcpp/glcpp-parse.c"],
                     [AC_MSG_ERROR([bison not found - unable to compile glcpp-parse.y])]))
diff --git a/docs/GL3.txt b/docs/GL3.txt
index 6503e2ab1da..167321676df 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -169,7 +169,7 @@ GL 4.3, GLSL 4.30:
   GL_ARB_texture_buffer_range                          DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
   GL_ARB_texture_query_levels                          DONE (all drivers that support GLSL 1.30)
   GL_ARB_texture_storage_multisample                   DONE (all drivers that support GL_ARB_texture_multisample)
-  GL_ARB_texture_view                                  DONE (i965, nv50, nvc0, llvmpipe, softpipe)
+  GL_ARB_texture_view                                  DONE (i965, nv50, nvc0, radeonsi, llvmpipe, softpipe)
   GL_ARB_vertex_attrib_binding                         DONE (all drivers)
 
 
diff --git a/docs/relnotes/11.1.0.html b/docs/relnotes/11.1.0.html
index dcf425e4c68..d3dbe9dda13 100644
--- a/docs/relnotes/11.1.0.html
+++ b/docs/relnotes/11.1.0.html
@@ -51,6 +51,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_shader_texture_image_samples on i965, nv50, nvc0, r600, radeonsi</li>
 <li>GL_ARB_texture_barrier / GL_NV_texture_barrier on i965</li>
 <li>GL_ARB_texture_query_lod on softpipe</li>
+<li>GL_ARB_texture_view on radeonsi</li>
 <li>EGL_KHR_create_context on softpipe, llvmpipe</li>
 <li>EGL_KHR_gl_colorspace on softpipe, llvmpipe</li>
 </ul>
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 0ad94bb031f..5584c4a222c 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -355,8 +355,9 @@ struct draw_vertex_info {
 };
 
 /* these flags are set if the primitive is a segment of a larger one */
-#define DRAW_SPLIT_BEFORE 0x1
-#define DRAW_SPLIT_AFTER  0x2
+#define DRAW_SPLIT_BEFORE        0x1
+#define DRAW_SPLIT_AFTER         0x2
+#define DRAW_LINE_LOOP_AS_STRIP  0x4
 
 struct draw_prim_info {
    boolean linear;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index ffec863ae6f..aa20b918f50 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -359,6 +359,16 @@ fetch_pipeline_generic(struct draw_pt_middle_end *middle,
 }
 
 
+static inline unsigned
+prim_type(unsigned prim, unsigned flags)
+{
+   if (flags & DRAW_LINE_LOOP_AS_STRIP)   /* flag overrides the input prim */
+      return PIPE_PRIM_LINE_STRIP;
+   else
+      return prim;                        /* no override: pass prim through */
+}
+
+
 static void
 fetch_pipeline_run(struct draw_pt_middle_end *middle,
                    const unsigned *fetch_elts,
@@ -380,7 +390,7 @@ fetch_pipeline_run(struct draw_pt_middle_end *middle,
    prim_info.start = 0;
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
-   prim_info.prim = fpme->input_prim;
+   prim_info.prim = prim_type(fpme->input_prim, prim_flags);
    prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
@@ -408,7 +418,7 @@ fetch_pipeline_linear_run(struct draw_pt_middle_end *middle,
    prim_info.start = 0;
    prim_info.count = count;
    prim_info.elts = NULL;
-   prim_info.prim = fpme->input_prim;
+   prim_info.prim = prim_type(fpme->input_prim, prim_flags);
    prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &count;
@@ -439,7 +449,7 @@ fetch_pipeline_linear_run_elts(struct draw_pt_middle_end *middle,
    prim_info.start = 0;
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
-   prim_info.prim = fpme->input_prim;
+   prim_info.prim = prim_type(fpme->input_prim, prim_flags);
    prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index e42c4af0e70..2d7569b0fdf 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -473,6 +473,16 @@ llvm_pipeline_generic(struct draw_pt_middle_end *middle,
 }
 
 
+static inline unsigned
+prim_type(unsigned prim, unsigned flags)
+{
+   if (flags & DRAW_LINE_LOOP_AS_STRIP)   /* flag overrides the input prim */
+      return PIPE_PRIM_LINE_STRIP;
+   else
+      return prim;                        /* no override: pass prim through */
+}
+
+
 static void
 llvm_middle_end_run(struct draw_pt_middle_end *middle,
                     const unsigned *fetch_elts,
@@ -494,7 +504,7 @@ llvm_middle_end_run(struct draw_pt_middle_end *middle,
    prim_info.start = 0;
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
-   prim_info.prim = fpme->input_prim;
+   prim_info.prim = prim_type(fpme->input_prim, prim_flags);
    prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
@@ -522,7 +532,7 @@ llvm_middle_end_linear_run(struct draw_pt_middle_end *middle,
    prim_info.start = 0;
    prim_info.count = count;
    prim_info.elts = NULL;
-   prim_info.prim = fpme->input_prim;
+   prim_info.prim = prim_type(fpme->input_prim, prim_flags);
    prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &count;
@@ -552,7 +562,7 @@ llvm_middle_end_linear_run_elts(struct draw_pt_middle_end *middle,
    prim_info.start = 0;
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
-   prim_info.prim = fpme->input_prim;
+   prim_info.prim = prim_type(fpme->input_prim, prim_flags);
    prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
index 0afabb01398..6da79b9490b 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
@@ -249,6 +249,9 @@ vsplit_segment_loop_linear(struct vsplit_frontend *vsplit, unsigned flags,
 
    assert(icount + !!close_loop <= vsplit->segment_size);
 
+   /* need to draw the sections of the line loop as line strips */
+   flags |= DRAW_LINE_LOOP_AS_STRIP;
+
    if (close_loop) {
       for (nr = 0; nr < icount; nr++)
          vsplit->fetch_elts[nr] = istart + nr;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
index 571c615f9f8..ad64ae058b6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -137,6 +137,8 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       return 0;
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
    }
    /* if we get here, we missed a shader cap above (and should have seen
     * a compiler warning.)
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index a371aa95e70..f86adcec506 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -474,6 +474,8 @@ tgsi_exec_get_shader_param(enum pipe_shader_cap param)
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       return 0;
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
    }
    /* if we get here, we missed a shader cap above (and should have seen
     * a compiler warning.)
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index b84a1753eeb..4645ef26cab 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -369,19 +369,7 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                       procType == TGSI_PROCESSOR_GEOMETRY ||
                       procType == TGSI_PROCESSOR_TESS_CTRL ||
                       procType == TGSI_PROCESSOR_TESS_EVAL) {
-                     if (semName == TGSI_SEMANTIC_CLIPDIST) {
-                        info->num_written_clipdistance +=
-                           util_bitcount(fulldecl->Declaration.UsageMask);
-                        info->clipdist_writemask |=
-                           fulldecl->Declaration.UsageMask << (semIndex*4);
-                     }
-                     else if (semName == TGSI_SEMANTIC_CULLDIST) {
-                        info->num_written_culldistance +=
-                           util_bitcount(fulldecl->Declaration.UsageMask);
-                        info->culldist_writemask |=
-                           fulldecl->Declaration.UsageMask << (semIndex*4);
-                     }
-                     else if (semName == TGSI_SEMANTIC_VIEWPORT_INDEX) {
+                     if (semName == TGSI_SEMANTIC_VIEWPORT_INDEX) {
                         info->writes_viewport_index = TRUE;
                      }
                      else if (semName == TGSI_SEMANTIC_LAYER) {
@@ -432,9 +420,21 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
             const struct tgsi_full_property *fullprop
                = &parse.FullToken.FullProperty;
             unsigned name = fullprop->Property.PropertyName;
+            unsigned value = fullprop->u[0].Data;
 
             assert(name < Elements(info->properties));
-            info->properties[name] = fullprop->u[0].Data;
+            info->properties[name] = value;
+
+            switch (name) {
+            case TGSI_PROPERTY_NUM_CLIPDIST_ENABLED:
+               info->num_written_clipdistance = value;
+               info->clipdist_writemask |= (1 << value) - 1;
+               break;
+            case TGSI_PROPERTY_NUM_CULLDIST_ENABLED:
+               info->num_written_culldistance = value;
+               info->culldist_writemask |= (1 << value) - 1;
+               break;
+            }
          }
          break;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index 8271ea08177..89369d60f4e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -137,6 +137,8 @@ const char *tgsi_property_names[TGSI_PROPERTY_COUNT] =
    "TES_SPACING",
    "TES_VERTEX_ORDER_CW",
    "TES_POINT_MODE",
+   "NUM_CLIPDIST_ENABLED",
+   "NUM_CULLDIST_ENABLED",
 };
 
 const char *tgsi_return_type_names[TGSI_RETURN_TYPE_COUNT] =
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index e08844b2f0b..151afb2dffe 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -276,6 +276,8 @@ The integer capabilities:
   GL4 hardware will likely need to emulate it with a shader variant, or by
   selecting the interpolation weights with a conditional assignment
   in the shader.
+* ``PIPE_CAP_SHAREABLE_SHADERS``: Whether shader CSOs can be used by any
+  pipe_context.
 
 
 
@@ -365,6 +367,10 @@ to be 0.
   are supported.
 * ``PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE``: Whether the driver doesn't
   ignore tgsi_declaration_range::Last for shader inputs and outputs.
+* ``PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT``: The maximum number of
+  iterations a loop may have and still be considered for unrolling. It is
+  only a hint to state trackers; there is no guarantee that any loop will
+  actually be unrolled.
 
 
 .. _pipe_compute_cap:
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 314fe1bb74f..01e18f3084e 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -3126,6 +3126,16 @@ TES_POINT_MODE
 If set to a non-zero value, this turns on point mode for the tessellator,
 which means that points will be generated instead of primitives.
 
+NUM_CLIPDIST_ENABLED
+""""""""""""""""""""
+
+How many clip distance scalar outputs are enabled.
+
+NUM_CULLDIST_ENABLED
+""""""""""""""""""""
+
+How many cull distance scalar outputs are enabled.
+
 
 Texture Sampling and Texture Formats
 ------------------------------------
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index b64f78ca32b..50d140fe903 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -237,6 +237,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 	case PIPE_CAP_TGSI_TXQS:
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+	case PIPE_CAP_SHAREABLE_SHADERS:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
@@ -411,6 +412,8 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 		return 16;
 	case PIPE_SHADER_CAP_PREFERRED_IR:
 		return PIPE_SHADER_IR_TGSI;
+	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+		return 32;
 	}
 	debug_printf("unknown shader param %d\n", param);
 	return 0;
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 9d6b3d39183..5812af626cb 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -167,6 +167,8 @@ i915_get_shader_param(struct pipe_screen *screen, unsigned shader, enum pipe_sha
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
          return 0;
+      case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+         return 32;
       default:
          debug_printf("%s: Unknown cap %u.\n", __FUNCTION__, cap);
          return 0;
@@ -249,6 +251,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_DEPTH_BOUNDS_TEST:
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index 76812a666a0..e1a7dc56685 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -138,6 +138,8 @@ ilo_get_shader_param(struct pipe_screen *screen, unsigned shader,
       return PIPE_SHADER_IR_TGSI;
    case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
       return 1;
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
 
    default:
       return 0;
@@ -471,6 +473,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_DEPTH_BOUNDS_TEST:
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 50c3781f5f8..e2ed267da78 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -298,6 +298,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_DEPTH_BOUNDS_TEST:
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 335c163b661..03301649e38 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -171,6 +171,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -263,6 +264,8 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
          return 0;
+      case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+         return 32;
       default:
          debug_printf("unknown vertex shader param %d\n", param);
          return 0;
@@ -304,6 +307,8 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
          return 0;
+      case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+         return 32;
       default:
          debug_printf("unknown fragment shader param %d\n", param);
          return 0;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 812b246ea0e..ec51d00f266 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -216,6 +216,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -299,6 +300,8 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
       return 0;
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
    default:
       NOUVEAU_ERR("unknown PIPE_SHADER_CAP %d\n", param);
       return 0;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index f34ad0ed5d1..af8e5f72670 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -202,6 +202,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -312,6 +313,8 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       return 16; /* would be 32 in linked (OpenGL-style) mode */
    case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
       return 16; /* XXX not sure if more are really safe */
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
    default:
       NOUVEAU_ERR("unknown PIPE_SHADER_CAP %d\n", param);
       return 0;
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 1165ac8a9c0..a576abdfaf2 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -197,6 +197,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_DEPTH_BOUNDS_TEST:
         case PIPE_CAP_TGSI_TXQS:
         case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+        case PIPE_CAP_SHAREABLE_SHADERS:
             return 0;
 
         /* SWTCL-only features. */
@@ -302,6 +303,8 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
         case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
             return 0;
+        case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+            return 32;
         case PIPE_SHADER_CAP_PREFERRED_IR:
             return PIPE_SHADER_IR_TGSI;
         }
@@ -358,6 +361,8 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
         case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
             return 0;
+        case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+            return 32;
         case PIPE_SHADER_CAP_PREFERRED_IR:
             return PIPE_SHADER_IR_TGSI;
         }
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 32ce76a9e07..9a97de9965e 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -343,6 +343,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
 	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+	case PIPE_CAP_SHAREABLE_SHADERS:
 		return 0;
 
 	/* Stream output. */
@@ -510,6 +511,12 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
 	case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
 		return 0;
+	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+		/* due to a bug in the shader compiler, some loops hang
+		 * if they are not unrolled, see:
+		 *    https://bugs.freedesktop.org/show_bug.cgi?id=86720
+		 */
+		return 255;
 	}
 	return 0;
 }
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index d5c5db30029..082ea850675 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -55,11 +55,11 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
 	util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
 	util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
 	util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
-	util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader);
-	util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader);
-	util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader);
-	util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader);
-	util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader);
+	util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
+	util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
+	util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader.cso);
+	util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso);
+	util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
 	util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements);
 	util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask);
 	util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]);
diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c
index 7d41e8d00e0..53062187b88 100644
--- a/src/gallium/drivers/radeonsi/si_debug.c
+++ b/src/gallium/drivers/radeonsi/si_debug.c
@@ -31,15 +31,15 @@
 #include "ddebug/dd_util.h"
 
 
-static void si_dump_shader(struct si_shader_selector *sel, const char *name,
+static void si_dump_shader(struct si_shader_ctx_state *state, const char *name,
 			   FILE *f)
 {
-	if (!sel || !sel->current)
+	if (!state->cso || !state->current) /* no selector or no variant: nothing to dump */
 		return;
 
 	fprintf(f, "%s shader disassembly:\n", name);
-	si_dump_shader_key(sel->type, &sel->current->key, f);
-	fprintf(f, "%s\n\n", sel->current->binary.disasm_string);
+	si_dump_shader_key(state->cso->type, &state->current->key, f); /* type comes from the selector, key from the variant */
+	fprintf(f, "%s\n\n", state->current->binary.disasm_string);
 }
 
 /* Parsed IBs are difficult to read without colors. Use "less -R file" to
@@ -536,11 +536,11 @@ static void si_dump_debug_state(struct pipe_context *ctx, FILE *f,
 	if (flags & PIPE_DEBUG_DEVICE_IS_HUNG)
 		si_dump_debug_registers(sctx, f);
 
-	si_dump_shader(sctx->vs_shader, "Vertex", f);
-	si_dump_shader(sctx->tcs_shader, "Tessellation control", f);
-	si_dump_shader(sctx->tes_shader, "Tessellation evaluation", f);
-	si_dump_shader(sctx->gs_shader, "Geometry", f);
-	si_dump_shader(sctx->ps_shader, "Fragment", f);
+	si_dump_shader(&sctx->vs_shader, "Vertex", f);
+	si_dump_shader(&sctx->tcs_shader, "Tessellation control", f);
+	si_dump_shader(&sctx->tes_shader, "Tessellation evaluation", f);
+	si_dump_shader(&sctx->gs_shader, "Geometry", f);
+	si_dump_shader(&sctx->ps_shader, "Fragment", f);
 
 	si_dump_last_bo_list(sctx, f);
 	si_dump_last_ib(sctx, f);
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 19dd14f9b6f..13738da5e2c 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -915,10 +915,10 @@ static void si_set_user_data_base(struct si_context *sctx,
 void si_shader_change_notify(struct si_context *sctx)
 {
 	/* VS can be bound as VS, ES, or LS. */
-	if (sctx->tes_shader)
+	if (sctx->tes_shader.cso)
 		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
 				      R_00B530_SPI_SHADER_USER_DATA_LS_0);
-	else if (sctx->gs_shader)
+	else if (sctx->gs_shader.cso)
 		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
 				      R_00B330_SPI_SHADER_USER_DATA_ES_0);
 	else
@@ -926,8 +926,8 @@ void si_shader_change_notify(struct si_context *sctx)
 				      R_00B130_SPI_SHADER_USER_DATA_VS_0);
 
 	/* TES can be bound as ES, VS, or not bound. */
-	if (sctx->tes_shader) {
-		if (sctx->gs_shader)
+	if (sctx->tes_shader.cso) {
+		if (sctx->gs_shader.cso)
 			si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
 					      R_00B330_SPI_SHADER_USER_DATA_ES_0);
 		else
@@ -964,7 +964,7 @@ void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom)
 	unsigned i;
 	uint32_t *sh_base = sctx->shader_userdata.sh_base;
 
-	if (sctx->gs_shader) {
+	if (sctx->gs_shader.cso) {
 		/* The VS copy shader needs these for clipping, streamout, and rings. */
 		unsigned vs_base = R_00B130_SPI_SHADER_USER_DATA_VS_0;
 		unsigned i = PIPE_SHADER_VERTEX;
@@ -975,7 +975,7 @@ void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom)
 		/* The TESSEVAL shader needs this for the ESGS ring buffer. */
 		si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc,
 				       R_00B330_SPI_SHADER_USER_DATA_ES_0, true);
-	} else if (sctx->tes_shader) {
+	} else if (sctx->tes_shader.cso) {
 		/* The TESSEVAL shader needs this for streamout. */
 		si_emit_shader_pointer(sctx, &sctx->rw_buffers[PIPE_SHADER_VERTEX].desc,
 				       R_00B130_SPI_SHADER_USER_DATA_VS_0, true);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 53c80dba602..5f910c95ef3 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -57,8 +57,8 @@ static void si_destroy_context(struct pipe_context *context)
 		sctx->b.b.delete_sampler_state(&sctx->b.b, sctx->pstipple_sampler_state);
 	if (sctx->dummy_pixel_shader)
 		sctx->b.b.delete_fs_state(&sctx->b.b, sctx->dummy_pixel_shader);
-	if (sctx->fixed_func_tcs_shader)
-		sctx->b.b.delete_tcs_state(&sctx->b.b, sctx->fixed_func_tcs_shader);
+	if (sctx->fixed_func_tcs_shader.cso)
+		sctx->b.b.delete_tcs_state(&sctx->b.b, sctx->fixed_func_tcs_shader.cso);
 	if (sctx->custom_dsa_flush)
 		sctx->b.b.delete_depth_stencil_alpha_state(&sctx->b.b, sctx->custom_dsa_flush);
 	if (sctx->custom_blend_resolve)
@@ -293,7 +293,9 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
 	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_DEPTH_BOUNDS_TEST:
+	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_TEXTURE_QUERY_LOD:
 	case PIPE_CAP_TEXTURE_GATHER_SM5:
 	case PIPE_CAP_TGSI_TXQS:
@@ -335,7 +337,6 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_USER_VERTEX_BUFFERS:
 	case PIPE_CAP_FAKE_SW_MSAA:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
-	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
 		return 0;
 
@@ -507,6 +508,8 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 	case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
 		return 1;
+	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+		return 32;
 	}
 	return 0;
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 2abd5b5a0c3..d7a2282952a 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -152,6 +152,15 @@ struct si_viewports {
 	struct pipe_viewport_state	states[SI_MAX_VIEWPORTS];
 };
 
+/* A shader state consists of the shader selector, which is a constant state
+ * object shared by multiple contexts and shouldn't be modified, and
+ * the current shader variant selected for this context.
+ */
+struct si_shader_ctx_state {
+	struct si_shader_selector	*cso; /* the shader selector */
+	struct si_shader		*current; /* variant selected for this context */
+};
+
 struct si_context {
 	struct r600_common_context	b;
 	struct blitter_context		*blitter;
@@ -162,7 +171,7 @@ struct si_context {
 	void				*pstipple_sampler_state;
 	struct si_screen		*screen;
 	struct pipe_fence_handle	*last_gfx_fence;
-	struct si_shader_selector	*fixed_func_tcs_shader;
+	struct si_shader_ctx_state	fixed_func_tcs_shader;
 	LLVMTargetMachineRef		tm;
 
 	/* Atoms (direct states). */
@@ -199,11 +208,11 @@ struct si_context {
 	void				*dummy_pixel_shader;
 
 	/* shaders */
-	struct si_shader_selector	*ps_shader;
-	struct si_shader_selector	*gs_shader;
-	struct si_shader_selector	*vs_shader;
-	struct si_shader_selector	*tcs_shader;
-	struct si_shader_selector	*tes_shader;
+	struct si_shader_ctx_state	ps_shader;
+	struct si_shader_ctx_state	gs_shader;
+	struct si_shader_ctx_state	vs_shader;
+	struct si_shader_ctx_state	tcs_shader;
+	struct si_shader_ctx_state	tes_shader;
 	struct si_cs_shader_state	cs_shader_state;
 
 	/* shader information */
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 54dad726d01..fd5500c1ab3 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -179,15 +179,18 @@ struct radeon_shader_reloc;
 
 struct si_shader;
 
+/* A shader selector is a gallium CSO and contains shader variants and
+ * binaries for one TGSI program. This can be shared by multiple contexts.
+ */
 struct si_shader_selector {
-	struct si_shader *current;
+	pipe_mutex		mutex;
+	struct si_shader	*first_variant; /* immutable after the first variant */
+	struct si_shader	*last_variant; /* mutable */
 
 	struct tgsi_token       *tokens;
 	struct pipe_stream_output_info  so;
 	struct tgsi_shader_info		info;
 
-	unsigned	num_shaders;
-
 	/* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
 	unsigned	type;
 
@@ -241,7 +244,7 @@ union si_shader_key {
 		uint64_t	es_enabled_outputs;
 		unsigned	as_es:1; /* export shader */
 		unsigned	as_ls:1; /* local shader */
-		unsigned	export_prim_id; /* when PS needs it and GS is disabled */
+		unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
 	} vs;
 	struct {
 		unsigned	prim_mode:3;
@@ -252,7 +255,7 @@ union si_shader_key {
 		 * This describes how outputs are laid out in memory. */
 		uint64_t	es_enabled_outputs;
 		unsigned	as_es:1; /* export shader */
-		unsigned	export_prim_id; /* when PS needs it and GS is disabled */
+		unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
 	} tes; /* tessellation evaluation shader */
 };
 
@@ -293,24 +296,24 @@ struct si_shader {
 
 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
 {
-	if (sctx->gs_shader)
-		return &sctx->gs_shader->info;
-	else if (sctx->tes_shader)
-		return &sctx->tes_shader->info;
-	else if (sctx->vs_shader)
-		return &sctx->vs_shader->info;
+	if (sctx->gs_shader.cso) /* priority order: GS, then TES, then VS */
+		return &sctx->gs_shader.cso->info;
+	else if (sctx->tes_shader.cso)
+		return &sctx->tes_shader.cso->info;
+	else if (sctx->vs_shader.cso)
+		return &sctx->vs_shader.cso->info; /* info is read from the selector */
 	else
 		return NULL;
 }
 
 static inline struct si_shader* si_get_vs_state(struct si_context *sctx)
 {
-	if (sctx->gs_shader)
-		return sctx->gs_shader->current->gs_copy_shader;
-	else if (sctx->tes_shader)
-		return sctx->tes_shader->current;
+	if (sctx->gs_shader.current) /* tests the selected variant, not the selector */
+		return sctx->gs_shader.current->gs_copy_shader;
+	else if (sctx->tes_shader.current)
+		return sctx->tes_shader.current;
 	else
-		return sctx->vs_shader->current;
+		return sctx->vs_shader.current; /* NOTE(review): presumably NULL when no VS variant is selected - confirm callers cope */
 }
 
 static inline bool si_vs_exports_prim_id(struct si_shader *shader)
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index e6475364f98..243bdc6e6d7 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -266,7 +266,7 @@ static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *at
 	 * Reproducible with Unigine Heaven 4.0 and drirc missing.
 	 */
 	if (blend->dual_src_blend &&
-	    (sctx->ps_shader->ps_colors_written & 0x3) != 0x3)
+	    (sctx->ps_shader.cso->ps_colors_written & 0x3) != 0x3)
 		mask = 0;
 
 	radeon_set_context_reg(cs, R_028238_CB_TARGET_MASK, mask);
@@ -1535,9 +1535,14 @@ static unsigned si_tex_compare(unsigned compare)
 	}
 }
 
-static unsigned si_tex_dim(unsigned dim, unsigned nr_samples)
+static unsigned si_tex_dim(unsigned res_target, unsigned view_target,
+			   unsigned nr_samples)
 {
-	switch (dim) {
+	if (view_target == PIPE_TEXTURE_CUBE ||
+	    view_target == PIPE_TEXTURE_CUBE_ARRAY)
+		res_target = view_target;
+
+	switch (res_target) {
 	default:
 	case PIPE_TEXTURE_1D:
 		return V_008F1C_SQ_RSRC_IMG_1D;
@@ -2391,6 +2396,7 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 	struct radeon_surf_level *surflevel;
 	int first_non_void;
 	uint64_t va;
+	unsigned last_layer = state->u.tex.last_layer;
 
 	if (view == NULL)
 		return NULL;
@@ -2596,6 +2602,13 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 	} else if (texture->target == PIPE_TEXTURE_CUBE_ARRAY)
 		depth = texture->array_size / 6;
 
+	/* This is not needed if state trackers set last_layer correctly. */
+	if (state->target == PIPE_TEXTURE_1D ||
+	    state->target == PIPE_TEXTURE_2D ||
+	    state->target == PIPE_TEXTURE_RECT ||
+	    state->target == PIPE_TEXTURE_CUBE)
+		last_layer = state->u.tex.first_layer;
+
 	va = tmp->resource.gpu_address + surflevel[base_level].offset;
 
 	view->state[0] = va >> 8;
@@ -2615,10 +2628,11 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 						      last_level) |
 			  S_008F1C_TILING_INDEX(si_tile_mode_index(tmp, base_level, false)) |
 			  S_008F1C_POW2_PAD(texture->last_level > 0) |
-			  S_008F1C_TYPE(si_tex_dim(texture->target, texture->nr_samples)));
+			  S_008F1C_TYPE(si_tex_dim(texture->target, state->target,
+						   texture->nr_samples)));
 	view->state[4] = (S_008F20_DEPTH(depth - 1) | S_008F20_PITCH(pitch - 1));
 	view->state[5] = (S_008F24_BASE_ARRAY(state->u.tex.first_layer) |
-			  S_008F24_LAST_ARRAY(state->u.tex.last_layer));
+			  S_008F24_LAST_ARRAY(last_layer));
 	view->state[6] = 0;
 	view->state[7] = 0;
 
@@ -2653,11 +2667,12 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 				       S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) |
 				       S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
 				       S_008F1C_TILING_INDEX(tmp->fmask.tile_mode_index) |
-				       S_008F1C_TYPE(si_tex_dim(texture->target, 0));
+				       S_008F1C_TYPE(si_tex_dim(texture->target,
+								state->target, 0));
 		view->fmask_state[4] = S_008F20_DEPTH(depth - 1) |
 				       S_008F20_PITCH(tmp->fmask.pitch - 1);
 		view->fmask_state[5] = S_008F24_BASE_ARRAY(state->u.tex.first_layer) |
-				       S_008F24_LAST_ARRAY(state->u.tex.last_layer);
+				       S_008F24_LAST_ARRAY(last_layer);
 		view->fmask_state[6] = 0;
 		view->fmask_state[7] = 0;
 	}
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 5face423941..ce6c98c3124 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -109,11 +109,11 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 				       unsigned *num_patches)
 {
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	struct si_shader_selector *ls = sctx->vs_shader;
+	struct si_shader_ctx_state *ls = &sctx->vs_shader;
 	/* The TES pointer will only be used for sctx->last_tcs.
 	 * It would be wrong to think that TCS = TES. */
 	struct si_shader_selector *tcs =
-		sctx->tcs_shader ? sctx->tcs_shader : sctx->tes_shader;
+		sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso;
 	unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL];
 	unsigned num_tcs_input_cp = info->vertices_per_patch;
 	unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
@@ -138,9 +138,9 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 
 	/* This calculates how shader inputs and outputs among VS, TCS, and TES
 	 * are laid out in LDS. */
-	num_tcs_inputs = util_last_bit64(ls->outputs_written);
+	num_tcs_inputs = util_last_bit64(ls->cso->outputs_written);
 
-	if (sctx->tcs_shader) {
+	if (sctx->tcs_shader.cso) {
 		num_tcs_outputs = util_last_bit64(tcs->outputs_written);
 		num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
 		num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written);
@@ -159,7 +159,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
 	output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
 
-	output_patch0_offset = sctx->tcs_shader ? input_patch_size * *num_patches : 0;
+	output_patch0_offset = sctx->tcs_shader.cso ? input_patch_size * *num_patches : 0;
 	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
 
 	lds_size = output_patch0_offset + output_patch_size * *num_patches;
@@ -231,13 +231,13 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 	bool partial_vs_wave = false;
 	bool partial_es_wave = false;
 
-	if (sctx->gs_shader)
+	if (sctx->gs_shader.cso)
 		primgroup_size = 64; /* recommended with a GS */
 
-	if (sctx->tes_shader) {
+	if (sctx->tes_shader.cso) {
 		unsigned num_cp_out =
-			sctx->tcs_shader ?
-			sctx->tcs_shader->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] :
+			sctx->tcs_shader.cso ?
+			sctx->tcs_shader.cso->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] :
 			info->vertices_per_patch;
 		unsigned max_size = 256 / MAX2(info->vertices_per_patch, num_cp_out);
 
@@ -248,8 +248,8 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 
 		/* SWITCH_ON_EOI must be set if PrimID is used.
 		 * If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
-		if ((sctx->tcs_shader && sctx->tcs_shader->info.uses_primid) ||
-		    sctx->tes_shader->info.uses_primid) {
+		if ((sctx->tcs_shader.cso && sctx->tcs_shader.cso->info.uses_primid) ||
+		    sctx->tes_shader.cso->info.uses_primid) {
 			ia_switch_on_eoi = true;
 			partial_es_wave = true;
 		}
@@ -258,7 +258,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 		if ((sctx->b.family == CHIP_TAHITI ||
 		     sctx->b.family == CHIP_PITCAIRN ||
 		     sctx->b.family == CHIP_BONAIRE) &&
-		    sctx->gs_shader)
+		    sctx->gs_shader.cso)
 			partial_vs_wave = true;
 	}
 
@@ -328,11 +328,11 @@ static unsigned si_get_ls_hs_config(struct si_context *sctx,
 {
 	unsigned num_output_cp;
 
-	if (!sctx->tes_shader)
+	if (!sctx->tes_shader.cso)
 		return 0;
 
-	num_output_cp = sctx->tcs_shader ?
-		sctx->tcs_shader->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] :
+	num_output_cp = sctx->tcs_shader.cso ?
+		sctx->tcs_shader.cso->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] :
 		info->vertices_per_patch;
 
 	return S_028B58_NUM_PATCHES(num_patches) |
@@ -395,7 +395,7 @@ static void si_emit_draw_registers(struct si_context *sctx,
 	unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
 	unsigned ia_multi_vgt_param, ls_hs_config, num_patches = 0;
 
-	if (sctx->tes_shader)
+	if (sctx->tes_shader.cso)
 		si_emit_derived_tess_state(sctx, info, &num_patches);
 
 	ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches);
@@ -735,11 +735,11 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	    (info->indexed || !info->count_from_stream_output))
 		return;
 
-	if (!sctx->ps_shader || !sctx->vs_shader) {
+	if (!sctx->ps_shader.cso || !sctx->vs_shader.cso) {
 		assert(0);
 		return;
 	}
-	if (!!sctx->tes_shader != (info->mode == PIPE_PRIM_PATCHES)) {
+	if (!!sctx->tes_shader.cso != (info->mode == PIPE_PRIM_PATCHES)) {
 		assert(0);
 		return;
 	}
@@ -751,11 +751,11 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	 * This must be done after si_decompress_textures, which can call
 	 * draw_vbo recursively, and before si_update_shaders, which uses
 	 * current_rast_prim for this draw_vbo call. */
-	if (sctx->gs_shader)
-		sctx->current_rast_prim = sctx->gs_shader->gs_output_prim;
-	else if (sctx->tes_shader)
+	if (sctx->gs_shader.cso)
+		sctx->current_rast_prim = sctx->gs_shader.cso->gs_output_prim;
+	else if (sctx->tes_shader.cso)
 		sctx->current_rast_prim =
-			sctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
+			sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 	else
 		sctx->current_rast_prim = info->mode;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index c98509bb0b9..eea00e0fafc 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -404,6 +404,7 @@ static void si_shader_ps(struct si_shader *shader)
 	unsigned num_sgprs, num_user_sgprs;
 	unsigned spi_baryc_cntl = 0;
 	uint64_t va;
+	bool has_centroid;
 
 	pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
 
@@ -435,8 +436,11 @@ static void si_shader_ps(struct si_shader *shader)
 		}
 	}
 
+	has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->spi_ps_input_ena) ||
+		       G_0286CC_LINEAR_CENTROID_ENA(shader->spi_ps_input_ena);
+
 	spi_ps_in_control = S_0286D8_NUM_INTERP(shader->nparam) |
-		S_0286D8_BC_OPTIMIZE_DISABLE(1);
+			    S_0286D8_BC_OPTIMIZE_DISABLE(has_centroid);
 
 	si_pm4_set_reg(pm4, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl);
 	si_pm4_set_reg(pm4, R_0286D8_SPI_PS_IN_CONTROL, spi_ps_in_control);
@@ -523,26 +527,26 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 				key->vs.instance_divisors[i] =
 					sctx->vertex_elements->elements[i].instance_divisor;
 
-		if (sctx->tes_shader)
+		if (sctx->tes_shader.cso)
 			key->vs.as_ls = 1;
-		else if (sctx->gs_shader) {
+		else if (sctx->gs_shader.cso) {
 			key->vs.as_es = 1;
-			key->vs.es_enabled_outputs = sctx->gs_shader->inputs_read;
+			key->vs.es_enabled_outputs = sctx->gs_shader.cso->inputs_read;
 		}
 
-		if (!sctx->gs_shader && sctx->ps_shader &&
-		    sctx->ps_shader->info.uses_primid)
+		if (!sctx->gs_shader.cso && sctx->ps_shader.cso &&
+		    sctx->ps_shader.cso->info.uses_primid)
 			key->vs.export_prim_id = 1;
 		break;
 	case PIPE_SHADER_TESS_CTRL:
 		key->tcs.prim_mode =
-			sctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
+			sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 		break;
 	case PIPE_SHADER_TESS_EVAL:
-		if (sctx->gs_shader) {
+		if (sctx->gs_shader.cso) {
 			key->tes.as_es = 1;
-			key->tes.es_enabled_outputs = sctx->gs_shader->inputs_read;
-		} else if (sctx->ps_shader && sctx->ps_shader->info.uses_primid)
+			key->tes.es_enabled_outputs = sctx->gs_shader.cso->inputs_read;
+		} else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
 			key->tes.export_prim_id = 1;
 		break;
 	case PIPE_SHADER_GEOMETRY:
@@ -589,11 +593,13 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 
 /* Select the hw shader variant depending on the current state. */
 static int si_shader_select(struct pipe_context *ctx,
-			    struct si_shader_selector *sel)
+			    struct si_shader_ctx_state *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *sel = state->cso;
+	struct si_shader *current = state->current;
 	union si_shader_key key;
-	struct si_shader * shader = NULL;
+	struct si_shader *iter, *shader = NULL;
 	int r;
 
 	si_shader_selector_key(ctx, sel, &key);
@@ -602,49 +608,51 @@ static int si_shader_select(struct pipe_context *ctx,
 	 * This path is also used for most shaders that don't need multiple
 	 * variants, it will cost just a computation of the key and this
 	 * test. */
-	if (likely(sel->current && memcmp(&sel->current->key, &key, sizeof(key)) == 0)) {
+	if (likely(current && memcmp(&current->key, &key, sizeof(key)) == 0))
 		return 0;
-	}
 
-	/* lookup if we have other variants in the list */
-	if (sel->num_shaders > 1) {
-		struct si_shader *p = sel->current, *c = p->next_variant;
+	pipe_mutex_lock(sel->mutex);
 
-		while (c && memcmp(&c->key, &key, sizeof(key)) != 0) {
-			p = c;
-			c = c->next_variant;
-		}
-
-		if (c) {
-			p->next_variant = c->next_variant;
-			shader = c;
+	/* Find the shader variant. */
+	for (iter = sel->first_variant; iter; iter = iter->next_variant) {
+		/* Don't check the "current" shader. We checked it above. */
+		if (current != iter &&
+		    memcmp(&iter->key, &key, sizeof(key)) == 0) {
+			state->current = iter;
+			pipe_mutex_unlock(sel->mutex);
+			return 0;
 		}
 	}
 
-	if (shader) {
-		shader->next_variant = sel->current;
-		sel->current = shader;
+	/* Build a new shader. */
+	shader = CALLOC_STRUCT(si_shader);
+	if (!shader) {
+		pipe_mutex_unlock(sel->mutex);
+		return -ENOMEM;
+	}
+	shader->selector = sel;
+	shader->key = key;
+
+	r = si_shader_create(sctx->screen, sctx->tm, shader);
+	if (unlikely(r)) {
+		R600_ERR("Failed to build shader variant (type=%u) %d\n",
+			 sel->type, r);
+		FREE(shader);
+		pipe_mutex_unlock(sel->mutex);
+		return r;
+	}
+	si_shader_init_pm4_state(shader);
+
+	if (!sel->last_variant) {
+		sel->first_variant = shader;
+		sel->last_variant = shader;
 	} else {
-		shader = CALLOC(1, sizeof(struct si_shader));
-		shader->selector = sel;
-		shader->key = key;
-
-		shader->next_variant = sel->current;
-		sel->current = shader;
-		r = si_shader_create((struct si_screen*)ctx->screen, sctx->tm,
-				     shader);
-		if (unlikely(r)) {
-			R600_ERR("Failed to build shader variant (type=%u) %d\n",
-				 sel->type, r);
-			sel->current = NULL;
-			FREE(shader);
-			return r;
-		}
-		si_shader_init_pm4_state(shader);
-		sel->num_shaders++;
-		p_atomic_inc(&sctx->screen->b.num_compilations);
+		sel->last_variant->next_variant = shader;
+		sel->last_variant = shader;
 	}
-
+	state->current = shader;
+	p_atomic_inc(&sctx->screen->b.num_compilations);
+	pipe_mutex_unlock(sel->mutex);
 	return 0;
 }
 
@@ -752,14 +760,18 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 		break;
 	}
 
-	if (sscreen->b.debug_flags & DBG_PRECOMPILE)
-		if (si_shader_select(ctx, sel)) {
+	pipe_mutex_init(sel->mutex); /* si_shader_select() locks it below */
+	if (sscreen->b.debug_flags & DBG_PRECOMPILE) {
+		struct si_shader_ctx_state state = {sel};
+		if (si_shader_select(ctx, &state)) {
 			fprintf(stderr, "radeonsi: can't create a shader\n");
 			tgsi_free_tokens(sel->tokens);
+			pipe_mutex_destroy(sel->mutex);
 			FREE(sel);
 			return NULL;
 		}
+	}
 
 	return sel;
 }
 
@@ -787,10 +799,11 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
 
-	if (sctx->vs_shader == sel || !sel)
+	if (sctx->vs_shader.cso == sel || !sel)
 		return;
 
-	sctx->vs_shader = sel;
+	sctx->vs_shader.cso = sel;
+	sctx->vs_shader.current = sel->first_variant;
 	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 	si_update_viewports_and_scissors(sctx);
 }
@@ -799,12 +812,13 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
-	bool enable_changed = !!sctx->gs_shader != !!sel;
+	bool enable_changed = !!sctx->gs_shader.cso != !!sel;
 
-	if (sctx->gs_shader == sel)
+	if (sctx->gs_shader.cso == sel)
 		return;
 
-	sctx->gs_shader = sel;
+	sctx->gs_shader.cso = sel;
+	sctx->gs_shader.current = sel ? sel->first_variant : NULL;
 	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
 
@@ -817,12 +831,13 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
-	bool enable_changed = !!sctx->tcs_shader != !!sel;
+	bool enable_changed = !!sctx->tcs_shader.cso != !!sel;
 
-	if (sctx->tcs_shader == sel)
+	if (sctx->tcs_shader.cso == sel)
 		return;
 
-	sctx->tcs_shader = sel;
+	sctx->tcs_shader.cso = sel;
+	sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
 
 	if (enable_changed)
 		sctx->last_tcs = NULL; /* invalidate derived tess state */
@@ -832,12 +847,13 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
-	bool enable_changed = !!sctx->tes_shader != !!sel;
+	bool enable_changed = !!sctx->tes_shader.cso != !!sel;
 
-	if (sctx->tes_shader == sel)
+	if (sctx->tes_shader.cso == sel)
 		return;
 
-	sctx->tes_shader = sel;
+	sctx->tes_shader.cso = sel;
+	sctx->tes_shader.current = sel ? sel->first_variant : NULL;
 	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
 
@@ -864,7 +880,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 	struct si_shader_selector *sel = state;
 
 	/* skip if supplied shader is one already in use */
-	if (sctx->ps_shader == sel)
+	if (sctx->ps_shader.cso == sel)
 		return;
 
 	/* use a dummy shader if binding a NULL shader */
@@ -873,7 +889,8 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 		sel = sctx->dummy_pixel_shader;
 	}
 
-	sctx->ps_shader = sel;
+	sctx->ps_shader.cso = sel;
+	sctx->ps_shader.current = sel->first_variant;
 	si_mark_atom_dirty(sctx, &sctx->cb_target_mask);
 }
 
@@ -881,8 +898,8 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = (struct si_shader_selector *)state;
-	struct si_shader *p = sel->current, *c;
-	struct si_shader_selector **current_shader[SI_NUM_SHADERS] = {
+	struct si_shader *p = sel->first_variant, *c;
+	struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = {
 		[PIPE_SHADER_VERTEX] = &sctx->vs_shader,
 		[PIPE_SHADER_TESS_CTRL] = &sctx->tcs_shader,
 		[PIPE_SHADER_TESS_EVAL] = &sctx->tes_shader,
@@ -890,8 +907,10 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 		[PIPE_SHADER_FRAGMENT] = &sctx->ps_shader,
 	};
 
-	if (*current_shader[sel->type] == sel)
-		*current_shader[sel->type] = NULL;
+	if (current_shader[sel->type]->cso == sel) {
+		current_shader[sel->type]->cso = NULL;
+		current_shader[sel->type]->current = NULL;
+	}
 
 	while (p) {
 		c = p->next_variant;
@@ -927,6 +946,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 		p = c;
 	}
 
+	pipe_mutex_destroy(sel->mutex);
 	free(sel->tokens);
 	free(sel);
 }
@@ -934,7 +954,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
 {
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	struct si_shader *ps = sctx->ps_shader->current;
+	struct si_shader *ps = sctx->ps_shader.current;
 	struct si_shader *vs = si_get_vs_state(sctx);
 	struct tgsi_shader_info *psinfo = &ps->selector->info;
 	struct tgsi_shader_info *vsinfo = &vs->selector->info;
@@ -1004,7 +1024,7 @@ bcolor:
 static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom)
 {
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	struct si_shader *ps = sctx->ps_shader->current;
+	struct si_shader *ps = sctx->ps_shader.current;
 	unsigned input_ena = ps->spi_ps_input_ena;
 
 	/* we need to enable at least one of them, otherwise we hang the GPU */
@@ -1133,7 +1153,7 @@ static void si_init_gs_rings(struct si_context *sctx)
 
 static void si_update_gs_rings(struct si_context *sctx)
 {
-	unsigned gsvs_itemsize = sctx->gs_shader->gsvs_itemsize;
+	unsigned gsvs_itemsize = sctx->gs_shader.cso->gsvs_itemsize;
 	uint64_t offset;
 
 	if (gsvs_itemsize == sctx->last_gsvs_itemsize)
@@ -1167,17 +1187,14 @@ static void si_update_gs_rings(struct si_context *sctx)
  *          < 0 if there was a failure
  */
 static int si_update_scratch_buffer(struct si_context *sctx,
-				    struct si_shader_selector *sel)
+				    struct si_shader *shader)
 {
-	struct si_shader *shader;
 	uint64_t scratch_va = sctx->scratch_buffer->gpu_address;
 	int r;
 
-	if (!sel)
+	if (!shader)
 		return 0;
 
-	shader = sel->current;
-
 	/* This shader doesn't need a scratch buffer */
 	if (shader->scratch_bytes_per_wave == 0)
 		return 0;
@@ -1209,20 +1226,20 @@ static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx)
 	return sctx->scratch_buffer ? sctx->scratch_buffer->b.b.width0 : 0;
 }
 
-static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader_selector *sel)
+static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader)
 {
-	return sel ? sel->current->scratch_bytes_per_wave : 0;
+	return shader ? shader->scratch_bytes_per_wave : 0;
 }
 
 static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
 {
 	unsigned bytes = 0;
 
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader));
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader));
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader));
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tcs_shader));
-	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tcs_shader.current));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current));
 	return bytes;
 }
 
@@ -1256,46 +1273,46 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
 		 * last used, so we still need to try to update them, even if
 		 * they require scratch buffers smaller than the current size.
 		 */
-		r = si_update_scratch_buffer(sctx, sctx->ps_shader);
+		r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
 		if (r < 0)
 			return false;
 		if (r == 1)
-			si_pm4_bind_state(sctx, ps, sctx->ps_shader->current->pm4);
+			si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
 
-		r = si_update_scratch_buffer(sctx, sctx->gs_shader);
+		r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
 		if (r < 0)
 			return false;
 		if (r == 1)
-			si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4);
+			si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
 
-		r = si_update_scratch_buffer(sctx, sctx->tcs_shader);
+		r = si_update_scratch_buffer(sctx, sctx->tcs_shader.current);
 		if (r < 0)
 			return false;
 		if (r == 1)
-			si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4);
+			si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
 
 		/* VS can be bound as LS, ES, or VS. */
-		r = si_update_scratch_buffer(sctx, sctx->vs_shader);
+		r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
 		if (r < 0)
 			return false;
 		if (r == 1) {
-			if (sctx->tes_shader)
-				si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4);
-			else if (sctx->gs_shader)
-				si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4);
+			if (sctx->tes_shader.current)
+				si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
+			else if (sctx->gs_shader.current)
+				si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
 			else
-				si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
+				si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
 		}
 
 		/* TES can be bound as ES or VS. */
-		r = si_update_scratch_buffer(sctx, sctx->tes_shader);
+		r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
 		if (r < 0)
 			return false;
 		if (r == 1) {
-			if (sctx->gs_shader)
-				si_pm4_bind_state(sctx, es, sctx->tes_shader->current->pm4);
+			if (sctx->gs_shader.current)
+				si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
 			else
-				si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4);
+				si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
 		}
 	}
 
@@ -1361,7 +1378,7 @@ static void si_generate_fixed_func_tcs(struct si_context *sctx)
 	if (!ureg)
 		return; /* if we get here, we're screwed */
 
-	assert(!sctx->fixed_func_tcs_shader);
+	assert(!sctx->fixed_func_tcs_shader.cso);
 
 	ureg_DECL_constant2D(ureg, 0, 1, SI_DRIVER_STATE_CONST_BUF);
 	const0 = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, 0),
@@ -1376,7 +1393,7 @@ static void si_generate_fixed_func_tcs(struct si_context *sctx)
 	ureg_MOV(ureg, tessinner, const1);
 	ureg_END(ureg);
 
-	sctx->fixed_func_tcs_shader =
+	sctx->fixed_func_tcs_shader.cso =
 		ureg_create_shader_and_destroy(ureg, &sctx->b.b);
 }
 
@@ -1384,7 +1401,7 @@ static void si_update_vgt_shader_config(struct si_context *sctx)
 {
 	/* Calculate the index of the config.
 	 * 0 = VS, 1 = VS+GS, 2 = VS+Tess, 3 = VS+Tess+GS */
-	unsigned index = 2*!!sctx->tes_shader + !!sctx->gs_shader;
+	unsigned index = 2*!!sctx->tes_shader.cso + !!sctx->gs_shader.cso;
 	struct si_pm4_state **pm4 = &sctx->vgt_shader_config[index];
 
 	if (!*pm4) {
@@ -1392,17 +1409,17 @@ static void si_update_vgt_shader_config(struct si_context *sctx)
 
 		*pm4 = CALLOC_STRUCT(si_pm4_state);
 
-		if (sctx->tes_shader) {
+		if (sctx->tes_shader.cso) {
 			stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) |
 				  S_028B54_HS_EN(1);
 
-			if (sctx->gs_shader)
+			if (sctx->gs_shader.cso)
 				stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
 					  S_028B54_GS_EN(1) |
 				          S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
 			else
 				stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
-		} else if (sctx->gs_shader) {
+		} else if (sctx->gs_shader.cso) {
 			stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
 				  S_028B54_GS_EN(1) |
 			          S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
@@ -1432,7 +1449,7 @@ bool si_update_shaders(struct si_context *sctx)
 	int r;
 
 	/* Update stages before GS. */
-	if (sctx->tes_shader) {
+	if (sctx->tes_shader.cso) {
 		if (!sctx->tf_ring) {
 			si_init_tess_factor_ring(sctx);
 			if (!sctx->tf_ring)
@@ -1440,65 +1457,65 @@ bool si_update_shaders(struct si_context *sctx)
 		}
 
 		/* VS as LS */
-		r = si_shader_select(ctx, sctx->vs_shader);
+		r = si_shader_select(ctx, &sctx->vs_shader);
 		if (r)
 			return false;
-		si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4);
+		si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
 
-		if (sctx->tcs_shader) {
-			r = si_shader_select(ctx, sctx->tcs_shader);
+		if (sctx->tcs_shader.cso) {
+			r = si_shader_select(ctx, &sctx->tcs_shader);
 			if (r)
 				return false;
-			si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4);
+			si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
 		} else {
-			if (!sctx->fixed_func_tcs_shader) {
+			if (!sctx->fixed_func_tcs_shader.cso) {
 				si_generate_fixed_func_tcs(sctx);
-				if (!sctx->fixed_func_tcs_shader)
+				if (!sctx->fixed_func_tcs_shader.cso)
 					return false;
 			}
 
-			r = si_shader_select(ctx, sctx->fixed_func_tcs_shader);
+			r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader);
 			if (r)
 				return false;
 			si_pm4_bind_state(sctx, hs,
-					  sctx->fixed_func_tcs_shader->current->pm4);
+					  sctx->fixed_func_tcs_shader.current->pm4);
 		}
 
-		r = si_shader_select(ctx, sctx->tes_shader);
+		r = si_shader_select(ctx, &sctx->tes_shader);
 		if (r)
 			return false;
 
-		if (sctx->gs_shader) {
+		if (sctx->gs_shader.cso) {
 			/* TES as ES */
-			si_pm4_bind_state(sctx, es, sctx->tes_shader->current->pm4);
+			si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
 		} else {
 			/* TES as VS */
-			si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4);
-			si_update_so(sctx, sctx->tes_shader);
+			si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
+			si_update_so(sctx, sctx->tes_shader.cso);
 		}
-	} else if (sctx->gs_shader) {
+	} else if (sctx->gs_shader.cso) {
 		/* VS as ES */
-		r = si_shader_select(ctx, sctx->vs_shader);
+		r = si_shader_select(ctx, &sctx->vs_shader);
 		if (r)
 			return false;
-		si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4);
+		si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
 	} else {
 		/* VS as VS */
-		r = si_shader_select(ctx, sctx->vs_shader);
+		r = si_shader_select(ctx, &sctx->vs_shader);
 		if (r)
 			return false;
-		si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
-		si_update_so(sctx, sctx->vs_shader);
+		si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
+		si_update_so(sctx, sctx->vs_shader.cso);
 	}
 
 	/* Update GS. */
-	if (sctx->gs_shader) {
-		r = si_shader_select(ctx, sctx->gs_shader);
+	if (sctx->gs_shader.cso) {
+		r = si_shader_select(ctx, &sctx->gs_shader);
 		if (r)
 			return false;
-		si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4);
-		si_pm4_bind_state(sctx, vs, sctx->gs_shader->current->gs_copy_shader->pm4);
-		si_update_so(sctx, sctx->gs_shader);
+		si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
+		si_pm4_bind_state(sctx, vs, sctx->gs_shader.current->gs_copy_shader->pm4);
+		si_update_so(sctx, sctx->gs_shader.cso);
 
 		if (!sctx->gsvs_ring) {
 			si_init_gs_rings(sctx);
@@ -1514,10 +1531,10 @@ bool si_update_shaders(struct si_context *sctx)
 
 	si_update_vgt_shader_config(sctx);
 
-	r = si_shader_select(ctx, sctx->ps_shader);
+	r = si_shader_select(ctx, &sctx->ps_shader);
 	if (r)
 		return false;
-	si_pm4_bind_state(sctx, ps, sctx->ps_shader->current->pm4);
+	si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
 
 	if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) ||
 	    sctx->sprite_coord_enable != rs->sprite_coord_enable ||
@@ -1543,13 +1560,13 @@ bool si_update_shaders(struct si_context *sctx)
 			return false;
 	}
 
-	if (sctx->ps_db_shader_control != sctx->ps_shader->current->db_shader_control) {
-		sctx->ps_db_shader_control = sctx->ps_shader->current->db_shader_control;
+	if (sctx->ps_db_shader_control != sctx->ps_shader.current->db_shader_control) {
+		sctx->ps_db_shader_control = sctx->ps_shader.current->db_shader_control;
 		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 	}
 
-	if (sctx->smoothing_enabled != sctx->ps_shader->current->key.ps.poly_line_smoothing) {
-		sctx->smoothing_enabled = sctx->ps_shader->current->key.ps.poly_line_smoothing;
+	if (sctx->smoothing_enabled != sctx->ps_shader.current->key.ps.poly_line_smoothing) {
+		sctx->smoothing_enabled = sctx->ps_shader.current->key.ps.poly_line_smoothing;
 		si_mark_atom_dirty(sctx, &sctx->msaa_config);
 
 		if (sctx->b.chip_class == SI)
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index d468cf4de54..e7006d2fa0d 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -248,6 +248,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_DEPTH_BOUNDS_TEST:
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index dab89814334..f6fafca5c0b 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -381,6 +381,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_DEPTH_BOUNDS_TEST:
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
    }
 
@@ -455,6 +456,8 @@ vgpu9_get_shader_param(struct pipe_screen *screen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
          return 0;
+      case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+         return 32;
       }
       /* If we get here, we failed to handle a cap above */
       debug_printf("Unexpected fragment shader query %u\n", param);
@@ -511,6 +514,8 @@ vgpu9_get_shader_param(struct pipe_screen *screen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
          return 0;
+      case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+         return 32;
       }
       /* If we get here, we failed to handle a cap above */
       debug_printf("Unexpected vertex shader query %u\n", param);
@@ -600,6 +605,8 @@ vgpu10_get_shader_param(struct pipe_screen *screen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
       return 0;
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
    default:
       debug_printf("Unexpected vgpu10 shader query %u\n", param);
       return 0;
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
index a842d604a51..17b524653bb 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -393,7 +393,7 @@ vc4_nir_lower_blend_block(nir_block *block, void *state)
                         continue;
 
                 nir_variable *output_var = NULL;
-                foreach_list_typed(nir_variable, var, node, &c->s->outputs) {
+                nir_foreach_variable(var, &c->s->outputs) {
                         if (var->data.driver_location == intr->const_index[0]) {
                                 output_var = var;
                                 break;
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index a98d70da7d8..caf706aa2a6 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -23,6 +23,7 @@
 
 #include "vc4_qir.h"
 #include "glsl/nir/nir_builder.h"
+#include "util/u_format.h"
 
 /**
  * Walks the NIR generated by TGSI-to-NIR to lower its io intrinsics into
@@ -50,20 +51,188 @@ replace_intrinsic_with_vec4(nir_builder *b, nir_intrinsic_instr *intr,
         nir_instr_remove(&intr->instr);
 }
 
+/** Returns the 8 bit field as an unsigned 32-bit value. */
+static nir_ssa_def *
+vc4_nir_unpack_8i(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        nir_ssa_def *bit_offset = nir_imm_int(b, 8 * chan);
+
+        return nir_ubitfield_extract(b, src, bit_offset, nir_imm_int(b, 8));
+}
+
+/** Returns the 16 bit field as a sign-extended 32-bit value. */
+static nir_ssa_def *
+vc4_nir_unpack_16i(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        nir_ssa_def *bit_offset = nir_imm_int(b, 16 * chan);
+        nir_ssa_def *bit_count = nir_imm_int(b, 16);
+
+        return nir_ibitfield_extract(b, src, bit_offset, bit_count);
+}
+
+/** Returns the 16 bit field as an unsigned 32 bit value. */
+static nir_ssa_def *
+vc4_nir_unpack_16u(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        /* chan 0 is the low half and gets masked out; any other chan is
+         * taken to be the high half and shifted down. */
+        if (chan != 0)
+                return nir_ushr(b, src, nir_imm_int(b, 16));
+        return nir_iand(b, src, nir_imm_int(b, 0xffff));
+}
+
+static nir_ssa_def *
+vc4_nir_unpack_8f(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        return nir_swizzle(b, nir_unpack_unorm_4x8(b, src), &chan, 1, false);
+}
+
+static nir_ssa_def *
+vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c,
+                              nir_builder *b,
+                              nir_ssa_def **vpm_reads,
+                              uint8_t swiz,
+                              const struct util_format_description *desc)
+{
+        const struct util_format_channel_description *chan =
+                &desc->channel[swiz];
+        nir_ssa_def *temp;
+
+        if (swiz > UTIL_FORMAT_SWIZZLE_W) {
+                return vc4_nir_get_swizzled_channel(b, vpm_reads, swiz);
+        } else if (chan->size == 32 && chan->type == UTIL_FORMAT_TYPE_FLOAT) {
+                return vc4_nir_get_swizzled_channel(b, vpm_reads, swiz);
+        } else if (chan->size == 32 && chan->type == UTIL_FORMAT_TYPE_SIGNED) {
+                if (chan->normalized) {
+                        return nir_fmul(b,
+                                        nir_i2f(b, vpm_reads[swiz]),
+                                        nir_imm_float(b,
+                                                      1.0 / 0x7fffffff));
+                } else {
+                        return nir_i2f(b, vpm_reads[swiz]);
+                }
+        } else if (chan->size == 8 &&
+                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
+                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
+                nir_ssa_def *vpm = vpm_reads[0];
+                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
+                        temp = nir_ixor(b, vpm, nir_imm_int(b, 0x80808080));
+                        if (chan->normalized) {
+                                return nir_fsub(b, nir_fmul(b,
+                                                            vc4_nir_unpack_8f(b, temp, swiz),
+                                                            nir_imm_float(b, 2.0)),
+                                                nir_imm_float(b, 1.0));
+                        } else {
+                                return nir_fadd(b,
+                                                nir_i2f(b,
+                                                        vc4_nir_unpack_8i(b, temp,
+                                                                          swiz)),
+                                                nir_imm_float(b, -128.0));
+                        }
+                } else {
+                        if (chan->normalized) {
+                                return vc4_nir_unpack_8f(b, vpm, swiz);
+                        } else {
+                                return nir_i2f(b, vc4_nir_unpack_8i(b, vpm, swiz));
+                        }
+                }
+        } else if (chan->size == 16 &&
+                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
+                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
+                nir_ssa_def *vpm = vpm_reads[swiz / 2];
+
+                /* Note that UNPACK_16F eats a half float, not ints, so the
+                 * integer unpack helpers are used for all of these.
+                 */
+                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
+                        temp = nir_i2f(b, vc4_nir_unpack_16i(b, vpm, swiz & 1));
+                        if (chan->normalized) {
+                                return nir_fmul(b, temp,
+                                                nir_imm_float(b, 1/32768.0f));
+                        } else {
+                                return temp;
+                        }
+                } else {
+                        temp = nir_i2f(b, vc4_nir_unpack_16u(b, vpm, swiz & 1));
+                        if (chan->normalized) {
+                                return nir_fmul(b, temp,
+                                                nir_imm_float(b, 1 / 65535.0));
+                        } else {
+                                return temp;
+                        }
+                }
+        } else {
+                return NULL;
+        }
+}
+
 static void
-vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
-                    nir_intrinsic_instr *intr)
+vc4_nir_lower_vertex_attr(struct vc4_compile *c, nir_builder *b,
+                          nir_intrinsic_instr *intr)
 {
         b->cursor = nir_before_instr(&intr->instr);
 
-        if (c->stage == QSTAGE_FRAG && intr->const_index[0] ==
-            VC4_NIR_TLB_COLOR_READ_INPUT) {
+        int attr = intr->const_index[0];
+        enum pipe_format format = c->vs_key->attr_formats[attr];
+        uint32_t attr_size = util_format_get_blocksize(format);
+
+        /* All TGSI-to-NIR inputs are vec4. */
+        assert(intr->num_components == 4);
+
+        /* Generate dword loads for the VPM values.  (Since these intrinsics
+         * may be reordered, the actual reads will be generated at the top of
+         * the shader by ntq_setup_inputs().)
+         */
+        nir_ssa_def *vpm_reads[4];
+        for (int i = 0; i < align(attr_size, 4) / 4; i++) {
+                nir_intrinsic_instr *intr_comp =
+                        nir_intrinsic_instr_create(c->s,
+                                                   nir_intrinsic_load_input);
+                intr_comp->num_components = 1;
+                intr_comp->const_index[0] = intr->const_index[0] * 4 + i;
+                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL);
+                nir_builder_instr_insert(b, &intr_comp->instr);
+
+                vpm_reads[i] = &intr_comp->dest.ssa;
+        }
+
+        bool format_warned = false;
+        const struct util_format_description *desc =
+                util_format_description(format);
+
+        nir_ssa_def *dests[4];
+        for (int i = 0; i < 4; i++) {
+                uint8_t swiz = desc->swizzle[i];
+                dests[i] = vc4_nir_get_vattr_channel_vpm(c, b, vpm_reads, swiz,
+                                                         desc);
+
+                if (!dests[i]) {
+                        if (!format_warned) {
+                                fprintf(stderr,
+                                        "vtx element %d unsupported type: %s\n",
+                                        attr, util_format_name(format));
+                                format_warned = true;
+                        }
+                        dests[i] = nir_imm_float(b, 0.0);
+                }
+        }
+
+        replace_intrinsic_with_vec4(b, intr, dests);
+}
+
+static void
+vc4_nir_lower_fs_input(struct vc4_compile *c, nir_builder *b,
+                       nir_intrinsic_instr *intr)
+{
+        b->cursor = nir_before_instr(&intr->instr);
+
+        if (intr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) {
                 /* This doesn't need any lowering. */
                 return;
         }
 
         nir_variable *input_var = NULL;
-        foreach_list_typed(nir_variable, var, node, &c->s->inputs) {
+        nir_foreach_variable(var, &c->s->inputs) {
                 if (var->data.driver_location == intr->const_index[0]) {
                         input_var = var;
                         break;
@@ -87,38 +256,31 @@ vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
                 dests[i] = &intr_comp->dest.ssa;
         }
 
-        switch (c->stage) {
-        case QSTAGE_FRAG:
-                if (input_var->data.location == VARYING_SLOT_FACE) {
-                        dests[0] = nir_fsub(b,
-                                            nir_imm_float(b, 1.0),
-                                            nir_fmul(b,
-                                                     nir_i2f(b, dests[0]),
-                                                     nir_imm_float(b, 2.0)));
-                        dests[1] = nir_imm_float(b, 0.0);
+        if (input_var->data.location == VARYING_SLOT_FACE) {
+                dests[0] = nir_fsub(b,
+                                    nir_imm_float(b, 1.0),
+                                    nir_fmul(b,
+                                             nir_i2f(b, dests[0]),
+                                             nir_imm_float(b, 2.0)));
+                dests[1] = nir_imm_float(b, 0.0);
+                dests[2] = nir_imm_float(b, 0.0);
+                dests[3] = nir_imm_float(b, 1.0);
+        } else if (input_var->data.location >= VARYING_SLOT_VAR0) {
+                if (c->fs_key->point_sprite_mask &
+                    (1 << (input_var->data.location -
+                           VARYING_SLOT_VAR0))) {
+                        if (!c->fs_key->is_points) {
+                                dests[0] = nir_imm_float(b, 0.0);
+                                dests[1] = nir_imm_float(b, 0.0);
+                        }
+                        if (c->fs_key->point_coord_upper_left) {
+                                dests[1] = nir_fsub(b,
+                                                    nir_imm_float(b, 1.0),
+                                                    dests[1]);
+                        }
                         dests[2] = nir_imm_float(b, 0.0);
                         dests[3] = nir_imm_float(b, 1.0);
-                } else if (input_var->data.location >= VARYING_SLOT_VAR0) {
-                        if (c->fs_key->point_sprite_mask &
-                            (1 << (input_var->data.location -
-                                   VARYING_SLOT_VAR0))) {
-                                if (!c->fs_key->is_points) {
-                                        dests[0] = nir_imm_float(b, 0.0);
-                                        dests[1] = nir_imm_float(b, 0.0);
-                                }
-                                if (c->fs_key->point_coord_upper_left) {
-                                        dests[1] = nir_fsub(b,
-                                                            nir_imm_float(b, 1.0),
-                                                            dests[1]);
-                                }
-                                dests[2] = nir_imm_float(b, 0.0);
-                                dests[3] = nir_imm_float(b, 1.0);
-                        }
                 }
-                break;
-        case QSTAGE_COORD:
-        case QSTAGE_VERT:
-                break;
         }
 
         replace_intrinsic_with_vec4(b, intr, dests);
@@ -129,7 +291,7 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
                      nir_intrinsic_instr *intr)
 {
         nir_variable *output_var = NULL;
-        foreach_list_typed(nir_variable, var, node, &c->s->outputs) {
+        nir_foreach_variable(var, &c->s->outputs) {
                 if (var->data.driver_location == intr->const_index[0]) {
                         output_var = var;
                         break;
@@ -232,7 +394,10 @@ vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b,
 
         switch (intr->intrinsic) {
         case nir_intrinsic_load_input:
-                vc4_nir_lower_input(c, b, intr);
+                if (c->stage == QSTAGE_FRAG)
+                        vc4_nir_lower_fs_input(c, b, intr);
+                else
+                        vc4_nir_lower_vertex_attr(c, b, intr);
                 break;
 
         case nir_intrinsic_store_output:
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 31c7e28ff57..6e9ec6530c6 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -602,126 +602,18 @@ ntq_fsign(struct vc4_compile *c, struct qreg src)
                               qir_uniform_f(c, -1.0));
 }
 
-static struct qreg
-get_channel_from_vpm(struct vc4_compile *c,
-                     struct qreg *vpm_reads,
-                     uint8_t swiz,
-                     const struct util_format_description *desc)
-{
-        const struct util_format_channel_description *chan =
-                &desc->channel[swiz];
-        struct qreg temp;
-
-        if (swiz > UTIL_FORMAT_SWIZZLE_W)
-                return get_swizzled_channel(c, vpm_reads, swiz);
-        else if (chan->size == 32 &&
-                 chan->type == UTIL_FORMAT_TYPE_FLOAT) {
-                return get_swizzled_channel(c, vpm_reads, swiz);
-        } else if (chan->size == 32 &&
-                   chan->type == UTIL_FORMAT_TYPE_SIGNED) {
-                if (chan->normalized) {
-                        return qir_FMUL(c,
-                                        qir_ITOF(c, vpm_reads[swiz]),
-                                        qir_uniform_f(c,
-                                                      1.0 / 0x7fffffff));
-                } else {
-                        return qir_ITOF(c, vpm_reads[swiz]);
-                }
-        } else if (chan->size == 8 &&
-                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
-                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
-                struct qreg vpm = vpm_reads[0];
-                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
-                        temp = qir_XOR(c, vpm, qir_uniform_ui(c, 0x80808080));
-                        if (chan->normalized) {
-                                return qir_FSUB(c, qir_FMUL(c,
-                                                            qir_UNPACK_8_F(c, temp, swiz),
-                                                            qir_uniform_f(c, 2.0)),
-                                                qir_uniform_f(c, 1.0));
-                        } else {
-                                return qir_FADD(c,
-                                                qir_ITOF(c,
-                                                         qir_UNPACK_8_I(c, temp,
-                                                                        swiz)),
-                                                qir_uniform_f(c, -128.0));
-                        }
-                } else {
-                        if (chan->normalized) {
-                                return qir_UNPACK_8_F(c, vpm, swiz);
-                        } else {
-                                return qir_ITOF(c, qir_UNPACK_8_I(c, vpm, swiz));
-                        }
-                }
-        } else if (chan->size == 16 &&
-                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
-                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
-                struct qreg vpm = vpm_reads[swiz / 2];
-
-                /* Note that UNPACK_16F eats a half float, not ints, so we use
-                 * UNPACK_16_I for all of these.
-                 */
-                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
-                        temp = qir_ITOF(c, qir_UNPACK_16_I(c, vpm, swiz % 2));
-                        if (chan->normalized) {
-                                return qir_FMUL(c, temp,
-                                                qir_uniform_f(c, 1/32768.0f));
-                        } else {
-                                return temp;
-                        }
-                } else {
-                        /* UNPACK_16I sign-extends, so we have to emit ANDs. */
-                        temp = vpm;
-                        if (swiz == 1 || swiz == 3)
-                                temp = qir_UNPACK_16_I(c, temp, 1);
-                        temp = qir_AND(c, temp, qir_uniform_ui(c, 0xffff));
-                        temp = qir_ITOF(c, temp);
-
-                        if (chan->normalized) {
-                                return qir_FMUL(c, temp,
-                                                qir_uniform_f(c, 1 / 65535.0));
-                        } else {
-                                return temp;
-                        }
-                }
-        } else {
-                return c->undef;
-        }
-}
-
 static void
 emit_vertex_input(struct vc4_compile *c, int attr)
 {
         enum pipe_format format = c->vs_key->attr_formats[attr];
         uint32_t attr_size = util_format_get_blocksize(format);
-        struct qreg vpm_reads[4];
 
         c->vattr_sizes[attr] = align(attr_size, 4);
         for (int i = 0; i < align(attr_size, 4) / 4; i++) {
                 struct qreg vpm = { QFILE_VPM, attr * 4 + i };
-                vpm_reads[i] = qir_MOV(c, vpm);
+                c->inputs[attr * 4 + i] = qir_MOV(c, vpm);
                 c->num_inputs++;
         }
-
-        bool format_warned = false;
-        const struct util_format_description *desc =
-                util_format_description(format);
-
-        for (int i = 0; i < 4; i++) {
-                uint8_t swiz = desc->swizzle[i];
-                struct qreg result = get_channel_from_vpm(c, vpm_reads,
-                                                          swiz, desc);
-
-                if (result.file == QFILE_NULL) {
-                        if (!format_warned) {
-                                fprintf(stderr,
-                                        "vtx element %d unsupported type: %s\n",
-                                        attr, util_format_name(format));
-                                format_warned = true;
-                        }
-                        result = qir_uniform_f(c, 0.0);
-                }
-                c->inputs[attr * 4 + i] = result;
-        }
 }
 
 static void
@@ -876,6 +768,40 @@ ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
         *dest = result;
 }
 
+/** Sign-extending bitfield extract; only a constant 16-bit width at a
+ *  constant, 16-bit-aligned offset is handled (asserted below). */
+static struct qreg
+ntq_emit_ibfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
+              struct qreg bits)
+{
+        assert(bits.file == QFILE_UNIF);
+        assert(c->uniform_contents[bits.index] == QUNIFORM_CONSTANT);
+        assert(c->uniform_data[bits.index] == 16);
+        assert(offset.file == QFILE_UNIF);
+        assert(c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
+
+        int first_bit = c->uniform_data[offset.index];
+        assert(first_bit % 16 == 0);
+        return qir_UNPACK_16_I(c, base, first_bit / 16);
+}
+
+/** Unsigned bitfield extract; only a constant 8-bit width at a constant,
+ *  byte-aligned offset is handled (asserted below). */
+static struct qreg
+ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
+              struct qreg bits)
+{
+        assert(bits.file == QFILE_UNIF);
+        assert(c->uniform_contents[bits.index] == QUNIFORM_CONSTANT);
+        assert(c->uniform_data[bits.index] == 8);
+        assert(offset.file == QFILE_UNIF);
+        assert(c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
+
+        int first_bit = c->uniform_data[offset.index];
+        assert(first_bit % 8 == 0);
+        return qir_UNPACK_8_I(c, base, first_bit / 8);
+}
+
 static void
 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
 {
@@ -1106,6 +1032,14 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                                 qir_SUB(c, qir_uniform_ui(c, 0), src[0]));
                 break;
 
+        case nir_op_ibitfield_extract:
+                *dest = ntq_emit_ibfe(c, src[0], src[1], src[2]);
+                break;
+
+        case nir_op_ubitfield_extract:
+                *dest = ntq_emit_ubfe(c, src[0], src[1], src[2]);
+                break;
+
         default:
                 fprintf(stderr, "unknown NIR ALU inst: ");
                 nir_print_instr(&instr->instr, stderr);
@@ -1383,13 +1317,13 @@ static void
 ntq_setup_inputs(struct vc4_compile *c)
 {
         unsigned num_entries = 0;
-        foreach_list_typed(nir_variable, var, node, &c->s->inputs)
+        nir_foreach_variable(var, &c->s->inputs)
                 num_entries++;
 
         nir_variable *vars[num_entries];
 
         unsigned i = 0;
-        foreach_list_typed(nir_variable, var, node, &c->s->inputs)
+        nir_foreach_variable(var, &c->s->inputs)
                 vars[i++] = var;
 
         /* Sort the variables so that we emit the input setup in
@@ -1432,7 +1366,7 @@ ntq_setup_inputs(struct vc4_compile *c)
 static void
 ntq_setup_outputs(struct vc4_compile *c)
 {
-        foreach_list_typed(nir_variable, var, node, &c->s->outputs) {
+        nir_foreach_variable(var, &c->s->outputs) {
                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
                 unsigned loc = var->data.driver_location * 4;
 
@@ -1471,7 +1405,7 @@ ntq_setup_outputs(struct vc4_compile *c)
 static void
 ntq_setup_uniforms(struct vc4_compile *c)
 {
-        foreach_list_typed(nir_variable, var, node, &c->s->uniforms) {
+        nir_foreach_variable(var, &c->s->uniforms) {
                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
                 unsigned array_elem_size = 4 * sizeof(float);
 
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 739ac86193a..774ec095652 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -182,6 +182,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 	case PIPE_CAP_TGSI_TXQS:
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+	case PIPE_CAP_SHAREABLE_SHADERS:
                 return 0;
 
                 /* Stream output. */
@@ -336,6 +337,8 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
                 return VC4_MAX_TEXTURE_SAMPLERS;
         case PIPE_SHADER_CAP_PREFERRED_IR:
                 return PIPE_SHADER_IR_TGSI;
+	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+		return 32;
         default:
                 fprintf(stderr, "unknown shader param %d\n", param);
                 return 0;
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index a4947154f17..1ad545aae09 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -632,6 +632,7 @@ enum pipe_cap
    PIPE_CAP_DEPTH_BOUNDS_TEST,
    PIPE_CAP_TGSI_TXQS,
    PIPE_CAP_FORCE_PERSAMPLE_INTERP,
+   PIPE_CAP_SHAREABLE_SHADERS,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)
@@ -696,7 +697,8 @@ enum pipe_shader_cap
    PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED, /* all rounding modes */
    PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED,
    PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED,
-   PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE
+   PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE,
+   PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT,
 };
 
 /**
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index b36e0a35b8d..e0ab9013dd5 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -267,7 +267,9 @@ union tgsi_immediate_data
 #define TGSI_PROPERTY_TES_SPACING            12
 #define TGSI_PROPERTY_TES_VERTEX_ORDER_CW    13
 #define TGSI_PROPERTY_TES_POINT_MODE         14
-#define TGSI_PROPERTY_COUNT                  15
+#define TGSI_PROPERTY_NUM_CLIPDIST_ENABLED   15
+#define TGSI_PROPERTY_NUM_CULLDIST_ENABLED   16
+#define TGSI_PROPERTY_COUNT                  17
 
 struct tgsi_property {
    unsigned Type         : 4;  /**< TGSI_TOKEN_TYPE_PROPERTY */
diff --git a/src/gallium/state_trackers/omx/vid_dec_h264.c b/src/gallium/state_trackers/omx/vid_dec_h264.c
index 18d88039579..f66ed896e62 100644
--- a/src/gallium/state_trackers/omx/vid_dec_h264.c
+++ b/src/gallium/state_trackers/omx/vid_dec_h264.c
@@ -753,10 +753,14 @@ static void slice_header(vid_dec_PrivateType *priv, struct vl_rbsp *rbsp,
          priv->codec_data.h264.delta_pic_order_cnt_bottom = delta_pic_order_cnt_bottom;
       }
 
-      priv->picture.h264.field_order_cnt[0] = pic_order_cnt_msb + pic_order_cnt_lsb;
-      priv->picture.h264.field_order_cnt[1] = pic_order_cnt_msb + pic_order_cnt_lsb;
-      if (!priv->picture.h264.field_pic_flag)
-         priv->picture.h264.field_order_cnt[1] += priv->codec_data.h264.delta_pic_order_cnt_bottom;
+      if (!priv->picture.h264.field_pic_flag) {
+         priv->picture.h264.field_order_cnt[0] = pic_order_cnt_msb + pic_order_cnt_lsb;
+         priv->picture.h264.field_order_cnt[1] = priv->picture.h264.field_order_cnt [0] +
+                                          priv->codec_data.h264.delta_pic_order_cnt_bottom;
+      } else if (!priv->picture.h264.bottom_field_flag)
+         priv->picture.h264.field_order_cnt[0] = pic_order_cnt_msb + pic_order_cnt_lsb;
+      else
+         priv->picture.h264.field_order_cnt[1] = pic_order_cnt_msb + pic_order_cnt_lsb;
 
    } else if (sps->pic_order_cnt_type == 1) {
       unsigned MaxFrameNum = 1 << (sps->log2_max_frame_num_minus4 + 4);
diff --git a/src/gallium/targets/osmesa/osmesa.def b/src/gallium/targets/osmesa/osmesa.def
index e2a31ab5457..e347463de9f 100644
--- a/src/gallium/targets/osmesa/osmesa.def
+++ b/src/gallium/targets/osmesa/osmesa.def
@@ -14,3 +14,340 @@ EXPORTS
 	OSMesaGetProcAddress
 	OSMesaColorClamp
 	OSMesaPostprocess
+	glAccum
+	glAlphaFunc
+	glAreTexturesResident
+	glArrayElement
+	glBegin
+	glBindTexture
+	glBitmap
+	glBlendFunc
+	glCallList
+	glCallLists
+	glClear
+	glClearAccum
+	glClearColor
+	glClearDepth
+	glClearIndex
+	glClearStencil
+	glClipPlane
+	glColor3b
+	glColor3bv
+	glColor3d
+	glColor3dv
+	glColor3f
+	glColor3fv
+	glColor3i
+	glColor3iv
+	glColor3s
+	glColor3sv
+	glColor3ub
+	glColor3ubv
+	glColor3ui
+	glColor3uiv
+	glColor3us
+	glColor3usv
+	glColor4b
+	glColor4bv
+	glColor4d
+	glColor4dv
+	glColor4f
+	glColor4fv
+	glColor4i
+	glColor4iv
+	glColor4s
+	glColor4sv
+	glColor4ub
+	glColor4ubv
+	glColor4ui
+	glColor4uiv
+	glColor4us
+	glColor4usv
+	glColorMask
+	glColorMaterial
+	glColorPointer
+	glCopyPixels
+	glCopyTexImage1D
+	glCopyTexImage2D
+	glCopyTexSubImage1D
+	glCopyTexSubImage2D
+	glCullFace
+;	glDebugEntry
+	glDeleteLists
+	glDeleteTextures
+	glDepthFunc
+	glDepthMask
+	glDepthRange
+	glDisable
+	glDisableClientState
+	glDrawArrays
+	glDrawBuffer
+	glDrawElements
+	glDrawPixels
+	glEdgeFlag
+	glEdgeFlagPointer
+	glEdgeFlagv
+	glEnable
+	glEnableClientState
+	glEnd
+	glEndList
+	glEvalCoord1d
+	glEvalCoord1dv
+	glEvalCoord1f
+	glEvalCoord1fv
+	glEvalCoord2d
+	glEvalCoord2dv
+	glEvalCoord2f
+	glEvalCoord2fv
+	glEvalMesh1
+	glEvalMesh2
+	glEvalPoint1
+	glEvalPoint2
+	glFeedbackBuffer
+	glFinish
+	glFlush
+	glFogf
+	glFogfv
+	glFogi
+	glFogiv
+	glFrontFace
+	glFrustum
+	glGenLists
+	glGenTextures
+	glGetBooleanv
+	glGetClipPlane
+	glGetDoublev
+	glGetError
+	glGetFloatv
+	glGetIntegerv
+	glGetLightfv
+	glGetLightiv
+	glGetMapdv
+	glGetMapfv
+	glGetMapiv
+	glGetMaterialfv
+	glGetMaterialiv
+	glGetPixelMapfv
+	glGetPixelMapuiv
+	glGetPixelMapusv
+	glGetPointerv
+	glGetPolygonStipple
+	glGetString
+	glGetTexEnvfv
+	glGetTexEnviv
+	glGetTexGendv
+	glGetTexGenfv
+	glGetTexGeniv
+	glGetTexImage
+	glGetTexLevelParameterfv
+	glGetTexLevelParameteriv
+	glGetTexParameterfv
+	glGetTexParameteriv
+	glHint
+	glIndexMask
+	glIndexPointer
+	glIndexd
+	glIndexdv
+	glIndexf
+	glIndexfv
+	glIndexi
+	glIndexiv
+	glIndexs
+	glIndexsv
+	glIndexub
+	glIndexubv
+	glInitNames
+	glInterleavedArrays
+	glIsEnabled
+	glIsList
+	glIsTexture
+	glLightModelf
+	glLightModelfv
+	glLightModeli
+	glLightModeliv
+	glLightf
+	glLightfv
+	glLighti
+	glLightiv
+	glLineStipple
+	glLineWidth
+	glListBase
+	glLoadIdentity
+	glLoadMatrixd
+	glLoadMatrixf
+	glLoadName
+	glLogicOp
+	glMap1d
+	glMap1f
+	glMap2d
+	glMap2f
+	glMapGrid1d
+	glMapGrid1f
+	glMapGrid2d
+	glMapGrid2f
+	glMaterialf
+	glMaterialfv
+	glMateriali
+	glMaterialiv
+	glMatrixMode
+	glMultMatrixd
+	glMultMatrixf
+	glNewList
+	glNormal3b
+	glNormal3bv
+	glNormal3d
+	glNormal3dv
+	glNormal3f
+	glNormal3fv
+	glNormal3i
+	glNormal3iv
+	glNormal3s
+	glNormal3sv
+	glNormalPointer
+	glOrtho
+	glPassThrough
+	glPixelMapfv
+	glPixelMapuiv
+	glPixelMapusv
+	glPixelStoref
+	glPixelStorei
+	glPixelTransferf
+	glPixelTransferi
+	glPixelZoom
+	glPointSize
+	glPolygonMode
+	glPolygonOffset
+	glPolygonStipple
+	glPopAttrib
+	glPopClientAttrib
+	glPopMatrix
+	glPopName
+	glPrioritizeTextures
+	glPushAttrib
+	glPushClientAttrib
+	glPushMatrix
+	glPushName
+	glRasterPos2d
+	glRasterPos2dv
+	glRasterPos2f
+	glRasterPos2fv
+	glRasterPos2i
+	glRasterPos2iv
+	glRasterPos2s
+	glRasterPos2sv
+	glRasterPos3d
+	glRasterPos3dv
+	glRasterPos3f
+	glRasterPos3fv
+	glRasterPos3i
+	glRasterPos3iv
+	glRasterPos3s
+	glRasterPos3sv
+	glRasterPos4d
+	glRasterPos4dv
+	glRasterPos4f
+	glRasterPos4fv
+	glRasterPos4i
+	glRasterPos4iv
+	glRasterPos4s
+	glRasterPos4sv
+	glReadBuffer
+	glReadPixels
+	glRectd
+	glRectdv
+	glRectf
+	glRectfv
+	glRecti
+	glRectiv
+	glRects
+	glRectsv
+	glRenderMode
+	glRotated
+	glRotatef
+	glScaled
+	glScalef
+	glScissor
+	glSelectBuffer
+	glShadeModel
+	glStencilFunc
+	glStencilMask
+	glStencilOp
+	glTexCoord1d
+	glTexCoord1dv
+	glTexCoord1f
+	glTexCoord1fv
+	glTexCoord1i
+	glTexCoord1iv
+	glTexCoord1s
+	glTexCoord1sv
+	glTexCoord2d
+	glTexCoord2dv
+	glTexCoord2f
+	glTexCoord2fv
+	glTexCoord2i
+	glTexCoord2iv
+	glTexCoord2s
+	glTexCoord2sv
+	glTexCoord3d
+	glTexCoord3dv
+	glTexCoord3f
+	glTexCoord3fv
+	glTexCoord3i
+	glTexCoord3iv
+	glTexCoord3s
+	glTexCoord3sv
+	glTexCoord4d
+	glTexCoord4dv
+	glTexCoord4f
+	glTexCoord4fv
+	glTexCoord4i
+	glTexCoord4iv
+	glTexCoord4s
+	glTexCoord4sv
+	glTexCoordPointer
+	glTexEnvf
+	glTexEnvfv
+	glTexEnvi
+	glTexEnviv
+	glTexGend
+	glTexGendv
+	glTexGenf
+	glTexGenfv
+	glTexGeni
+	glTexGeniv
+	glTexImage1D
+	glTexImage2D
+	glTexParameterf
+	glTexParameterfv
+	glTexParameteri
+	glTexParameteriv
+	glTexSubImage1D
+	glTexSubImage2D
+	glTranslated
+	glTranslatef
+	glVertex2d
+	glVertex2dv
+	glVertex2f
+	glVertex2fv
+	glVertex2i
+	glVertex2iv
+	glVertex2s
+	glVertex2sv
+	glVertex3d
+	glVertex3dv
+	glVertex3f
+	glVertex3fv
+	glVertex3i
+	glVertex3iv
+	glVertex3s
+	glVertex3sv
+	glVertex4d
+	glVertex4dv
+	glVertex4f
+	glVertex4fv
+	glVertex4i
+	glVertex4iv
+	glVertex4s
+	glVertex4sv
+	glVertexPointer
+	glViewport
diff --git a/src/gallium/targets/osmesa/osmesa.mingw.def b/src/gallium/targets/osmesa/osmesa.mingw.def
index 874ac544084..945201c9d83 100644
--- a/src/gallium/targets/osmesa/osmesa.mingw.def
+++ b/src/gallium/targets/osmesa/osmesa.mingw.def
@@ -11,3 +11,340 @@ EXPORTS
 	OSMesaGetProcAddress = OSMesaGetProcAddress@4
 	OSMesaColorClamp = OSMesaColorClamp@4
 	OSMesaPostprocess = OSMesaPostprocess@12
+	glAccum = glAccum@8
+	glAlphaFunc = glAlphaFunc@8
+	glAreTexturesResident = glAreTexturesResident@12
+	glArrayElement = glArrayElement@4
+	glBegin = glBegin@4
+	glBindTexture = glBindTexture@8
+	glBitmap = glBitmap@28
+	glBlendFunc = glBlendFunc@8
+	glCallList = glCallList@4
+	glCallLists = glCallLists@12
+	glClear = glClear@4
+	glClearAccum = glClearAccum@16
+	glClearColor = glClearColor@16
+	glClearDepth = glClearDepth@8
+	glClearIndex = glClearIndex@4
+	glClearStencil = glClearStencil@4
+	glClipPlane = glClipPlane@8
+	glColor3b = glColor3b@12
+	glColor3bv = glColor3bv@4
+	glColor3d = glColor3d@24
+	glColor3dv = glColor3dv@4
+	glColor3f = glColor3f@12
+	glColor3fv = glColor3fv@4
+	glColor3i = glColor3i@12
+	glColor3iv = glColor3iv@4
+	glColor3s = glColor3s@12
+	glColor3sv = glColor3sv@4
+	glColor3ub = glColor3ub@12
+	glColor3ubv = glColor3ubv@4
+	glColor3ui = glColor3ui@12
+	glColor3uiv = glColor3uiv@4
+	glColor3us = glColor3us@12
+	glColor3usv = glColor3usv@4
+	glColor4b = glColor4b@16
+	glColor4bv = glColor4bv@4
+	glColor4d = glColor4d@32
+	glColor4dv = glColor4dv@4
+	glColor4f = glColor4f@16
+	glColor4fv = glColor4fv@4
+	glColor4i = glColor4i@16
+	glColor4iv = glColor4iv@4
+	glColor4s = glColor4s@16
+	glColor4sv = glColor4sv@4
+	glColor4ub = glColor4ub@16
+	glColor4ubv = glColor4ubv@4
+	glColor4ui = glColor4ui@16
+	glColor4uiv = glColor4uiv@4
+	glColor4us = glColor4us@16
+	glColor4usv = glColor4usv@4
+	glColorMask = glColorMask@16
+	glColorMaterial = glColorMaterial@8
+	glColorPointer = glColorPointer@16
+	glCopyPixels = glCopyPixels@20
+	glCopyTexImage1D = glCopyTexImage1D@28
+	glCopyTexImage2D = glCopyTexImage2D@32
+	glCopyTexSubImage1D = glCopyTexSubImage1D@24
+	glCopyTexSubImage2D = glCopyTexSubImage2D@32
+	glCullFace = glCullFace@4
+;	glDebugEntry = glDebugEntry@8
+	glDeleteLists = glDeleteLists@8
+	glDeleteTextures = glDeleteTextures@8
+	glDepthFunc = glDepthFunc@4
+	glDepthMask = glDepthMask@4
+	glDepthRange = glDepthRange@16
+	glDisable = glDisable@4
+	glDisableClientState = glDisableClientState@4
+	glDrawArrays = glDrawArrays@12
+	glDrawBuffer = glDrawBuffer@4
+	glDrawElements = glDrawElements@16
+	glDrawPixels = glDrawPixels@20
+	glEdgeFlag = glEdgeFlag@4
+	glEdgeFlagPointer = glEdgeFlagPointer@8
+	glEdgeFlagv = glEdgeFlagv@4
+	glEnable = glEnable@4
+	glEnableClientState = glEnableClientState@4
+	glEnd = glEnd@0
+	glEndList = glEndList@0
+	glEvalCoord1d = glEvalCoord1d@8
+	glEvalCoord1dv = glEvalCoord1dv@4
+	glEvalCoord1f = glEvalCoord1f@4
+	glEvalCoord1fv = glEvalCoord1fv@4
+	glEvalCoord2d = glEvalCoord2d@16
+	glEvalCoord2dv = glEvalCoord2dv@4
+	glEvalCoord2f = glEvalCoord2f@8
+	glEvalCoord2fv = glEvalCoord2fv@4
+	glEvalMesh1 = glEvalMesh1@12
+	glEvalMesh2 = glEvalMesh2@20
+	glEvalPoint1 = glEvalPoint1@4
+	glEvalPoint2 = glEvalPoint2@8
+	glFeedbackBuffer = glFeedbackBuffer@12
+	glFinish = glFinish@0
+	glFlush = glFlush@0
+	glFogf = glFogf@8
+	glFogfv = glFogfv@8
+	glFogi = glFogi@8
+	glFogiv = glFogiv@8
+	glFrontFace = glFrontFace@4
+	glFrustum = glFrustum@48
+	glGenLists = glGenLists@4
+	glGenTextures = glGenTextures@8
+	glGetBooleanv = glGetBooleanv@8
+	glGetClipPlane = glGetClipPlane@8
+	glGetDoublev = glGetDoublev@8
+	glGetError = glGetError@0
+	glGetFloatv = glGetFloatv@8
+	glGetIntegerv = glGetIntegerv@8
+	glGetLightfv = glGetLightfv@12
+	glGetLightiv = glGetLightiv@12
+	glGetMapdv = glGetMapdv@12
+	glGetMapfv = glGetMapfv@12
+	glGetMapiv = glGetMapiv@12
+	glGetMaterialfv = glGetMaterialfv@12
+	glGetMaterialiv = glGetMaterialiv@12
+	glGetPixelMapfv = glGetPixelMapfv@8
+	glGetPixelMapuiv = glGetPixelMapuiv@8
+	glGetPixelMapusv = glGetPixelMapusv@8
+	glGetPointerv = glGetPointerv@8
+	glGetPolygonStipple = glGetPolygonStipple@4
+	glGetString = glGetString@4
+	glGetTexEnvfv = glGetTexEnvfv@12
+	glGetTexEnviv = glGetTexEnviv@12
+	glGetTexGendv = glGetTexGendv@12
+	glGetTexGenfv = glGetTexGenfv@12
+	glGetTexGeniv = glGetTexGeniv@12
+	glGetTexImage = glGetTexImage@20
+	glGetTexLevelParameterfv = glGetTexLevelParameterfv@16
+	glGetTexLevelParameteriv = glGetTexLevelParameteriv@16
+	glGetTexParameterfv = glGetTexParameterfv@12
+	glGetTexParameteriv = glGetTexParameteriv@12
+	glHint = glHint@8
+	glIndexMask = glIndexMask@4
+	glIndexPointer = glIndexPointer@12
+	glIndexd = glIndexd@8
+	glIndexdv = glIndexdv@4
+	glIndexf = glIndexf@4
+	glIndexfv = glIndexfv@4
+	glIndexi = glIndexi@4
+	glIndexiv = glIndexiv@4
+	glIndexs = glIndexs@4
+	glIndexsv = glIndexsv@4
+	glIndexub = glIndexub@4
+	glIndexubv = glIndexubv@4
+	glInitNames = glInitNames@0
+	glInterleavedArrays = glInterleavedArrays@12
+	glIsEnabled = glIsEnabled@4
+	glIsList = glIsList@4
+	glIsTexture = glIsTexture@4
+	glLightModelf = glLightModelf@8
+	glLightModelfv = glLightModelfv@8
+	glLightModeli = glLightModeli@8
+	glLightModeliv = glLightModeliv@8
+	glLightf = glLightf@12
+	glLightfv = glLightfv@12
+	glLighti = glLighti@12
+	glLightiv = glLightiv@12
+	glLineStipple = glLineStipple@8
+	glLineWidth = glLineWidth@4
+	glListBase = glListBase@4
+	glLoadIdentity = glLoadIdentity@0
+	glLoadMatrixd = glLoadMatrixd@4
+	glLoadMatrixf = glLoadMatrixf@4
+	glLoadName = glLoadName@4
+	glLogicOp = glLogicOp@4
+	glMap1d = glMap1d@32
+	glMap1f = glMap1f@24
+	glMap2d = glMap2d@56
+	glMap2f = glMap2f@40
+	glMapGrid1d = glMapGrid1d@20
+	glMapGrid1f = glMapGrid1f@12
+	glMapGrid2d = glMapGrid2d@40
+	glMapGrid2f = glMapGrid2f@24
+	glMaterialf = glMaterialf@12
+	glMaterialfv = glMaterialfv@12
+	glMateriali = glMateriali@12
+	glMaterialiv = glMaterialiv@12
+	glMatrixMode = glMatrixMode@4
+	glMultMatrixd = glMultMatrixd@4
+	glMultMatrixf = glMultMatrixf@4
+	glNewList = glNewList@8
+	glNormal3b = glNormal3b@12
+	glNormal3bv = glNormal3bv@4
+	glNormal3d = glNormal3d@24
+	glNormal3dv = glNormal3dv@4
+	glNormal3f = glNormal3f@12
+	glNormal3fv = glNormal3fv@4
+	glNormal3i = glNormal3i@12
+	glNormal3iv = glNormal3iv@4
+	glNormal3s = glNormal3s@12
+	glNormal3sv = glNormal3sv@4
+	glNormalPointer = glNormalPointer@12
+	glOrtho = glOrtho@48
+	glPassThrough = glPassThrough@4
+	glPixelMapfv = glPixelMapfv@12
+	glPixelMapuiv = glPixelMapuiv@12
+	glPixelMapusv = glPixelMapusv@12
+	glPixelStoref = glPixelStoref@8
+	glPixelStorei = glPixelStorei@8
+	glPixelTransferf = glPixelTransferf@8
+	glPixelTransferi = glPixelTransferi@8
+	glPixelZoom = glPixelZoom@8
+	glPointSize = glPointSize@4
+	glPolygonMode = glPolygonMode@8
+	glPolygonOffset = glPolygonOffset@8
+	glPolygonStipple = glPolygonStipple@4
+	glPopAttrib = glPopAttrib@0
+	glPopClientAttrib = glPopClientAttrib@0
+	glPopMatrix = glPopMatrix@0
+	glPopName = glPopName@0
+	glPrioritizeTextures = glPrioritizeTextures@12
+	glPushAttrib = glPushAttrib@4
+	glPushClientAttrib = glPushClientAttrib@4
+	glPushMatrix = glPushMatrix@0
+	glPushName = glPushName@4
+	glRasterPos2d = glRasterPos2d@16
+	glRasterPos2dv = glRasterPos2dv@4
+	glRasterPos2f = glRasterPos2f@8
+	glRasterPos2fv = glRasterPos2fv@4
+	glRasterPos2i = glRasterPos2i@8
+	glRasterPos2iv = glRasterPos2iv@4
+	glRasterPos2s = glRasterPos2s@8
+	glRasterPos2sv = glRasterPos2sv@4
+	glRasterPos3d = glRasterPos3d@24
+	glRasterPos3dv = glRasterPos3dv@4
+	glRasterPos3f = glRasterPos3f@12
+	glRasterPos3fv = glRasterPos3fv@4
+	glRasterPos3i = glRasterPos3i@12
+	glRasterPos3iv = glRasterPos3iv@4
+	glRasterPos3s = glRasterPos3s@12
+	glRasterPos3sv = glRasterPos3sv@4
+	glRasterPos4d = glRasterPos4d@32
+	glRasterPos4dv = glRasterPos4dv@4
+	glRasterPos4f = glRasterPos4f@16
+	glRasterPos4fv = glRasterPos4fv@4
+	glRasterPos4i = glRasterPos4i@16
+	glRasterPos4iv = glRasterPos4iv@4
+	glRasterPos4s = glRasterPos4s@16
+	glRasterPos4sv = glRasterPos4sv@4
+	glReadBuffer = glReadBuffer@4
+	glReadPixels = glReadPixels@28
+	glRectd = glRectd@32
+	glRectdv = glRectdv@8
+	glRectf = glRectf@16
+	glRectfv = glRectfv@8
+	glRecti = glRecti@16
+	glRectiv = glRectiv@8
+	glRects = glRects@16
+	glRectsv = glRectsv@8
+	glRenderMode = glRenderMode@4
+	glRotated = glRotated@32
+	glRotatef = glRotatef@16
+	glScaled = glScaled@24
+	glScalef = glScalef@12
+	glScissor = glScissor@16
+	glSelectBuffer = glSelectBuffer@8
+	glShadeModel = glShadeModel@4
+	glStencilFunc = glStencilFunc@12
+	glStencilMask = glStencilMask@4
+	glStencilOp = glStencilOp@12
+	glTexCoord1d = glTexCoord1d@8
+	glTexCoord1dv = glTexCoord1dv@4
+	glTexCoord1f = glTexCoord1f@4
+	glTexCoord1fv = glTexCoord1fv@4
+	glTexCoord1i = glTexCoord1i@4
+	glTexCoord1iv = glTexCoord1iv@4
+	glTexCoord1s = glTexCoord1s@4
+	glTexCoord1sv = glTexCoord1sv@4
+	glTexCoord2d = glTexCoord2d@16
+	glTexCoord2dv = glTexCoord2dv@4
+	glTexCoord2f = glTexCoord2f@8
+	glTexCoord2fv = glTexCoord2fv@4
+	glTexCoord2i = glTexCoord2i@8
+	glTexCoord2iv = glTexCoord2iv@4
+	glTexCoord2s = glTexCoord2s@8
+	glTexCoord2sv = glTexCoord2sv@4
+	glTexCoord3d = glTexCoord3d@24
+	glTexCoord3dv = glTexCoord3dv@4
+	glTexCoord3f = glTexCoord3f@12
+	glTexCoord3fv = glTexCoord3fv@4
+	glTexCoord3i = glTexCoord3i@12
+	glTexCoord3iv = glTexCoord3iv@4
+	glTexCoord3s = glTexCoord3s@12
+	glTexCoord3sv = glTexCoord3sv@4
+	glTexCoord4d = glTexCoord4d@32
+	glTexCoord4dv = glTexCoord4dv@4
+	glTexCoord4f = glTexCoord4f@16
+	glTexCoord4fv = glTexCoord4fv@4
+	glTexCoord4i = glTexCoord4i@16
+	glTexCoord4iv = glTexCoord4iv@4
+	glTexCoord4s = glTexCoord4s@16
+	glTexCoord4sv = glTexCoord4sv@4
+	glTexCoordPointer = glTexCoordPointer@16
+	glTexEnvf = glTexEnvf@12
+	glTexEnvfv = glTexEnvfv@12
+	glTexEnvi = glTexEnvi@12
+	glTexEnviv = glTexEnviv@12
+	glTexGend = glTexGend@16
+	glTexGendv = glTexGendv@12
+	glTexGenf = glTexGenf@12
+	glTexGenfv = glTexGenfv@12
+	glTexGeni = glTexGeni@12
+	glTexGeniv = glTexGeniv@12
+	glTexImage1D = glTexImage1D@32
+	glTexImage2D = glTexImage2D@36
+	glTexParameterf = glTexParameterf@12
+	glTexParameterfv = glTexParameterfv@12
+	glTexParameteri = glTexParameteri@12
+	glTexParameteriv = glTexParameteriv@12
+	glTexSubImage1D = glTexSubImage1D@28
+	glTexSubImage2D = glTexSubImage2D@36
+	glTranslated = glTranslated@24
+	glTranslatef = glTranslatef@12
+	glVertex2d = glVertex2d@16
+	glVertex2dv = glVertex2dv@4
+	glVertex2f = glVertex2f@8
+	glVertex2fv = glVertex2fv@4
+	glVertex2i = glVertex2i@8
+	glVertex2iv = glVertex2iv@4
+	glVertex2s = glVertex2s@8
+	glVertex2sv = glVertex2sv@4
+	glVertex3d = glVertex3d@24
+	glVertex3dv = glVertex3dv@4
+	glVertex3f = glVertex3f@12
+	glVertex3fv = glVertex3fv@4
+	glVertex3i = glVertex3i@12
+	glVertex3iv = glVertex3iv@4
+	glVertex3s = glVertex3s@12
+	glVertex3sv = glVertex3sv@4
+	glVertex4d = glVertex4d@32
+	glVertex4dv = glVertex4dv@4
+	glVertex4f = glVertex4f@16
+	glVertex4fv = glVertex4fv@4
+	glVertex4i = glVertex4i@16
+	glVertex4iv = glVertex4iv@4
+	glVertex4s = glVertex4s@16
+	glVertex4sv = glVertex4sv@4
+	glVertexPointer = glVertexPointer@16
+	glViewport = glViewport@16
diff --git a/src/glsl/ast_function.cpp b/src/glsl/ast_function.cpp
index c5c5cae333b..e4e4a3fe148 100644
--- a/src/glsl/ast_function.cpp
+++ b/src/glsl/ast_function.cpp
@@ -610,6 +610,37 @@ match_subroutine_by_name(const char *name,
    return sig;
 }
 
+static ir_rvalue *
+generate_array_index(void *mem_ctx, exec_list *instructions,
+                     struct _mesa_glsl_parse_state *state, YYLTYPE loc,
+                     const ast_expression *array, ast_expression *idx,
+                     const char **function_name, exec_list *actual_parameters)
+{
+   if (array->oper == ast_array_index) {
+      /* This handles arrays of arrays */
+      ir_rvalue *outer_array = generate_array_index(mem_ctx, instructions,
+                                                    state, loc,
+                                                    array->subexpressions[0],
+                                                    array->subexpressions[1],
+                                                    function_name, actual_parameters);
+      ir_rvalue *outer_array_idx = idx->hir(instructions, state);
+
+      YYLTYPE index_loc = idx->get_location();
+      return _mesa_ast_array_index_to_hir(mem_ctx, state, outer_array,
+                                          outer_array_idx, loc,
+                                          index_loc);
+   } else {
+      ir_variable *sub_var = NULL;
+      *function_name = array->primary_expression.identifier;
+
+      match_subroutine_by_name(*function_name, actual_parameters,
+                               state, &sub_var);
+
+      ir_rvalue *outer_array_idx = idx->hir(instructions, state);
+      return new(mem_ctx) ir_dereference_array(sub_var, outer_array_idx);
+   }
+}
+
 static void
 print_function_prototypes(_mesa_glsl_parse_state *state, YYLTYPE *loc,
                           ir_function *f)
@@ -1989,16 +2020,18 @@ ast_function_expression::hir(exec_list *instructions,
       ir_variable *sub_var = NULL;
       ir_rvalue *array_idx = NULL;
 
+      process_parameters(instructions, &actual_parameters, &this->expressions,
+			 state);
+
       if (id->oper == ast_array_index) {
-         func_name = id->subexpressions[0]->primary_expression.identifier;
-	 array_idx = id->subexpressions[1]->hir(instructions, state);
+         array_idx = generate_array_index(ctx, instructions, state, loc,
+                                          id->subexpressions[0],
+                                          id->subexpressions[1], &func_name,
+                                          &actual_parameters);
       } else {
          func_name = id->primary_expression.identifier;
       }
 
-      process_parameters(instructions, &actual_parameters, &this->expressions,
-			 state);
-
       ir_function_signature *sig =
 	 match_function_by_name(func_name, &actual_parameters, state);
 
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 0c11ec58d20..961183636a9 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -487,54 +487,54 @@ bit_logic_result_type(const struct glsl_type *type_a,
                       ast_operators op,
                       struct _mesa_glsl_parse_state *state, YYLTYPE *loc)
 {
-    if (!state->check_bitwise_operations_allowed(loc)) {
-       return glsl_type::error_type;
-    }
+   if (!state->check_bitwise_operations_allowed(loc)) {
+      return glsl_type::error_type;
+   }
 
-    /* From page 50 (page 56 of PDF) of GLSL 1.30 spec:
-     *
-     *     "The bitwise operators and (&), exclusive-or (^), and inclusive-or
-     *     (|). The operands must be of type signed or unsigned integers or
-     *     integer vectors."
-     */
-    if (!type_a->is_integer()) {
-       _mesa_glsl_error(loc, state, "LHS of `%s' must be an integer",
-                         ast_expression::operator_string(op));
-       return glsl_type::error_type;
-    }
-    if (!type_b->is_integer()) {
-       _mesa_glsl_error(loc, state, "RHS of `%s' must be an integer",
+   /* From page 50 (page 56 of PDF) of GLSL 1.30 spec:
+    *
+    *     "The bitwise operators and (&), exclusive-or (^), and inclusive-or
+    *     (|). The operands must be of type signed or unsigned integers or
+    *     integer vectors."
+    */
+   if (!type_a->is_integer()) {
+      _mesa_glsl_error(loc, state, "LHS of `%s' must be an integer",
                         ast_expression::operator_string(op));
-       return glsl_type::error_type;
-    }
+      return glsl_type::error_type;
+   }
+   if (!type_b->is_integer()) {
+      _mesa_glsl_error(loc, state, "RHS of `%s' must be an integer",
+                       ast_expression::operator_string(op));
+      return glsl_type::error_type;
+   }
 
-    /*     "The fundamental types of the operands (signed or unsigned) must
-     *     match,"
-     */
-    if (type_a->base_type != type_b->base_type) {
-       _mesa_glsl_error(loc, state, "operands of `%s' must have the same "
-                        "base type", ast_expression::operator_string(op));
-       return glsl_type::error_type;
-    }
+   /*     "The fundamental types of the operands (signed or unsigned) must
+    *     match,"
+    */
+   if (type_a->base_type != type_b->base_type) {
+      _mesa_glsl_error(loc, state, "operands of `%s' must have the same "
+                       "base type", ast_expression::operator_string(op));
+      return glsl_type::error_type;
+   }
 
-    /*     "The operands cannot be vectors of differing size." */
-    if (type_a->is_vector() &&
-        type_b->is_vector() &&
-        type_a->vector_elements != type_b->vector_elements) {
-       _mesa_glsl_error(loc, state, "operands of `%s' cannot be vectors of "
-                        "different sizes", ast_expression::operator_string(op));
-       return glsl_type::error_type;
-    }
+   /*     "The operands cannot be vectors of differing size." */
+   if (type_a->is_vector() &&
+       type_b->is_vector() &&
+       type_a->vector_elements != type_b->vector_elements) {
+      _mesa_glsl_error(loc, state, "operands of `%s' cannot be vectors of "
+                       "different sizes", ast_expression::operator_string(op));
+      return glsl_type::error_type;
+   }
 
-    /*     "If one operand is a scalar and the other a vector, the scalar is
-     *     applied component-wise to the vector, resulting in the same type as
-     *     the vector. The fundamental types of the operands [...] will be the
-     *     resulting fundamental type."
-     */
-    if (type_a->is_scalar())
-        return type_b;
-    else
-        return type_a;
+   /*     "If one operand is a scalar and the other a vector, the scalar is
+    *     applied component-wise to the vector, resulting in the same type as
+    *     the vector. The fundamental types of the operands [...] will be the
+    *     resulting fundamental type."
+    */
+   if (type_a->is_scalar())
+       return type_b;
+   else
+       return type_a;
 }
 
 static const struct glsl_type *
@@ -6294,6 +6294,18 @@ ast_interface_block::hir(exec_list *instructions,
 
    state->struct_specifier_depth--;
 
+   for (unsigned i = 0; i < num_variables; i++) {
+      if (fields[i].stream != -1 &&
+          (unsigned) fields[i].stream != this->layout.stream) {
+         _mesa_glsl_error(&loc, state,
+                          "stream layout qualifier on "
+                          "interface block member `%s' does not match "
+                          "the interface block (%d vs %d)",
+                          fields[i].name, fields[i].stream,
+                          this->layout.stream);
+      }
+   }
+
    if (!redeclaring_per_vertex) {
       validate_identifier(this->block_name, loc, state);
 
@@ -6634,6 +6646,8 @@ ast_interface_block::hir(exec_list *instructions,
          var->data.explicit_binding = this->layout.flags.q.explicit_binding;
          var->data.binding = this->layout.binding;
 
+         var->data.stream = this->layout.stream;
+
          state->symbols->add_variable(var);
          instructions->push_tail(var);
       }
@@ -6652,6 +6666,7 @@ ast_interface_block::hir(exec_list *instructions,
          var->data.centroid = fields[i].centroid;
          var->data.sample = fields[i].sample;
          var->data.patch = fields[i].patch;
+         var->data.stream = this->layout.stream;
          var->init_interface_type(block_type);
 
          if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform)
@@ -6664,17 +6679,6 @@ ast_interface_block::hir(exec_list *instructions,
             var->data.matrix_layout = fields[i].matrix_layout;
          }
 
-         if (fields[i].stream != -1 &&
-             ((unsigned)fields[i].stream) != this->layout.stream) {
-            _mesa_glsl_error(&loc, state,
-                             "stream layout qualifier on "
-                             "interface block member `%s' does not match "
-                             "the interface block (%d vs %d)",
-                             var->name, fields[i].stream, this->layout.stream);
-         }
-
-         var->data.stream = this->layout.stream;
-
          if (var->data.mode == ir_var_shader_storage) {
             var->data.image_read_only = fields[i].image_read_only;
             var->data.image_write_only = fields[i].image_write_only;
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index cd00f6e085b..2f2e10d7992 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -2609,17 +2609,6 @@ interface_block:
 
       block->layout.is_default_qualifier = false;
 
-      foreach_list_typed (ast_declarator_list, member, link, &block->declarations) {
-         ast_type_qualifier& qualifier = member->type->qualifier;
-         if (qualifier.flags.q.stream && qualifier.stream != block->layout.stream) {
-               _mesa_glsl_error(& @1, state,
-                             "stream layout qualifier on "
-                             "interface block member does not match "
-                             "the interface block (%d vs %d)",
-                             qualifier.stream, block->layout.stream);
-               YYERROR;
-         }
-      }
       $$ = block;
    }
    | memory_qualifier interface_block
diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp
index fe00aa30d07..8183e65d2f5 100644
--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -763,7 +763,8 @@ private:
       /* Assign explicit locations. */
       if (current_var->data.explicit_location) {
          /* Set sequential locations for struct fields. */
-         if (record_type != NULL) {
+         if (current_var->type->without_array()->is_record() ||
+             current_var->type->is_array_of_arrays()) {
             const unsigned entries = MAX2(1, this->uniforms[id].array_elements);
             this->uniforms[id].remap_location =
                this->explicit_location + field_counter;
@@ -1180,7 +1181,8 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
 
    /* Reserve all the explicit locations of the active uniforms. */
    for (unsigned i = 0; i < num_uniforms; i++) {
-      if (uniforms[i].type->is_subroutine())
+      if (uniforms[i].type->is_subroutine() ||
+          uniforms[i].is_shader_storage)
          continue;
 
       if (uniforms[i].remap_location != UNMAPPED_UNIFORM_LOC) {
@@ -1200,8 +1202,10 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
    /* Reserve locations for rest of the uniforms. */
    for (unsigned i = 0; i < num_uniforms; i++) {
 
-      if (uniforms[i].type->is_subroutine())
+      if (uniforms[i].type->is_subroutine() ||
+          uniforms[i].is_shader_storage)
          continue;
+
       /* Built-in uniforms should not get any location. */
       if (uniforms[i].builtin)
          continue;
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 25ca928aa43..07ea0e0c7e5 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -651,7 +651,7 @@ link_invalidate_variable_locations(exec_list *ir)
 
 
 /**
- * Set UsesClipDistance and ClipDistanceArraySize based on the given shader.
+ * Set clip_distance_array_size based on the given shader.
  *
  * Also check for errors based on incorrect usage of gl_ClipVertex and
  * gl_ClipDistance.
@@ -660,10 +660,10 @@ link_invalidate_variable_locations(exec_list *ir)
  */
 static void
 analyze_clip_usage(struct gl_shader_program *prog,
-                   struct gl_shader *shader, GLboolean *UsesClipDistance,
-                   GLuint *ClipDistanceArraySize)
+                   struct gl_shader *shader,
+                   GLuint *clip_distance_array_size)
 {
-   *ClipDistanceArraySize = 0;
+   *clip_distance_array_size = 0;
 
    if (!prog->IsES && prog->Version >= 130) {
       /* From section 7.1 (Vertex Shader Special Variables) of the
@@ -686,13 +686,14 @@ analyze_clip_usage(struct gl_shader_program *prog,
                       _mesa_shader_stage_to_string(shader->Stage));
          return;
       }
-      *UsesClipDistance = clip_distance.variable_found();
-      ir_variable *clip_distance_var =
-         shader->symbols->get_variable("gl_ClipDistance");
-      if (clip_distance_var)
-         *ClipDistanceArraySize = clip_distance_var->type->length;
-   } else {
-      *UsesClipDistance = false;
+
+      if (clip_distance.variable_found()) {
+         ir_variable *clip_distance_var =
+               shader->symbols->get_variable("gl_ClipDistance");
+
+         assert(clip_distance_var);
+         *clip_distance_array_size = clip_distance_var->type->length;
+      }
    }
 }
 
@@ -700,8 +701,7 @@ analyze_clip_usage(struct gl_shader_program *prog,
 /**
  * Verify that a vertex shader executable meets all semantic requirements.
  *
- * Also sets prog->Vert.UsesClipDistance and prog->Vert.ClipDistanceArraySize
- * as a side effect.
+ * Also sets prog->Vert.ClipDistanceArraySize as a side effect.
  *
  * \param shader  Vertex shader executable to be verified
  */
@@ -754,8 +754,7 @@ validate_vertex_shader_executable(struct gl_shader_program *prog,
       }
    }
 
-   analyze_clip_usage(prog, shader, &prog->Vert.UsesClipDistance,
-                      &prog->Vert.ClipDistanceArraySize);
+   analyze_clip_usage(prog, shader, &prog->Vert.ClipDistanceArraySize);
 }
 
 void
@@ -765,8 +764,7 @@ validate_tess_eval_shader_executable(struct gl_shader_program *prog,
    if (shader == NULL)
       return;
 
-   analyze_clip_usage(prog, shader, &prog->TessEval.UsesClipDistance,
-                      &prog->TessEval.ClipDistanceArraySize);
+   analyze_clip_usage(prog, shader, &prog->TessEval.ClipDistanceArraySize);
 }
 
 
@@ -797,8 +795,8 @@ validate_fragment_shader_executable(struct gl_shader_program *prog,
 /**
  * Verify that a geometry shader executable meets all semantic requirements
  *
- * Also sets prog->Geom.VerticesIn, prog->Geom.UsesClipDistance, and
- * prog->Geom.ClipDistanceArraySize as a side effect.
+ * Also sets prog->Geom.VerticesIn and prog->Geom.ClipDistanceArraySize as
+ * a side effect.
  *
  * \param shader Geometry shader executable to be verified
  */
@@ -812,8 +810,7 @@ validate_geometry_shader_executable(struct gl_shader_program *prog,
    unsigned num_vertices = vertices_per_prim(prog->Geom.InputType);
    prog->Geom.VerticesIn = num_vertices;
 
-   analyze_clip_usage(prog, shader, &prog->Geom.UsesClipDistance,
-                      &prog->Geom.ClipDistanceArraySize);
+   analyze_clip_usage(prog, shader, &prog->Geom.ClipDistanceArraySize);
 }
 
 /**
@@ -3117,8 +3114,8 @@ check_explicit_uniform_locations(struct gl_context *ctx,
 
       foreach_in_list(ir_instruction, node, sh->ir) {
          ir_variable *var = node->as_variable();
-         if (var && (var->data.mode == ir_var_uniform || var->data.mode == ir_var_shader_storage) &&
-             var->data.explicit_location) {
+         if (var && (var->data.mode == ir_var_uniform &&
+                     var->data.explicit_location)) {
             bool ret;
             if (var->type->is_subroutine())
                ret = reserve_subroutine_explicit_locations(prog, sh, var);
diff --git a/src/glsl/lower_named_interface_blocks.cpp b/src/glsl/lower_named_interface_blocks.cpp
index 276a2dedf47..114bb5811b4 100644
--- a/src/glsl/lower_named_interface_blocks.cpp
+++ b/src/glsl/lower_named_interface_blocks.cpp
@@ -186,6 +186,7 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
             new_var->data.centroid = iface_t->fields.structure[i].centroid;
             new_var->data.sample = iface_t->fields.structure[i].sample;
             new_var->data.patch = iface_t->fields.structure[i].patch;
+            new_var->data.stream = var->data.stream;
 
             new_var->init_interface_type(iface_t);
             hash_table_insert(interface_namespace, new_var,
diff --git a/src/glsl/lower_subroutine.cpp b/src/glsl/lower_subroutine.cpp
index c1aed61a36a..a0df5e1df81 100644
--- a/src/glsl/lower_subroutine.cpp
+++ b/src/glsl/lower_subroutine.cpp
@@ -84,7 +84,7 @@ lower_subroutine_visitor::visit_leave(ir_call *ir)
          continue;
 
       if (ir->array_idx != NULL)
-         var = new(mem_ctx) ir_dereference_array(ir->sub_var, ir->array_idx->clone(mem_ctx, NULL));
+         var = ir->array_idx->clone(mem_ctx, NULL);
       else
          var = new(mem_ctx) ir_dereference_variable(ir->sub_var);
 
diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp
index e818c048461..57a242b4074 100644
--- a/src/glsl/lower_ubo_reference.cpp
+++ b/src/glsl/lower_ubo_reference.cpp
@@ -238,6 +238,8 @@ interface_field_name(void *mem_ctx, char *base_name, ir_rvalue *d,
       case ir_type_swizzle: {
          ir_swizzle *s = (ir_swizzle *) ir;
          ir = s->val->as_dereference();
+         /* Skip swizzle in the next pass */
+         d = ir;
          break;
       }
 
diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index e57e834d948..129dd02781b 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -164,15 +164,20 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
    shader->info.outputs_written = sh->Program->OutputsWritten;
    shader->info.system_values_read = sh->Program->SystemValuesRead;
    shader->info.uses_texture_gather = sh->Program->UsesGather;
-   shader->info.uses_clip_distance_out = sh->Program->UsesClipDistanceOut;
+   shader->info.uses_clip_distance_out =
+      sh->Program->ClipDistanceArraySize != 0;
    shader->info.separate_shader = shader_prog->SeparateShader;
    shader->info.has_transform_feedback_varyings =
       shader_prog->TransformFeedback.NumVarying > 0;
 
    switch (stage) {
    case MESA_SHADER_GEOMETRY:
+      shader->info.gs.vertices_in = shader_prog->Geom.VerticesIn;
+      shader->info.gs.output_primitive = sh->Geom.OutputType;
       shader->info.gs.vertices_out = sh->Geom.VerticesOut;
       shader->info.gs.invocations = sh->Geom.Invocations;
+      shader->info.gs.uses_end_primitive = shader_prog->Geom.UsesEndPrimitive;
+      shader->info.gs.uses_streams = shader_prog->Geom.UsesStreams;
       break;
 
    case MESA_SHADER_FRAGMENT: {
diff --git a/src/glsl/nir/glsl_types.h b/src/glsl/nir/glsl_types.h
index b83e1ca3d2c..a8eade5f9e1 100644
--- a/src/glsl/nir/glsl_types.h
+++ b/src/glsl/nir/glsl_types.h
@@ -521,6 +521,11 @@ struct glsl_type {
       return base_type == GLSL_TYPE_ARRAY;
    }
 
+   bool is_array_of_arrays() const
+   {
+      return is_array() && fields.array->is_array();
+   }
+
    /**
     * Query whether or not a type is a record
     */
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 9939b9e91a2..d0304bebbb0 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1521,11 +1521,23 @@ typedef struct nir_shader_info {
 
    union {
       struct {
+         /** The number of vertices received per input primitive */
+         unsigned vertices_in;
+
+         /** The output primitive type (GL enum value) */
+         unsigned output_primitive;
+
          /** The maximum number of vertices the geometry shader might write. */
          unsigned vertices_out;
 
          /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */
          unsigned invocations;
+
+         /** Whether or not this shader uses EndPrimitive */
+         bool uses_end_primitive;
+
+         /** Whether or not this shader uses non-zero streams */
+         bool uses_streams;
       } gs;
 
       struct {
@@ -1924,7 +1936,7 @@ void nir_dump_dom_frontier(nir_shader *shader, FILE *fp);
 void nir_dump_cfg_impl(nir_function_impl *impl, FILE *fp);
 void nir_dump_cfg(nir_shader *shader, FILE *fp);
 
-int nir_gs_count_vertices(nir_shader *shader);
+int nir_gs_count_vertices(const nir_shader *shader);
 
 bool nir_split_var_copies(nir_shader *shader);
 
diff --git a/src/glsl/nir/nir_gs_count_vertices.c b/src/glsl/nir/nir_gs_count_vertices.c
index e0bdf170d22..1c360673ddc 100644
--- a/src/glsl/nir/nir_gs_count_vertices.c
+++ b/src/glsl/nir/nir_gs_count_vertices.c
@@ -51,7 +51,7 @@ as_set_vertex_count(nir_instr *instr)
  * counting at the NIR level.
  */
 int
-nir_gs_count_vertices(nir_shader *shader)
+nir_gs_count_vertices(const nir_shader *shader)
 {
    int count = -1;
 
diff --git a/src/mesa/drivers/dri/i965/brw_cfg.h b/src/mesa/drivers/dri/i965/brw_cfg.h
index a09491781e6..a06b0aa1cd0 100644
--- a/src/mesa/drivers/dri/i965/brw_cfg.h
+++ b/src/mesa/drivers/dri/i965/brw_cfg.h
@@ -327,12 +327,12 @@ struct cfg_t {
 #define foreach_inst_in_block_reverse_safe(__type, __inst, __block) \
    foreach_in_list_reverse_safe(__type, __inst, &(__block)->instructions)
 
-#define foreach_inst_in_block_starting_from(__type, __scan_inst, __inst, __block) \
+#define foreach_inst_in_block_starting_from(__type, __scan_inst, __inst) \
    for (__type *__scan_inst = (__type *)__inst->next;          \
         !__scan_inst->is_tail_sentinel();                      \
         __scan_inst = (__type *)__scan_inst->next)
 
-#define foreach_inst_in_block_reverse_starting_from(__type, __scan_inst, __inst, __block) \
+#define foreach_inst_in_block_reverse_starting_from(__type, __scan_inst, __inst) \
    for (__type *__scan_inst = (__type *)__inst->prev;          \
         !__scan_inst->is_head_sentinel();                      \
         __scan_inst = (__type *)__scan_inst->prev)
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index f6d5ab87be9..d9967143d8a 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -90,6 +90,7 @@ struct brw_compiler {
    void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
 
    bool scalar_vs;
+   bool scalar_gs;
    struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
 };
 
@@ -488,6 +489,9 @@ struct brw_vue_prog_data {
    struct brw_stage_prog_data base;
    struct brw_vue_map vue_map;
 
+   /** Should the hardware deliver input VUE handles for URB pull loads? */
+   bool include_vue_handles;
+
    GLuint urb_read_length;
    GLuint total_grf;
 
@@ -596,21 +600,6 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
                unsigned *final_assembly_size,
                char **error_str);
 
-/**
- * Scratch data used when compiling a GLSL geometry shader.
- */
-struct brw_gs_compile
-{
-   struct brw_gs_prog_key key;
-   struct brw_gs_prog_data prog_data;
-   struct brw_vue_map input_vue_map;
-
-   struct brw_geometry_program *gp;
-
-   unsigned control_data_bits_per_vertex;
-   unsigned control_data_header_size_bits;
-};
-
 /**
  * Compile a vertex shader.
  *
@@ -618,10 +607,11 @@ struct brw_gs_compile
  */
 const unsigned *
 brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
-               struct brw_gs_compile *c,
+               void *mem_ctx,
+               const struct brw_gs_prog_key *key,
+               struct brw_gs_prog_data *prog_data,
                const struct nir_shader *shader,
                struct gl_shader_program *shader_prog,
-               void *mem_ctx,
                int shader_time_index,
                unsigned *final_assembly_size,
                char **error_str);
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index a8cde20e045..169d092f90e 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -918,8 +918,8 @@ enum opcode {
     * Source 0: [required] Color 0.
     * Source 1: [optional] Color 1 (for dual source blend messages).
     * Source 2: [optional] Src0 Alpha.
-    * Source 3: [optional] Source Depth (passthrough from the thread payload).
-    * Source 4: [optional] Destination Depth (gl_FragDepth).
+    * Source 3: [optional] Source Depth (gl_FragDepth)
+    * Source 4: [optional (gen4-5)] Destination Depth passthrough from thread
     * Source 5: [optional] Sample Mask (gl_SampleMask).
     * Source 6: [required] Number of color components (as a UD immediate).
     */
@@ -1033,7 +1033,19 @@ enum opcode {
    SHADER_OPCODE_GEN4_SCRATCH_WRITE,
    SHADER_OPCODE_GEN7_SCRATCH_READ,
 
+   /**
+    * Gen8+ SIMD8 URB Read message.
+    *
+    * Source 0: The header register, containing URB handles (g1).
+    *
+    * Currently only supports constant offsets, in inst->offset.
+    */
+   SHADER_OPCODE_URB_READ_SIMD8,
+
    SHADER_OPCODE_URB_WRITE_SIMD8,
+   SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT,
+   SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
+   SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT,
 
    /**
     * Return the index of an arbitrary live channel (i.e. one of the channels
@@ -2385,7 +2397,7 @@ enum brw_pixel_shader_coverage_mask_mode {
 # define GEN8_PSX_ATTRIBUTE_ENABLE                      (1 << 8)
 # define GEN8_PSX_SHADER_DISABLES_ALPHA_TO_COVERAGE     (1 << 7)
 # define GEN8_PSX_SHADER_IS_PER_SAMPLE                  (1 << 6)
-# define GEN8_PSX_SHADER_COMPUTES_STENCIL               (1 << 5)
+# define GEN9_PSX_SHADER_COMPUTES_STENCIL               (1 << 5)
 # define GEN9_PSX_SHADER_PULLS_BARY                     (1 << 3)
 # define GEN8_PSX_SHADER_HAS_UAV                        (1 << 2)
 # define GEN8_PSX_SHADER_USES_INPUT_COVERAGE_MASK       (1 << 1)
diff --git a/src/mesa/drivers/dri/i965/brw_eu_compact.c b/src/mesa/drivers/dri/i965/brw_eu_compact.c
index b798931140f..f787ea3d4f8 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_compact.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_compact.c
@@ -690,7 +690,7 @@ set_control_index(const struct brw_device_info *devinfo,
 
    for (int i = 0; i < 32; i++) {
       if (control_index_table[i] == uncompacted) {
-         brw_compact_inst_set_control_index(dst, i);
+         brw_compact_inst_set_control_index(devinfo, dst, i);
 	 return true;
       }
    }
@@ -711,7 +711,7 @@ set_datatype_index(const struct brw_device_info *devinfo, brw_compact_inst *dst,
 
    for (int i = 0; i < 32; i++) {
       if (datatype_table[i] == uncompacted) {
-         brw_compact_inst_set_datatype_index(dst, i);
+         brw_compact_inst_set_datatype_index(devinfo, dst, i);
 	 return true;
       }
    }
@@ -732,7 +732,7 @@ set_subreg_index(const struct brw_device_info *devinfo, brw_compact_inst *dst,
 
    for (int i = 0; i < 32; i++) {
       if (subreg_table[i] == uncompacted) {
-         brw_compact_inst_set_subreg_index(dst, i);
+         brw_compact_inst_set_subreg_index(devinfo, dst, i);
 	 return true;
       }
    }
@@ -764,7 +764,7 @@ set_src0_index(const struct brw_device_info *devinfo,
    if (!get_src_index(uncompacted, &compacted))
       return false;
 
-   brw_compact_inst_set_src0_index(dst, compacted);
+   brw_compact_inst_set_src0_index(devinfo, dst, compacted);
 
    return true;
 }
@@ -784,7 +784,7 @@ set_src1_index(const struct brw_device_info *devinfo, brw_compact_inst *dst,
          return false;
    }
 
-   brw_compact_inst_set_src1_index(dst, compacted);
+   brw_compact_inst_set_src1_index(devinfo, dst, compacted);
 
    return true;
 }
@@ -804,7 +804,7 @@ set_3src_control_index(const struct brw_device_info *devinfo,
 
    for (unsigned i = 0; i < ARRAY_SIZE(gen8_3src_control_index_table); i++) {
       if (gen8_3src_control_index_table[i] == uncompacted) {
-         brw_compact_inst_set_3src_control_index(dst, i);
+         brw_compact_inst_set_3src_control_index(devinfo, dst, i);
 	 return true;
       }
    }
@@ -838,7 +838,7 @@ set_3src_source_index(const struct brw_device_info *devinfo,
 
    for (unsigned i = 0; i < ARRAY_SIZE(gen8_3src_source_index_table); i++) {
       if (gen8_3src_source_index_table[i] == uncompacted) {
-         brw_compact_inst_set_3src_source_index(dst, i);
+         brw_compact_inst_set_3src_source_index(devinfo, dst, i);
 	 return true;
       }
    }
@@ -909,7 +909,7 @@ brw_try_compact_3src_instruction(const struct brw_device_info *devinfo,
       return false;
 
 #define compact(field) \
-   brw_compact_inst_set_3src_##field(dst, brw_inst_3src_##field(devinfo, src))
+   brw_compact_inst_set_3src_##field(devinfo, dst, brw_inst_3src_##field(devinfo, src))
 
    compact(opcode);
 
@@ -921,7 +921,7 @@ brw_try_compact_3src_instruction(const struct brw_device_info *devinfo,
 
    compact(dst_reg_nr);
    compact(src0_rep_ctrl);
-   brw_compact_inst_set_3src_cmpt_control(dst, true);
+   brw_compact_inst_set_3src_cmpt_control(devinfo, dst, true);
    compact(debug_control);
    compact(saturate);
    compact(src1_rep_ctrl);
@@ -1003,36 +1003,52 @@ brw_try_compact_instruction(const struct brw_device_info *devinfo,
 
    memset(&temp, 0, sizeof(temp));
 
-   brw_compact_inst_set_opcode(&temp, brw_inst_opcode(devinfo, src));
-   brw_compact_inst_set_debug_control(&temp, brw_inst_debug_control(devinfo, src));
+#define compact(field) \
+   brw_compact_inst_set_##field(devinfo, &temp, brw_inst_##field(devinfo, src))
+
+   compact(opcode);
+   compact(debug_control);
+
    if (!set_control_index(devinfo, &temp, src))
       return false;
    if (!set_datatype_index(devinfo, &temp, src))
       return false;
    if (!set_subreg_index(devinfo, &temp, src, is_immediate))
       return false;
-   brw_compact_inst_set_acc_wr_control(&temp,
-                                       brw_inst_acc_wr_control(devinfo, src));
-   brw_compact_inst_set_cond_modifier(&temp,
-                                      brw_inst_cond_modifier(devinfo, src));
+
+   if (devinfo->gen >= 6) {
+      compact(acc_wr_control);
+   } else {
+      compact(mask_control_ex);
+   }
+
+   compact(cond_modifier);
+
    if (devinfo->gen <= 6)
-      brw_compact_inst_set_flag_subreg_nr(&temp,
-                                          brw_inst_flag_subreg_nr(devinfo, src));
-   brw_compact_inst_set_cmpt_control(&temp, true);
+      compact(flag_subreg_nr);
+
+   brw_compact_inst_set_cmpt_control(devinfo, &temp, true);
+
    if (!set_src0_index(devinfo, &temp, src))
       return false;
    if (!set_src1_index(devinfo, &temp, src, is_immediate))
       return false;
-   brw_compact_inst_set_dst_reg_nr(&temp, brw_inst_dst_da_reg_nr(devinfo, src));
-   brw_compact_inst_set_src0_reg_nr(&temp, brw_inst_src0_da_reg_nr(devinfo, src));
+
+   brw_compact_inst_set_dst_reg_nr(devinfo, &temp,
+                                   brw_inst_dst_da_reg_nr(devinfo, src));
+   brw_compact_inst_set_src0_reg_nr(devinfo, &temp,
+                                    brw_inst_src0_da_reg_nr(devinfo, src));
+
    if (is_immediate) {
-      brw_compact_inst_set_src1_reg_nr(&temp,
+      brw_compact_inst_set_src1_reg_nr(devinfo, &temp,
                                        brw_inst_imm_ud(devinfo, src) & 0xff);
    } else {
-      brw_compact_inst_set_src1_reg_nr(&temp,
+      brw_compact_inst_set_src1_reg_nr(devinfo, &temp,
                                        brw_inst_src1_da_reg_nr(devinfo, src));
    }
 
+#undef compact
+
    *dst = temp;
 
    return true;
@@ -1043,7 +1059,7 @@ set_uncompacted_control(const struct brw_device_info *devinfo, brw_inst *dst,
                         brw_compact_inst *src)
 {
    uint32_t uncompacted =
-      control_index_table[brw_compact_inst_control_index(src)];
+      control_index_table[brw_compact_inst_control_index(devinfo, src)];
 
    if (devinfo->gen >= 8) {
       brw_inst_set_bits(dst, 33, 31, (uncompacted >> 16));
@@ -1064,7 +1080,8 @@ static void
 set_uncompacted_datatype(const struct brw_device_info *devinfo, brw_inst *dst,
                          brw_compact_inst *src)
 {
-   uint32_t uncompacted = datatype_table[brw_compact_inst_datatype_index(src)];
+   uint32_t uncompacted =
+      datatype_table[brw_compact_inst_datatype_index(devinfo, src)];
 
    if (devinfo->gen >= 8) {
       brw_inst_set_bits(dst, 63, 61, (uncompacted >> 18));
@@ -1080,7 +1097,8 @@ static void
 set_uncompacted_subreg(const struct brw_device_info *devinfo, brw_inst *dst,
                        brw_compact_inst *src)
 {
-   uint16_t uncompacted = subreg_table[brw_compact_inst_subreg_index(src)];
+   uint16_t uncompacted =
+      subreg_table[brw_compact_inst_subreg_index(devinfo, src)];
 
    brw_inst_set_bits(dst, 100, 96, (uncompacted >> 10));
    brw_inst_set_bits(dst,  68, 64, (uncompacted >>  5) & 0x1f);
@@ -1091,7 +1109,7 @@ static void
 set_uncompacted_src0(const struct brw_device_info *devinfo, brw_inst *dst,
                      brw_compact_inst *src)
 {
-   uint32_t compacted = brw_compact_inst_src0_index(src);
+   uint32_t compacted = brw_compact_inst_src0_index(devinfo, src);
    uint16_t uncompacted = src_index_table[compacted];
 
    brw_inst_set_bits(dst, 88, 77, uncompacted);
@@ -1102,11 +1120,12 @@ set_uncompacted_src1(const struct brw_device_info *devinfo, brw_inst *dst,
                      brw_compact_inst *src, bool is_immediate)
 {
    if (is_immediate) {
-      signed high5 = brw_compact_inst_src1_index(src);
+      signed high5 = brw_compact_inst_src1_index(devinfo, src);
       /* Replicate top bit of src1_index into high 20 bits of the immediate. */
       brw_inst_set_imm_ud(devinfo, dst, (high5 << 27) >> 19);
    } else {
-      uint16_t uncompacted = src_index_table[brw_compact_inst_src1_index(src)];
+      uint16_t uncompacted =
+         src_index_table[brw_compact_inst_src1_index(devinfo, src)];
 
       brw_inst_set_bits(dst, 120, 109, uncompacted);
    }
@@ -1118,7 +1137,7 @@ set_uncompacted_3src_control_index(const struct brw_device_info *devinfo,
 {
    assert(devinfo->gen >= 8);
 
-   uint32_t compacted = brw_compact_inst_3src_control_index(src);
+   uint32_t compacted = brw_compact_inst_3src_control_index(devinfo, src);
    uint32_t uncompacted = gen8_3src_control_index_table[compacted];
 
    brw_inst_set_bits(dst, 34, 32, (uncompacted >> 21) & 0x7);
@@ -1134,7 +1153,7 @@ set_uncompacted_3src_source_index(const struct brw_device_info *devinfo,
 {
    assert(devinfo->gen >= 8);
 
-   uint32_t compacted = brw_compact_inst_3src_source_index(src);
+   uint32_t compacted = brw_compact_inst_3src_source_index(devinfo, src);
    uint64_t uncompacted = gen8_3src_source_index_table[compacted];
 
    brw_inst_set_bits(dst,  83,  83, (uncompacted >> 43) & 0x1);
@@ -1160,7 +1179,7 @@ brw_uncompact_3src_instruction(const struct brw_device_info *devinfo,
    assert(devinfo->gen >= 8);
 
 #define uncompact(field) \
-   brw_inst_set_3src_##field(devinfo, dst, brw_compact_inst_3src_##field(src))
+   brw_inst_set_3src_##field(devinfo, dst, brw_compact_inst_3src_##field(devinfo, src))
 
    uncompact(opcode);
 
@@ -1190,13 +1209,16 @@ brw_uncompact_instruction(const struct brw_device_info *devinfo, brw_inst *dst,
 {
    memset(dst, 0, sizeof(*dst));
 
-   if (devinfo->gen >= 8 && is_3src(brw_compact_inst_3src_opcode(src))) {
+   if (devinfo->gen >= 8 && is_3src(brw_compact_inst_3src_opcode(devinfo, src))) {
       brw_uncompact_3src_instruction(devinfo, dst, src);
       return;
    }
 
-   brw_inst_set_opcode(devinfo, dst, brw_compact_inst_opcode(src));
-   brw_inst_set_debug_control(devinfo, dst, brw_compact_inst_debug_control(src));
+#define uncompact(field) \
+   brw_inst_set_##field(devinfo, dst, brw_compact_inst_##field(devinfo, src))
+
+   uncompact(opcode);
+   uncompact(debug_control);
 
    set_uncompacted_control(devinfo, dst, src);
    set_uncompacted_datatype(devinfo, dst, src);
@@ -1206,22 +1228,36 @@ brw_uncompact_instruction(const struct brw_device_info *devinfo, brw_inst *dst,
                        brw_inst_src1_reg_file(devinfo, dst) == BRW_IMMEDIATE_VALUE;
 
    set_uncompacted_subreg(devinfo, dst, src);
-   brw_inst_set_acc_wr_control(devinfo, dst, brw_compact_inst_acc_wr_control(src));
-   brw_inst_set_cond_modifier(devinfo, dst, brw_compact_inst_cond_modifier(src));
+
+   if (devinfo->gen >= 6) {
+      uncompact(acc_wr_control);
+   } else {
+      uncompact(mask_control_ex);
+   }
+
+   uncompact(cond_modifier);
+
    if (devinfo->gen <= 6)
-      brw_inst_set_flag_subreg_nr(devinfo, dst,
-                                  brw_compact_inst_flag_subreg_nr(src));
+      uncompact(flag_subreg_nr);
+
    set_uncompacted_src0(devinfo, dst, src);
    set_uncompacted_src1(devinfo, dst, src, is_immediate);
-   brw_inst_set_dst_da_reg_nr(devinfo, dst, brw_compact_inst_dst_reg_nr(src));
-   brw_inst_set_src0_da_reg_nr(devinfo, dst, brw_compact_inst_src0_reg_nr(src));
+
+   brw_inst_set_dst_da_reg_nr(devinfo, dst,
+                              brw_compact_inst_dst_reg_nr(devinfo, src));
+   brw_inst_set_src0_da_reg_nr(devinfo, dst,
+                               brw_compact_inst_src0_reg_nr(devinfo, src));
+
    if (is_immediate) {
       brw_inst_set_imm_ud(devinfo, dst,
                           brw_inst_imm_ud(devinfo, dst) |
-                          brw_compact_inst_src1_reg_nr(src));
+                          brw_compact_inst_src1_reg_nr(devinfo, src));
    } else {
-      brw_inst_set_src1_da_reg_nr(devinfo, dst, brw_compact_inst_src1_reg_nr(src));
+      brw_inst_set_src1_da_reg_nr(devinfo, dst,
+                                  brw_compact_inst_src1_reg_nr(devinfo, src));
    }
+
+#undef uncompact
 }
 
 void brw_debug_compact_uncompact(const struct brw_device_info *devinfo,
@@ -1415,8 +1451,8 @@ brw_compact_instructions(struct brw_codegen *p, int start_offset,
          if ((offset & sizeof(brw_compact_inst)) != 0 && devinfo->is_g4x){
             brw_compact_inst *align = store + offset;
             memset(align, 0, sizeof(*align));
-            brw_compact_inst_set_opcode(align, BRW_OPCODE_NENOP);
-            brw_compact_inst_set_cmpt_control(align, true);
+            brw_compact_inst_set_opcode(devinfo, align, BRW_OPCODE_NENOP);
+            brw_compact_inst_set_cmpt_control(devinfo, align, true);
             offset += sizeof(brw_compact_inst);
             compacted_count--;
             compacted_counts[src_offset / sizeof(brw_inst)] = compacted_count;
@@ -1524,8 +1560,8 @@ brw_compact_instructions(struct brw_codegen *p, int start_offset,
    if (p->next_insn_offset & sizeof(brw_compact_inst)) {
       brw_compact_inst *align = store + offset;
       memset(align, 0, sizeof(*align));
-      brw_compact_inst_set_opcode(align, BRW_OPCODE_NOP);
-      brw_compact_inst_set_cmpt_control(align, true);
+      brw_compact_inst_set_opcode(devinfo, align, BRW_OPCODE_NOP);
+      brw_compact_inst_set_cmpt_control(devinfo, align, true);
       p->next_insn_offset += sizeof(brw_compact_inst);
    }
    p->nr_insn = p->next_insn_offset / sizeof(brw_inst);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 0562c5a9981..8320cd77299 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -281,6 +281,10 @@ fs_inst::is_send_from_grf() const
    case SHADER_OPCODE_TYPED_SURFACE_READ:
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+   case SHADER_OPCODE_URB_READ_SIMD8:
       return true;
    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
       return src[1].file == GRF;
@@ -782,6 +786,10 @@ fs_inst::regs_read(int arg) const
    switch (opcode) {
    case FS_OPCODE_FB_WRITE:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+   case SHADER_OPCODE_URB_READ_SIMD8:
    case SHADER_OPCODE_UNTYPED_ATOMIC:
    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
@@ -911,6 +919,9 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
    case SHADER_OPCODE_TYPED_SURFACE_READ:
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
@@ -2239,13 +2250,15 @@ fs_visitor::opt_sampler_eot()
    if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
       return false;
 
-   /* This optimisation doesn't seem to work for textureGather for some
-    * reason. I can't find any documentation or known workarounds to indicate
-    * that this is expected, but considering that it is probably pretty
-    * unlikely that a shader would directly write out the results from
-    * textureGather we might as well just disable it.
+   /* 3D Sampler » Messages » Message Format
+    *
+    * “Response Length of zero is allowed on all SIMD8* and SIMD16* sampler
+    *  messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*”
     */
-   if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
+   if (tex_inst->opcode == SHADER_OPCODE_TXS ||
+       tex_inst->opcode == SHADER_OPCODE_SAMPLEINFO ||
+       tex_inst->opcode == SHADER_OPCODE_LOD ||
+       tex_inst->opcode == SHADER_OPCODE_TG4 ||
        tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
       return false;
 
@@ -2457,7 +2470,7 @@ fs_visitor::compute_to_mrf()
       /* Found a move of a GRF to a MRF.  Let's see if we can go
        * rewrite the thing that made this GRF to write into the MRF.
        */
-      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
+      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
 	 if (scan_inst->dst.file == GRF &&
 	     scan_inst->dst.reg == inst->src[0].reg) {
 	    /* Found the last thing to write our reg we want to turn
@@ -2805,7 +2818,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
     * we assume that there are no outstanding dependencies on entry to the
     * program.
     */
-   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
+   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
       /* If we hit control flow, assume that there *are* outstanding
        * dependencies, and force their cleanup before our instruction.
        */
@@ -2871,7 +2884,7 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
    /* Walk forwards looking for writes to registers we're writing which aren't
     * read before being written.
     */
-   foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
+   foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
       /* If we hit control flow, force resolve all remaining dependencies. */
       if (block->end() == scan_inst) {
          for (int i = 0; i < write_len; i++) {
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 171338dcc0b..50e98becf03 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -62,6 +62,8 @@ namespace brw {
    class fs_live_variables;
 }
 
+struct brw_gs_compile;
+
 static inline fs_reg
 offset(fs_reg reg, const brw::fs_builder& bld, unsigned delta)
 {
@@ -99,7 +101,12 @@ public:
               const nir_shader *shader,
               unsigned dispatch_width,
               int shader_time_index);
-
+   fs_visitor(const struct brw_compiler *compiler, void *log_data,
+              void *mem_ctx,
+              struct brw_gs_compile *gs_compile,
+              struct brw_gs_prog_data *prog_data,
+              const nir_shader *shader);
+   void init();
    ~fs_visitor();
 
    fs_reg vgrf(const glsl_type *const type);
@@ -298,6 +305,8 @@ public:
    const void *const key;
    const struct brw_sampler_prog_key_data *key_tex;
 
+   struct brw_gs_compile *gs_compile;
+
    struct brw_stage_prog_data *prog_data;
    struct gl_program *prog;
 
@@ -415,6 +424,7 @@ private:
                       struct brw_reg implied_header,
                       GLuint nr);
    void generate_fb_write(fs_inst *inst, struct brw_reg payload);
+   void generate_urb_read(fs_inst *inst, struct brw_reg dst, struct brw_reg payload);
    void generate_urb_write(fs_inst *inst, struct brw_reg payload);
    void generate_cs_terminate(fs_inst *inst, struct brw_reg payload);
    void generate_barrier(fs_inst *inst, struct brw_reg src);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
index 469f2ea4e16..883e8d2a49f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
@@ -87,8 +87,7 @@ opt_cmod_propagation_local(bblock_t *block)
          continue;
 
       bool read_flag = false;
-      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst,
-                                                  block) {
+      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
          if (scan_inst->overwrites_reg(inst->src[0])) {
             if (scan_inst->is_partial_write() ||
                 scan_inst->dst.reg_offset != inst->src[0].reg_offset)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 13c495cd395..bb7e792044f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -354,6 +354,28 @@ fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
    }
 }
 
+void
+fs_generator::generate_urb_read(fs_inst *inst,
+                                struct brw_reg dst,
+                                struct brw_reg header)
+{
+   assert(header.file == BRW_GENERAL_REGISTER_FILE);
+   assert(header.type == BRW_REGISTER_TYPE_UD);
+
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, dst);
+   brw_set_src0(p, send, header);
+   brw_set_src1(p, send, brw_imm_ud(0u));
+
+   brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
+   brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);
+
+   brw_inst_set_mlen(p->devinfo, send, inst->mlen);
+   brw_inst_set_rlen(p->devinfo, send, inst->regs_written);
+   brw_inst_set_header_present(p->devinfo, send, true);
+   brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
+}
+
 void
 fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
 {
@@ -368,6 +390,14 @@ fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
    brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
    brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);
 
+   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
+       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
+      brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);
+
+   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
+       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
+      brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);
+
    brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
    brw_inst_set_rlen(p->devinfo, insn, 0);
    brw_inst_set_eot(p->devinfo, insn, inst->eot);
@@ -2001,7 +2031,14 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          fill_count++;
 	 break;
 
+      case SHADER_OPCODE_URB_READ_SIMD8:
+         generate_urb_read(inst, dst, src[0]);
+         break;
+
       case SHADER_OPCODE_URB_WRITE_SIMD8:
+      case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
 	 generate_urb_write(inst, src[0]);
 	 break;
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index feedbfbb2e3..7b5a0482519 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -30,6 +30,7 @@
 #include "brw_fs_surface_builder.h"
 #include "brw_nir.h"
 #include "brw_fs_surface_builder.h"
+#include "brw_vec4_gs_visitor.h"
 
 using namespace brw;
 using namespace brw::surface_access;
@@ -188,6 +189,18 @@ emit_system_values_block(nir_block *block, void *void_visitor)
             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
          break;
 
+      case nir_intrinsic_load_invocation_id:
+         assert(v->stage == MESA_SHADER_GEOMETRY);
+         reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
+         if (reg->file == BAD_FILE) {
+            const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
+            fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+            fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+            abld.SHR(iid, g1, fs_reg(27u));
+            *reg = iid;
+         }
+         break;
+
       case nir_intrinsic_load_sample_pos:
          assert(v->stage == MESA_SHADER_FRAGMENT);
          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
@@ -1367,9 +1380,17 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_load_vertex_id:
       unreachable("should be lowered by lower_vertex_id()");
 
+   case nir_intrinsic_load_primitive_id:
+      assert(stage == MESA_SHADER_GEOMETRY);
+      assert(((struct brw_gs_prog_data *)prog_data)->include_primitive_id);
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
+              retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
+      break;
+
    case nir_intrinsic_load_vertex_id_zero_base:
    case nir_intrinsic_load_base_vertex:
    case nir_intrinsic_load_instance_id:
+   case nir_intrinsic_load_invocation_id:
    case nir_intrinsic_load_sample_mask_in:
    case nir_intrinsic_load_sample_id: {
       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
index 8792a8c7b1d..862e3245d43 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
@@ -64,7 +64,7 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
       int src_end_ip = v->live_intervals->end[src_var];
 
       bool interfered = false;
-      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
+      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
          if (scan_inst->overwrites_reg(inst->src[0])) {
             if (scan_inst->is_partial_write() ||
                 (scan_inst->dst.type != inst->dst.type &&
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index f825fed4daf..7cc4f3c927a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -41,6 +41,7 @@
 #include "brw_wm.h"
 #include "brw_cs.h"
 #include "brw_vec4.h"
+#include "brw_vec4_gs_visitor.h"
 #include "brw_fs.h"
 #include "main/uniforms.h"
 #include "glsl/nir/glsl_types.h"
@@ -868,13 +869,14 @@ void
 fs_visitor::emit_urb_writes()
 {
    int slot, urb_offset, length;
-   struct brw_vs_prog_data *vs_prog_data =
-      (struct brw_vs_prog_data *) prog_data;
-   const struct brw_vs_prog_key *key =
+   int starting_urb_offset = 0;
+   const struct brw_vue_prog_data *vue_prog_data =
+      (const struct brw_vue_prog_data *) this->prog_data;
+   const struct brw_vs_prog_key *vs_key =
       (const struct brw_vs_prog_key *) this->key;
    const GLbitfield64 psiz_mask =
       VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
-   const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
+   const struct brw_vue_map *vue_map = &vue_prog_data->vue_map;
    bool flush;
    fs_reg sources[8];
 
@@ -900,8 +902,21 @@ fs_visitor::emit_urb_writes()
       return;
    }
 
+   if (stage == MESA_SHADER_GEOMETRY) {
+      const struct brw_gs_prog_data *gs_prog_data =
+         (const struct brw_gs_prog_data *) prog_data;
+
+      /* We need to increment the Global Offset to skip over the control data
+       * header and the extra "Vertex Count" field (1 HWord) at the beginning
+       * of the VUE.  We're counting in OWords, so the units are doubled.
+       */
+      starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
+      if (gs_prog_data->static_vertex_count == -1)
+         starting_urb_offset += 2;
+   }
+
    length = 0;
-   urb_offset = 0;
+   urb_offset = starting_urb_offset;
    flush = false;
    for (slot = 0; slot < vue_map->num_slots; slot++) {
       int varying = vue_map->slot_to_varying[slot];
@@ -961,11 +976,11 @@ fs_visitor::emit_urb_writes()
             break;
          }
 
-         if ((varying == VARYING_SLOT_COL0 ||
+         if (stage == MESA_SHADER_VERTEX && vs_key->clamp_vertex_color &&
+             (varying == VARYING_SLOT_COL0 ||
               varying == VARYING_SLOT_COL1 ||
               varying == VARYING_SLOT_BFC0 ||
-              varying == VARYING_SLOT_BFC1) &&
-             key->clamp_vertex_color) {
+              varying == VARYING_SLOT_BFC1)) {
             /* We need to clamp these guys, so do a saturating MOV into a
              * temp register and use that for the payload.
              */
@@ -1005,10 +1020,10 @@ fs_visitor::emit_urb_writes()
 
          fs_inst *inst =
             abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
-         inst->eot = last;
+         inst->eot = last && stage == MESA_SHADER_VERTEX;
          inst->mlen = length + 1;
          inst->offset = urb_offset;
-         urb_offset = slot + 1;
+         urb_offset = starting_urb_offset + slot + 1;
          length = 0;
          flush = false;
       }
@@ -1071,11 +1086,33 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
                        unsigned dispatch_width,
                        int shader_time_index)
    : backend_shader(compiler, log_data, mem_ctx, shader, prog_data),
-     key(key), prog_data(prog_data), prog(prog),
+     key(key), gs_compile(NULL), prog_data(prog_data), prog(prog),
      dispatch_width(dispatch_width),
      shader_time_index(shader_time_index),
-     promoted_constants(0),
      bld(fs_builder(this, dispatch_width).at_end())
+{
+   init();
+}
+
+fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
+                       void *mem_ctx,
+                       struct brw_gs_compile *c,
+                       struct brw_gs_prog_data *prog_data,
+                       const nir_shader *shader)
+   : backend_shader(compiler, log_data, mem_ctx, shader,
+                    &prog_data->base.base),
+     key(&c->key), gs_compile(c),
+     prog_data(&prog_data->base.base), prog(NULL),
+     dispatch_width(8),
+     shader_time_index(ST_GS),
+     bld(fs_builder(this, dispatch_width).at_end())
+{
+   init();
+}
+
+
+void
+fs_visitor::init()
 {
    switch (stage) {
    case MESA_SHADER_FRAGMENT:
@@ -1094,6 +1131,8 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
       unreachable("unhandled shader stage");
    }
 
+   this->prog_data = this->stage_prog_data;
+
    this->failed = false;
    this->simd16_unsupported = false;
    this->no16_msg = NULL;
@@ -1119,6 +1158,8 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
    this->pull_constant_loc = NULL;
    this->push_constant_loc = NULL;
 
+   this->promoted_constants = 0;
+
    this->spilled_any_registers = false;
    this->do_dual_src = false;
 
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 10a7f28fdab..ed0890f430f 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -57,20 +57,14 @@ brw_codegen_gs_prog(struct brw_context *brw,
                     struct brw_geometry_program *gp,
                     struct brw_gs_prog_key *key)
 {
+   struct brw_compiler *compiler = brw->intelScreen->compiler;
    struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
    struct brw_stage_state *stage_state = &brw->gs.base;
-   struct brw_gs_compile c;
-   memset(&c, 0, sizeof(c));
-   c.key = *key;
-   c.gp = gp;
-
-   c.prog_data.include_primitive_id =
-      (gp->program.Base.InputsRead & VARYING_BIT_PRIMITIVE_ID) != 0;
-
-   c.prog_data.invocations = gp->program.Invocations;
+   struct brw_gs_prog_data prog_data;
+   memset(&prog_data, 0, sizeof(prog_data));
 
    assign_gs_binding_table_offsets(brw->intelScreen->devinfo, prog,
-                                   &gp->program.Base, &c.prog_data);
+                                   &gp->program.Base, &prog_data);
 
    /* Allocate the references to the uniforms that will end up in the
     * prog_data associated with the compiled program, and which will be freed
@@ -83,215 +77,24 @@ brw_codegen_gs_prog(struct brw_context *brw,
    struct gl_shader *gs = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
    int param_count = gp->program.Base.nir->num_uniforms * 4;
 
-   c.prog_data.base.base.param =
+   prog_data.base.base.param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
-   c.prog_data.base.base.pull_param =
+   prog_data.base.base.pull_param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
-   c.prog_data.base.base.image_param =
+   prog_data.base.base.image_param =
       rzalloc_array(NULL, struct brw_image_param, gs->NumImages);
-   c.prog_data.base.base.nr_params = param_count;
-   c.prog_data.base.base.nr_image_params = gs->NumImages;
+   prog_data.base.base.nr_params = param_count;
+   prog_data.base.base.nr_image_params = gs->NumImages;
 
    brw_nir_setup_glsl_uniforms(gp->program.Base.nir, prog, &gp->program.Base,
-                               &c.prog_data.base.base, false);
-
-   if (brw->gen >= 8) {
-      c.prog_data.static_vertex_count = !gp->program.Base.nir ? -1 :
-         nir_gs_count_vertices(gp->program.Base.nir);
-   }
-
-   if (brw->gen >= 7) {
-      if (gp->program.OutputType == GL_POINTS) {
-         /* When the output type is points, the geometry shader may output data
-          * to multiple streams, and EndPrimitive() has no effect.  So we
-          * configure the hardware to interpret the control data as stream ID.
-          */
-         c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
-
-         /* We only have to emit control bits if we are using streams */
-         if (prog->Geom.UsesStreams)
-            c.control_data_bits_per_vertex = 2;
-         else
-            c.control_data_bits_per_vertex = 0;
-      } else {
-         /* When the output type is triangle_strip or line_strip, EndPrimitive()
-          * may be used to terminate the current strip and start a new one
-          * (similar to primitive restart), and outputting data to multiple
-          * streams is not supported.  So we configure the hardware to interpret
-          * the control data as EndPrimitive information (a.k.a. "cut bits").
-          */
-         c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
-
-         /* We only need to output control data if the shader actually calls
-          * EndPrimitive().
-          */
-         c.control_data_bits_per_vertex = gp->program.UsesEndPrimitive ? 1 : 0;
-      }
-   } else {
-      /* There are no control data bits in gen6. */
-      c.control_data_bits_per_vertex = 0;
-
-      /* If it is using transform feedback, enable it */
-      if (prog->TransformFeedback.NumVarying)
-         c.prog_data.gen6_xfb_enabled = true;
-      else
-         c.prog_data.gen6_xfb_enabled = false;
-   }
-   c.control_data_header_size_bits =
-      gp->program.VerticesOut * c.control_data_bits_per_vertex;
-
-   /* 1 HWORD = 32 bytes = 256 bits */
-   c.prog_data.control_data_header_size_hwords =
-      ALIGN(c.control_data_header_size_bits, 256) / 256;
+                               &prog_data.base.base, compiler->scalar_gs);
 
    GLbitfield64 outputs_written = gp->program.Base.OutputsWritten;
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &c.prog_data.base.vue_map, outputs_written,
+                       &prog_data.base.vue_map, outputs_written,
                        prog ? prog->SeparateShader : false);
 
-   /* Compute the output vertex size.
-    *
-    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
-    * Size (p168):
-    *
-    *     [0,62] indicating [1,63] 16B units
-    *
-    *     Specifies the size of each vertex stored in the GS output entry
-    *     (following any Control Header data) as a number of 128-bit units
-    *     (minus one).
-    *
-    *     Programming Restrictions: The vertex size must be programmed as a
-    *     multiple of 32B units with the following exception: Rendering is
-    *     disabled (as per SOL stage state) and the vertex size output by the
-    *     GS thread is 16B.
-    *
-    *     If rendering is enabled (as per SOL state) the vertex size must be
-    *     programmed as a multiple of 32B units. In other words, the only time
-    *     software can program a vertex size with an odd number of 16B units
-    *     is when rendering is disabled.
-    *
-    * Note: B=bytes in the above text.
-    *
-    * It doesn't seem worth the extra trouble to optimize the case where the
-    * vertex size is 16B (especially since this would require special-casing
-    * the GEN assembly that writes to the URB).  So we just set the vertex
-    * size to a multiple of 32B (2 vec4's) in all cases.
-    *
-    * The maximum output vertex size is 62*16 = 992 bytes (31 hwords).  We
-    * budget that as follows:
-    *
-    *   512 bytes for varyings (a varying component is 4 bytes and
-    *             gl_MaxGeometryOutputComponents = 128)
-    *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
-    *             bytes)
-    *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
-    *             even if it's not used)
-    *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
-    *             whenever clip planes are enabled, even if the shader doesn't
-    *             write to gl_ClipDistance)
-    *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
-    *             (see below)--this causes up to 1 VUE slot to be wasted
-    *   400 bytes available for varying packing overhead
-    *
-    * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
-    * per interpolation type, so this is plenty.
-    *
-    */
-   unsigned output_vertex_size_bytes = c.prog_data.base.vue_map.num_slots * 16;
-   assert(brw->gen == 6 ||
-          output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
-   c.prog_data.output_vertex_size_hwords =
-      ALIGN(output_vertex_size_bytes, 32) / 32;
-
-   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
-    * That divides up as follows:
-    *
-    *     64 bytes for the control data header (cut indices or StreamID bits)
-    *   4096 bytes for varyings (a varying component is 4 bytes and
-    *              gl_MaxGeometryTotalOutputComponents = 1024)
-    *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
-    *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
-    *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
-    *              even if it's not used)
-    *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
-    *              whenever clip planes are enabled, even if the shader doesn't
-    *              write to gl_ClipDistance)
-    *   4096 bytes overhead since the VUE size must be a multiple of 32
-    *              bytes (see above)--this causes up to 1 VUE slot to be wasted
-    *   8128 bytes available for varying packing overhead
-    *
-    * Worst-case varying packing overhead is 3/4 of a varying slot per
-    * interpolation type, which works out to 3072 bytes, so this would allow
-    * us to accommodate 2 interpolation types without any danger of running
-    * out of URB space.
-    *
-    * In practice, the risk of running out of URB space is very small, since
-    * the above figures are all worst-case, and most of them scale with the
-    * number of output vertices.  So we'll just calculate the amount of space
-    * we need, and if it's too large, fail to compile.
-    *
-    * The above is for gen7+ where we have a single URB entry that will hold
-    * all the output. In gen6, we will have to allocate URB entries for every
-    * vertex we emit, so our URB entries only need to be large enough to hold
-    * a single vertex. Also, gen6 does not have a control data header.
-    */
-   unsigned output_size_bytes;
-   if (brw->gen >= 7) {
-      output_size_bytes =
-         c.prog_data.output_vertex_size_hwords * 32 * gp->program.VerticesOut;
-      output_size_bytes += 32 * c.prog_data.control_data_header_size_hwords;
-   } else {
-      output_size_bytes = c.prog_data.output_vertex_size_hwords * 32;
-   }
-
-   /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
-    * which comes before the control header.
-    */
-   if (brw->gen >= 8)
-      output_size_bytes += 32;
-
-   assert(output_size_bytes >= 1);
-   int max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES;
-   if (brw->gen == 6)
-      max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES;
-   if (output_size_bytes > max_output_size_bytes)
-      return false;
-
-
-   /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
-    * a multiple of 128 bytes in gen6.
-    */
-   if (brw->gen >= 7)
-      c.prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
-   else
-      c.prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
-
-   c.prog_data.output_topology =
-      get_hw_prim_for_gl_prim(gp->program.OutputType);
-
-   /* The GLSL linker will have already matched up GS inputs and the outputs
-    * of prior stages.  The driver does extend VS outputs in some cases, but
-    * only for legacy OpenGL or Gen4-5 hardware, neither of which offer
-    * geometry shader support.  So we can safely ignore that.
-    *
-    * For SSO pipelines, we use a fixed VUE map layout based on variable
-    * locations, so we can rely on rendezvous-by-location making this work.
-    *
-    * However, we need to ignore VARYING_SLOT_PRIMITIVE_ID, as it's not
-    * written by previous stages and shows up via payload magic.
-    */
-   GLbitfield64 inputs_read =
-      gp->program.Base.InputsRead & ~VARYING_BIT_PRIMITIVE_ID;
-   brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &c.input_vue_map, inputs_read,
-                       prog->SeparateShader);
-
-   /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
-    * need to program a URB read length of ceiling(num_slots / 2).
-    */
-   c.prog_data.base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
-
    if (unlikely(INTEL_DEBUG & DEBUG_GS))
       brw_dump_ir("geometry", prog, gs, NULL);
 
@@ -303,25 +106,25 @@ brw_codegen_gs_prog(struct brw_context *brw,
    unsigned program_size;
    char *error_str;
    const unsigned *program =
-      brw_compile_gs(brw->intelScreen->compiler, brw, &c,
-                     shader->Program->nir, prog,
-                     mem_ctx, st_index, &program_size, &error_str);
+      brw_compile_gs(brw->intelScreen->compiler, brw, mem_ctx, key,
+                     &prog_data, shader->Program->nir, prog,
+                     st_index, &program_size, &error_str);
    if (program == NULL) {
       ralloc_free(mem_ctx);
       return false;
    }
 
    /* Scratch space is used for register spilling */
-   if (c.prog_data.base.base.total_scratch) {
+   if (prog_data.base.base.total_scratch) {
       brw_get_scratch_bo(brw, &stage_state->scratch_bo,
-			 c.prog_data.base.base.total_scratch *
+			 prog_data.base.base.total_scratch *
                          brw->max_gs_threads);
    }
 
    brw_upload_cache(&brw->cache, BRW_CACHE_GS_PROG,
-                    &c.key, sizeof(c.key),
+                    key, sizeof(*key),
                     program, program_size,
-                    &c.prog_data, sizeof(c.prog_data),
+                    &prog_data, sizeof(prog_data),
                     &stage_state->prog_offset, &brw->gs.prog_data);
    ralloc_free(mem_ctx);
 
diff --git a/src/mesa/drivers/dri/i965/brw_inst.h b/src/mesa/drivers/dri/i965/brw_inst.h
index ab37b709d65..4ed95c473cd 100644
--- a/src/mesa/drivers/dri/i965/brw_inst.h
+++ b/src/mesa/drivers/dri/i965/brw_inst.h
@@ -181,7 +181,8 @@ F(saturate,             31,  31)
 F(debug_control,        30,  30)
 F(cmpt_control,         29,  29)
 FC(branch_control,      28,  28, devinfo->gen >= 8)
-F(acc_wr_control,       28,  28)
+FC(acc_wr_control,      28,  28, devinfo->gen >= 6)
+FC(mask_control_ex,     28,  28, devinfo->is_g4x || devinfo->gen == 5)
 F(cond_modifier,        27,  24)
 FC(math_function,       27,  24, devinfo->gen >= 6)
 F(exec_size,            23,  21)
@@ -392,6 +393,7 @@ FF(urb_per_slot_offset,
    /* 4-6: */ -1, -1, -1, -1, -1, -1, -1, -1,
    /* 7:   */ MD(16), MD(16),
    /* 8:   */ MD(17), MD(17))
+FC(urb_channel_mask_present, MD(15), MD(15), devinfo->gen >= 8)
 FC(urb_complete, MD(15), MD(15), devinfo->gen < 8)
 FC(urb_used, MD(14), MD(14), devinfo->gen < 7)
 FC(urb_allocate, MD(13), MD(13), devinfo->gen < 7)
@@ -738,7 +740,7 @@ typedef struct {
  * Bits indices range from 0..63.
  */
 static inline unsigned
-brw_compact_inst_bits(brw_compact_inst *inst, unsigned high, unsigned low)
+brw_compact_inst_bits(const brw_compact_inst *inst, unsigned high, unsigned low)
 {
    const uint64_t mask = (1ull << (high - low + 1)) - 1;
 
@@ -762,56 +764,65 @@ brw_compact_inst_set_bits(brw_compact_inst *inst, unsigned high, unsigned low,
    inst->data = (inst->data & ~mask) | (value << low);
 }
 
-#define F(name, high, low)                                      \
-static inline void                                              \
-brw_compact_inst_set_##name(brw_compact_inst *inst, unsigned v) \
-{                                                               \
-   brw_compact_inst_set_bits(inst, high, low, v);               \
-}                                                               \
-                                                                \
-static inline unsigned                                          \
-brw_compact_inst_##name(brw_compact_inst *inst)                 \
-{                                                               \
-   return brw_compact_inst_bits(inst, high, low);               \
+#define FC(name, high, low, assertions)                            \
+static inline void                                                 \
+brw_compact_inst_set_##name(const struct brw_device_info *devinfo, \
+                            brw_compact_inst *inst, unsigned v)    \
+{                                                                  \
+   assert(assertions);                                             \
+   (void) devinfo;                                                 \
+   brw_compact_inst_set_bits(inst, high, low, v);                  \
+}                                                                  \
+static inline unsigned                                             \
+brw_compact_inst_##name(const struct brw_device_info *devinfo,     \
+                        const brw_compact_inst *inst)              \
+{                                                                  \
+   assert(assertions);                                             \
+   (void) devinfo;                                                 \
+   return brw_compact_inst_bits(inst, high, low);                  \
 }
 
-F(src1_reg_nr,    63, 56)
-F(src0_reg_nr,    55, 48)
-F(dst_reg_nr,     47, 40)
-F(src1_index,     39, 35)
-F(src0_index,     34, 30)
-F(cmpt_control,   29, 29) /* Same location as brw_inst */
-F(flag_subreg_nr, 28, 28) /* <= Gen6 only */
-F(cond_modifier,  27, 24) /* Same location as brw_inst */
-F(acc_wr_control, 23, 23)
-F(subreg_index,   22, 18)
-F(datatype_index, 17, 13)
-F(control_index,  12,  8)
-F(debug_control,   7,  7)
-F(opcode,          6,  0) /* Same location as brw_inst */
+/* A simple macro for fields which stay in the same place on all generations. */
+#define F(name, high, low) FC(name, high, low, true)
+
+F(src1_reg_nr,      63, 56)
+F(src0_reg_nr,      55, 48)
+F(dst_reg_nr,       47, 40)
+F(src1_index,       39, 35)
+F(src0_index,       34, 30)
+F(cmpt_control,     29, 29) /* Same location as brw_inst */
+FC(flag_subreg_nr,  28, 28, devinfo->gen <= 6)
+F(cond_modifier,    27, 24) /* Same location as brw_inst */
+FC(acc_wr_control,  23, 23, devinfo->gen >= 6)
+FC(mask_control_ex, 23, 23, devinfo->is_g4x || devinfo->gen == 5)
+F(subreg_index,     22, 18)
+F(datatype_index,   17, 13)
+F(control_index,    12,  8)
+F(debug_control,     7,  7)
+F(opcode,            6,  0) /* Same location as brw_inst */
 
 /**
  * (Gen8+) Compacted three-source instructions:
  *  @{
  */
-F(3src_src2_reg_nr,    63, 57)
-F(3src_src1_reg_nr,    56, 50)
-F(3src_src0_reg_nr,    49, 43)
-F(3src_src2_subreg_nr, 42, 40)
-F(3src_src1_subreg_nr, 39, 37)
-F(3src_src0_subreg_nr, 36, 34)
-F(3src_src2_rep_ctrl,  33, 33)
-F(3src_src1_rep_ctrl,  32, 32)
-F(3src_saturate,       31, 31)
-F(3src_debug_control,  30, 30)
-F(3src_cmpt_control,   29, 29)
-F(3src_src0_rep_ctrl,  28, 28)
+FC(3src_src2_reg_nr,    63, 57, devinfo->gen >= 8)
+FC(3src_src1_reg_nr,    56, 50, devinfo->gen >= 8)
+FC(3src_src0_reg_nr,    49, 43, devinfo->gen >= 8)
+FC(3src_src2_subreg_nr, 42, 40, devinfo->gen >= 8)
+FC(3src_src1_subreg_nr, 39, 37, devinfo->gen >= 8)
+FC(3src_src0_subreg_nr, 36, 34, devinfo->gen >= 8)
+FC(3src_src2_rep_ctrl,  33, 33, devinfo->gen >= 8)
+FC(3src_src1_rep_ctrl,  32, 32, devinfo->gen >= 8)
+FC(3src_saturate,       31, 31, devinfo->gen >= 8)
+FC(3src_debug_control,  30, 30, devinfo->gen >= 8)
+FC(3src_cmpt_control,   29, 29, devinfo->gen >= 8)
+FC(3src_src0_rep_ctrl,  28, 28, devinfo->gen >= 8)
 /* Reserved */
-F(3src_dst_reg_nr,     18, 12)
-F(3src_source_index,   11, 10)
-F(3src_control_index,   9,  8)
+FC(3src_dst_reg_nr,     18, 12, devinfo->gen >= 8)
+FC(3src_source_index,   11, 10, devinfo->gen >= 8)
+FC(3src_control_index,   9,  8, devinfo->gen >= 8)
 /* Bit 7 is Reserved (for future Opcode expansion) */
-F(3src_opcode,          6,  0)
+FC(3src_opcode,          6,  0, devinfo->gen >= 8)
 /** @} */
 
 #undef F
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 22b0227756e..6433dec9041 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -91,7 +91,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx,
       if (prog) {
          prog->id = get_new_program_id(brw->intelScreen);
 
-         return _mesa_init_gl_program(&prog->program, target, id);
+         return _mesa_init_gl_program(&prog->program.Base, target, id);
       } else {
          return NULL;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 6be2a6e5b55..e48f559afa7 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -79,6 +79,8 @@ is_scalar_shader_stage(const struct brw_compiler *compiler, int stage)
    case MESA_SHADER_FRAGMENT:
    case MESA_SHADER_COMPUTE:
       return true;
+   case MESA_SHADER_GEOMETRY:
+      return compiler->scalar_gs;
    case MESA_SHADER_VERTEX:
       return compiler->scalar_vs;
    default:
@@ -101,6 +103,9 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
    if (devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS))
       compiler->scalar_vs = true;
 
+   if (devinfo->gen >= 8 && brw_env_var_as_boolean("INTEL_SCALAR_GS", false))
+      compiler->scalar_gs = true;
+
    nir_shader_compiler_options *nir_options =
       rzalloc(compiler, nir_shader_compiler_options);
    nir_options->native_integers = true;
@@ -411,6 +416,14 @@ brw_instruction_name(enum opcode op)
       return "gen7_scratch_read";
    case SHADER_OPCODE_URB_WRITE_SIMD8:
       return "gen8_urb_write_simd8";
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+      return "gen8_urb_write_simd8_per_slot";
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+      return "gen8_urb_write_simd8_masked";
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+      return "gen8_urb_write_simd8_masked_per_slot";
+   case SHADER_OPCODE_URB_READ_SIMD8:
+      return "urb_read_simd8";
 
    case SHADER_OPCODE_FIND_LIVE_CHANNEL:
       return "find_live_channel";
@@ -964,6 +977,9 @@ backend_instruction::has_side_effects() const
    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
    case SHADER_OPCODE_MEMORY_FENCE:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    case FS_OPCODE_FB_WRITE:
    case SHADER_OPCODE_BARRIER:
       return true;
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 2e47690d403..8899b30c1ae 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -233,6 +233,18 @@ bool opt_predicated_break(struct backend_shader *s);
 extern "C" {
 #endif
 
+/**
+ * Scratch data used when compiling a GLSL geometry shader.
+ */
+struct brw_gs_compile
+{
+   struct brw_gs_prog_key key;
+   struct brw_vue_map input_vue_map;
+
+   unsigned control_data_bits_per_vertex;
+   unsigned control_data_header_size_bits;
+};
+
 void
 brw_assign_common_binding_table_offsets(gl_shader_stage stage,
                                         const struct brw_device_info *devinfo,
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 2955c8dcc2e..a2948293a62 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -40,36 +40,32 @@
 #define FILE_DEBUG_FLAG DEBUG_MIPTREE
 
 static unsigned int
-tr_mode_horizontal_texture_alignment(const struct brw_context *brw,
-                                     const struct intel_mipmap_tree *mt)
+tr_mode_horizontal_texture_alignment(const struct intel_mipmap_tree *mt)
 {
-   const unsigned *align_yf, *align_ys;
-   const unsigned bpp = _mesa_get_format_bytes(mt->format) * 8;
-   unsigned ret_align, divisor;
+   unsigned ret_align, divisor, multiplier_ys;
 
-   /* Horizontal alignment tables for TRMODE_{YF,YS}. Value in below
-    * tables specifies the horizontal alignment requirement in elements
-    * for the surface. An element is defined as a pixel in uncompressed
-    * surface formats, and as a compression block in compressed surface
-    * formats. For MSFMT_DEPTH_STENCIL type multisampled surfaces, an
+   /* Values in the tables below specify the horizontal alignment requirement
+    * in elements for a TRMODE_YF surface. An element is defined as a pixel in
+    * uncompressed surface formats, and as a compression block in compressed
+    * surface formats. For MSFMT_DEPTH_STENCIL type multisampled surfaces, an
     * element is a sample.
     */
    const unsigned align_1d_yf[] = {4096, 2048, 1024, 512, 256};
-   const unsigned align_1d_ys[] = {65536, 32768, 16384, 8192, 4096};
    const unsigned align_2d_yf[] = {64, 64, 32, 32, 16};
-   const unsigned align_2d_ys[] = {256, 256, 128, 128, 64};
    const unsigned align_3d_yf[] = {16, 8, 8, 8, 4};
-   const unsigned align_3d_ys[] = {64, 32, 32, 32, 16};
-   int i = 0;
 
-   /* Alignment computations below assume bpp >= 8 and a power of 2. */
-   assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp));
+   assert(mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE);
+
+   /* Alignment computations below assume a power of 2 cpp. */
+   assert (mt->cpp >= 1 && mt->cpp <= 16 && _mesa_is_pow_two(mt->cpp));
+   /* Compute array index. */
+   const int i = ffs(mt->cpp) - 1;
 
    switch(mt->target) {
    case GL_TEXTURE_1D:
    case GL_TEXTURE_1D_ARRAY:
-      align_yf = align_1d_yf;
-      align_ys = align_1d_ys;
+      ret_align = align_1d_yf[i];
+      multiplier_ys = 16;
       break;
    case GL_TEXTURE_2D:
    case GL_TEXTURE_RECTANGLE:
@@ -78,22 +74,19 @@ tr_mode_horizontal_texture_alignment(const struct brw_context *brw,
    case GL_TEXTURE_CUBE_MAP_ARRAY:
    case GL_TEXTURE_2D_MULTISAMPLE:
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
-      align_yf = align_2d_yf;
-      align_ys = align_2d_ys;
+      ret_align = align_2d_yf[i];
+      multiplier_ys = 4;
       break;
    case GL_TEXTURE_3D:
-      align_yf = align_3d_yf;
-      align_ys = align_3d_ys;
+      ret_align = align_3d_yf[i];
+      multiplier_ys = 4;
       break;
    default:
       unreachable("not reached");
    }
 
-   /* Compute array index. */
-   i = ffs(bpp/8) - 1;
-
-   ret_align = mt->tr_mode == INTEL_MIPTREE_TRMODE_YF ?
-               align_yf[i] : align_ys[i];
+   if (mt->tr_mode == INTEL_MIPTREE_TRMODE_YS)
+      ret_align *= multiplier_ys;
 
    assert(_mesa_is_pow_two(mt->num_samples));
 
@@ -148,26 +141,20 @@ intel_horizontal_texture_alignment_unit(struct brw_context *brw,
 }
 
 static unsigned int
-tr_mode_vertical_texture_alignment(const struct brw_context *brw,
-                                   const struct intel_mipmap_tree *mt)
+tr_mode_vertical_texture_alignment(const struct intel_mipmap_tree *mt)
 {
-   const unsigned *align_yf, *align_ys;
-   const unsigned bpp = _mesa_get_format_bytes(mt->format) * 8;
-   unsigned ret_align, divisor;
+   unsigned ret_align, divisor, multiplier_ys;
 
-   /* Vertical alignment tables for TRMODE_YF and TRMODE_YS. */
+   /* Vertical alignment tables for TRMODE_YF */
    const unsigned align_2d_yf[] = {64, 32, 32, 16, 16};
-   const unsigned align_2d_ys[] = {256, 128, 128, 64, 64};
    const unsigned align_3d_yf[] = {16, 16, 16, 8, 8};
-   const unsigned align_3d_ys[] = {32, 32, 32, 16, 16};
-   int i = 0;
 
-   assert(brw->gen >= 9 &&
-          mt->target != GL_TEXTURE_1D &&
-          mt->target != GL_TEXTURE_1D_ARRAY);
+   assert(mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE);
 
-   /* Alignment computations below assume bpp >= 8 and a power of 2. */
-   assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp)) ;
+   /* Alignment computations below assume a power of 2 cpp. */
+   assert (mt->cpp >= 1 && mt->cpp <= 16 && _mesa_is_pow_two(mt->cpp)) ;
+   /* Compute array index. */
+   const int i = ffs(mt->cpp) - 1;
 
    switch(mt->target) {
    case GL_TEXTURE_2D:
@@ -177,22 +164,21 @@ tr_mode_vertical_texture_alignment(const struct brw_context *brw,
    case GL_TEXTURE_CUBE_MAP_ARRAY:
    case GL_TEXTURE_2D_MULTISAMPLE:
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
-      align_yf = align_2d_yf;
-      align_ys = align_2d_ys;
+      ret_align = align_2d_yf[i];
+      multiplier_ys = 4;
       break;
    case GL_TEXTURE_3D:
-      align_yf = align_3d_yf;
-      align_ys = align_3d_ys;
+      ret_align = align_3d_yf[i];
+      multiplier_ys = 2;
       break;
+   case GL_TEXTURE_1D:
+   case GL_TEXTURE_1D_ARRAY:
    default:
-      unreachable("not reached");
+      unreachable("Unexpected miptree target");
    }
 
-   /* Compute array index. */
-   i = ffs(bpp / 8) - 1;
-
-   ret_align = mt->tr_mode == INTEL_MIPTREE_TRMODE_YF ?
-               align_yf[i] : align_ys[i];
+   if (mt->tr_mode == INTEL_MIPTREE_TRMODE_YS)
+      ret_align *= multiplier_ys;
 
    assert(_mesa_is_pow_two(mt->num_samples));
 
@@ -779,8 +765,8 @@ intel_miptree_set_alignment(struct brw_context *brw,
    } else if (brw->gen >= 9 && mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) {
       /* XY_FAST_COPY_BLT doesn't support horizontal alignment < 32 or
        * vertical alignment < 64. */
-      mt->halign = MAX2(tr_mode_horizontal_texture_alignment(brw, mt), 32);
-      mt->valign = MAX2(tr_mode_vertical_texture_alignment(brw, mt), 64);
+      mt->halign = MAX2(tr_mode_horizontal_texture_alignment(mt), 32);
+      mt->valign = MAX2(tr_mode_vertical_texture_alignment(mt), 64);
    } else {
       mt->halign =
          intel_horizontal_texture_alignment_unit(brw, mt, layout_flags);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index befc92445d3..3e7078d0b32 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1111,7 +1111,7 @@ vec4_visitor::opt_register_coalesce()
        */
       vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
       foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
-                                                  inst, block) {
+                                                  inst) {
          _scan_inst = scan_inst;
 
          if (inst->src[0].in_range(scan_inst->dst, scan_inst->regs_written)) {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
index 1b929b3df2c..6bc39473137 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
@@ -104,7 +104,7 @@ vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       break;
 
    case nir_intrinsic_load_primitive_id:
-      assert(c->prog_data.include_primitive_id);
+      assert(gs_prog_data->include_primitive_id);
       dest = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
       emit(MOV(dest, retype(brw_vec4_grf(1, 0), BRW_REGISTER_TYPE_D)));
       break;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index a715cf5a6cb..9402489e628 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -35,14 +35,16 @@ namespace brw {
 vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler,
                                  void *log_data,
                                  struct brw_gs_compile *c,
+                                 struct brw_gs_prog_data *prog_data,
                                  const nir_shader *shader,
                                  void *mem_ctx,
                                  bool no_spills,
                                  int shader_time_index)
    : vec4_visitor(compiler, log_data, &c->key.tex,
-                  &c->prog_data.base, shader,  mem_ctx,
+                  &prog_data->base, shader,  mem_ctx,
                   no_spills, shader_time_index),
-     c(c)
+     c(c),
+     gs_prog_data(prog_data)
 {
 }
 
@@ -78,9 +80,9 @@ vec4_gs_visitor::setup_varying_inputs(int payload_reg, int *attribute_map,
     * so the total number of input slots that will be delivered to the GS (and
     * thus the stride of the input arrays) is urb_read_length * 2.
     */
-   const unsigned num_input_vertices = c->gp->program.VerticesIn;
+   const unsigned num_input_vertices = nir->info.gs.vertices_in;
    assert(num_input_vertices <= MAX_GS_INPUT_VERTICES);
-   unsigned input_array_stride = c->prog_data.base.urb_read_length * 2;
+   unsigned input_array_stride = prog_data->urb_read_length * 2;
 
    for (int slot = 0; slot < c->input_vue_map.num_slots; slot++) {
       int varying = c->input_vue_map.slot_to_varying[slot];
@@ -106,7 +108,7 @@ vec4_gs_visitor::setup_payload()
     * to be interleaved, so one register contains two attribute slots.
     */
    int attributes_per_reg =
-      c->prog_data.base.dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
+      prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
 
    /* If a geometry shader tries to read from an input that wasn't written by
     * the vertex shader, that produces undefined results, but it shouldn't
@@ -124,7 +126,7 @@ vec4_gs_visitor::setup_payload()
    reg++;
 
    /* If the shader uses gl_PrimitiveIDIn, that goes in r1. */
-   if (c->prog_data.include_primitive_id)
+   if (gs_prog_data->include_primitive_id)
       attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg++;
 
    reg = setup_uniforms(reg);
@@ -182,9 +184,9 @@ vec4_gs_visitor::emit_prolog()
     * to account for the fact that the vertex shader stored it in the w
     * component of VARYING_SLOT_PSIZ.
     */
-   if (c->gp->program.Base.InputsRead & VARYING_BIT_PSIZ) {
+   if (nir->info.inputs_read & VARYING_BIT_PSIZ) {
       this->current_annotation = "swizzle gl_PointSize input";
-      for (int vertex = 0; vertex < c->gp->program.VerticesIn; vertex++) {
+      for (int vertex = 0; vertex < (int)nir->info.gs.vertices_in; vertex++) {
          dst_reg dst(ATTR,
                      BRW_VARYING_SLOT_COUNT * vertex + VARYING_SLOT_PSIZ);
          dst.type = BRW_REGISTER_TYPE_F;
@@ -222,7 +224,7 @@ vec4_gs_visitor::emit_thread_end()
     */
    int base_mrf = 1;
 
-   bool static_vertex_count = c->prog_data.static_vertex_count != -1;
+   bool static_vertex_count = gs_prog_data->static_vertex_count != -1;
 
    /* If the previous instruction was a URB write, we don't need to issue
     * a second one - we can just set the EOT bit on the previous write.
@@ -271,7 +273,7 @@ vec4_gs_visitor::emit_urb_write_header(int mrf)
    vec4_instruction *inst = emit(MOV(mrf_reg, r0));
    inst->force_writemask_all = true;
    emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count,
-        (uint32_t) c->prog_data.output_vertex_size_hwords);
+        (uint32_t) gs_prog_data->output_vertex_size_hwords);
 }
 
 
@@ -285,12 +287,12 @@ vec4_gs_visitor::emit_urb_write_opcode(bool complete)
    (void) complete;
 
    vec4_instruction *inst = emit(GS_OPCODE_URB_WRITE);
-   inst->offset = c->prog_data.control_data_header_size_hwords;
+   inst->offset = gs_prog_data->control_data_header_size_hwords;
 
    /* We need to increment Global Offset by 1 to make room for Broadwell's
     * extra "Vertex Count" payload at the beginning of the URB entry.
     */
-   if (devinfo->gen >= 8 && c->prog_data.static_vertex_count == -1)
+   if (devinfo->gen >= 8 && gs_prog_data->static_vertex_count == -1)
       inst->offset++;
 
    inst->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
@@ -409,7 +411,7 @@ vec4_gs_visitor::emit_control_data_bits()
     * URB entry.  Since this is an OWord message, Global Offset is counted
     * in 128-bit units, so we must set it to 2.
     */
-   if (devinfo->gen >= 8 && c->prog_data.static_vertex_count == -1)
+   if (devinfo->gen >= 8 && gs_prog_data->static_vertex_count == -1)
       inst->offset = 2;
    inst->base_mrf = base_mrf;
    inst->mlen = 2;
@@ -536,7 +538,7 @@ vec4_gs_visitor::gs_emit_vertex(int stream_id)
     * do for GL_POINTS outputs that don't use streams).
     */
    if (c->control_data_header_size_bits > 0 &&
-       c->prog_data.control_data_format ==
+       gs_prog_data->control_data_format ==
           GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
        this->current_annotation = "emit vertex: Stream control data bits";
        set_stream_control_data_bits(stream_id);
@@ -552,7 +554,7 @@ vec4_gs_visitor::gs_end_primitive()
     * consists of cut bits.  Fortunately, the only time it isn't is when the
     * output type is points, in which case EndPrimitive() is a no-op.
     */
-   if (c->prog_data.control_data_format !=
+   if (gs_prog_data->control_data_format !=
        GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
       return;
    }
@@ -598,27 +600,231 @@ vec4_gs_visitor::gs_end_primitive()
 
 extern "C" const unsigned *
 brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
-               struct brw_gs_compile *c,
+               void *mem_ctx,
+               const struct brw_gs_prog_key *key,
+               struct brw_gs_prog_data *prog_data,
                const nir_shader *shader,
                struct gl_shader_program *shader_prog,
-               void *mem_ctx,
                int shader_time_index,
                unsigned *final_assembly_size,
                char **error_str)
 {
+   struct brw_gs_compile c;
+   memset(&c, 0, sizeof(c));
+   c.key = *key;
+
+   prog_data->include_primitive_id =
+      (shader->info.inputs_read & VARYING_BIT_PRIMITIVE_ID) != 0;
+
+   prog_data->invocations = shader->info.gs.invocations;
+
+   if (compiler->devinfo->gen >= 8)
+      prog_data->static_vertex_count = nir_gs_count_vertices(shader);
+
+   if (compiler->devinfo->gen >= 7) {
+      if (shader->info.gs.output_primitive == GL_POINTS) {
+         /* When the output type is points, the geometry shader may output data
+          * to multiple streams, and EndPrimitive() has no effect.  So we
+          * configure the hardware to interpret the control data as stream ID.
+          */
+         prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
+
+         /* We only have to emit control bits if we are using streams */
+         if (shader_prog && shader_prog->Geom.UsesStreams)
+            c.control_data_bits_per_vertex = 2;
+         else
+            c.control_data_bits_per_vertex = 0;
+      } else {
+         /* When the output type is triangle_strip or line_strip, EndPrimitive()
+          * may be used to terminate the current strip and start a new one
+          * (similar to primitive restart), and outputting data to multiple
+          * streams is not supported.  So we configure the hardware to interpret
+          * the control data as EndPrimitive information (a.k.a. "cut bits").
+          */
+         prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
+
+         /* We only need to output control data if the shader actually calls
+          * EndPrimitive().
+          */
+         c.control_data_bits_per_vertex =
+            shader->info.gs.uses_end_primitive ? 1 : 0;
+      }
+   } else {
+      /* There are no control data bits in gen6. */
+      c.control_data_bits_per_vertex = 0;
+
+      /* Transform feedback is enabled exactly when the shader has
+       * transform feedback varyings.
+       */
+      prog_data->gen6_xfb_enabled =
+         shader->info.has_transform_feedback_varyings;
+   }
+   c.control_data_header_size_bits =
+      shader->info.gs.vertices_out * c.control_data_bits_per_vertex;
+
+   /* 1 HWORD = 32 bytes = 256 bits */
+   prog_data->control_data_header_size_hwords =
+      ALIGN(c.control_data_header_size_bits, 256) / 256;
+
+   /* Compute the output vertex size.
+    *
+    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
+    * Size (p168):
+    *
+    *     [0,62] indicating [1,63] 16B units
+    *
+    *     Specifies the size of each vertex stored in the GS output entry
+    *     (following any Control Header data) as a number of 128-bit units
+    *     (minus one).
+    *
+    *     Programming Restrictions: The vertex size must be programmed as a
+    *     multiple of 32B units with the following exception: Rendering is
+    *     disabled (as per SOL stage state) and the vertex size output by the
+    *     GS thread is 16B.
+    *
+    *     If rendering is enabled (as per SOL state) the vertex size must be
+    *     programmed as a multiple of 32B units. In other words, the only time
+    *     software can program a vertex size with an odd number of 16B units
+    *     is when rendering is disabled.
+    *
+    * Note: B=bytes in the above text.
+    *
+    * It doesn't seem worth the extra trouble to optimize the case where the
+    * vertex size is 16B (especially since this would require special-casing
+    * the GEN assembly that writes to the URB).  So we just set the vertex
+    * size to a multiple of 32B (2 vec4's) in all cases.
+    *
+    * The maximum output vertex size is 62*16 = 992 bytes (31 hwords).  We
+    * budget that as follows:
+    *
+    *   512 bytes for varyings (a varying component is 4 bytes and
+    *             gl_MaxGeometryOutputComponents = 128)
+    *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
+    *             bytes)
+    *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
+    *             even if it's not used)
+    *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
+    *             whenever clip planes are enabled, even if the shader doesn't
+    *             write to gl_ClipDistance)
+    *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
+    *             (see below)--this causes up to 1 VUE slot to be wasted
+    *   400 bytes available for varying packing overhead
+    *
+    * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
+    * per interpolation type, so this is plenty.
+    *
+    */
+   unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16;
+   assert(compiler->devinfo->gen == 6 ||
+          output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
+   prog_data->output_vertex_size_hwords =
+      ALIGN(output_vertex_size_bytes, 32) / 32;
+
+   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
+    * That divides up as follows:
+    *
+    *     64 bytes for the control data header (cut indices or StreamID bits)
+    *   4096 bytes for varyings (a varying component is 4 bytes and
+    *              gl_MaxGeometryTotalOutputComponents = 1024)
+    *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
+    *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
+    *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
+    *              even if it's not used)
+    *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
+    *              whenever clip planes are enabled, even if the shader doesn't
+    *              write to gl_ClipDistance)
+    *   4096 bytes overhead since the VUE size must be a multiple of 32
+    *              bytes (see above)--this causes up to 1 VUE slot to be wasted
+    *   8128 bytes available for varying packing overhead
+    *
+    * Worst-case varying packing overhead is 3/4 of a varying slot per
+    * interpolation type, which works out to 3072 bytes, so this would allow
+    * us to accommodate 2 interpolation types without any danger of running
+    * out of URB space.
+    *
+    * In practice, the risk of running out of URB space is very small, since
+    * the above figures are all worst-case, and most of them scale with the
+    * number of output vertices.  So we'll just calculate the amount of space
+    * we need, and if it's too large, fail to compile.
+    *
+    * The above is for gen7+ where we have a single URB entry that will hold
+    * all the output. In gen6, we will have to allocate URB entries for every
+    * vertex we emit, so our URB entries only need to be large enough to hold
+    * a single vertex. Also, gen6 does not have a control data header.
+    */
+   unsigned output_size_bytes;
+   if (compiler->devinfo->gen >= 7) {
+      output_size_bytes =
+         prog_data->output_vertex_size_hwords * 32 * shader->info.gs.vertices_out;
+      output_size_bytes += 32 * prog_data->control_data_header_size_hwords;
+   } else {
+      output_size_bytes = prog_data->output_vertex_size_hwords * 32;
+   }
+
+   /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
+    * which comes before the control header.
+    */
+   if (compiler->devinfo->gen >= 8)
+      output_size_bytes += 32;
+
+   assert(output_size_bytes >= 1);
+   unsigned max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES;
+   if (compiler->devinfo->gen == 6)
+      max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES;
+   /* Output does not fit in a URB entry: fail the compile (no assembly). */
+   if (output_size_bytes > max_output_size_bytes)
+      return NULL;
+
+   /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
+    * a multiple of 128 bytes in gen6.
+    */
+   if (compiler->devinfo->gen >= 7)
+      prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+   else
+      prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
+
+   prog_data->output_topology =
+      get_hw_prim_for_gl_prim(shader->info.gs.output_primitive);
+
+   /* The GLSL linker will have already matched up GS inputs and the outputs
+    * of prior stages.  The driver does extend VS outputs in some cases, but
+    * only for legacy OpenGL or Gen4-5 hardware, neither of which offer
+    * geometry shader support.  So we can safely ignore that.
+    *
+    * For SSO pipelines, we use a fixed VUE map layout based on variable
+    * locations, so we can rely on rendezvous-by-location making this work.
+    *
+    * However, we need to ignore VARYING_SLOT_PRIMITIVE_ID, as it's not
+    * written by previous stages and shows up via payload magic.
+    */
+   GLbitfield64 inputs_read =
+      shader->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID;
+   brw_compute_vue_map(compiler->devinfo,
+                       &c.input_vue_map, inputs_read,
+                       shader->info.separate_shader);
+
+   /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
+    * need to program a URB read length of ceiling(num_slots / 2).
+    */
+   prog_data->base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
+
+   /* Now that prog_data setup is done, we are ready to actually compile the
+    * program.
+    */
+
    if (compiler->devinfo->gen >= 7) {
       /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
        * so without spilling. If the GS invocations count > 1, then we can't use
        * dual object mode.
        */
-      if (c->prog_data.invocations <= 1 &&
+      if (prog_data->invocations <= 1 &&
           likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
-         c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
+         prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
 
-         vec4_gs_visitor v(compiler, log_data, c, shader,
+         vec4_gs_visitor v(compiler, log_data, &c, prog_data, shader,
                            mem_ctx, true /* no_spills */, shader_time_index);
          if (v.run()) {
-            vec4_generator g(compiler, log_data, &c->prog_data.base, mem_ctx,
+            vec4_generator g(compiler, log_data, &prog_data->base, mem_ctx,
                              INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
             return g.generate_assembly(v.cfg, final_assembly_size, shader);
          }
@@ -648,28 +854,28 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
     * mode is more performant when invocations > 1. Gen6 only supports
     * SINGLE mode.
     */
-   if (c->prog_data.invocations <= 1 || compiler->devinfo->gen < 7)
-      c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE;
+   if (prog_data->invocations <= 1 || compiler->devinfo->gen < 7)
+      prog_data->base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE;
    else
-      c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE;
+      prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE;
 
    vec4_gs_visitor *gs = NULL;
    const unsigned *ret = NULL;
 
    if (compiler->devinfo->gen >= 7)
-      gs = new vec4_gs_visitor(compiler, log_data, c, shader,
-                               mem_ctx, false /* no_spills */,
+      gs = new vec4_gs_visitor(compiler, log_data, &c, prog_data,
+                               shader, mem_ctx, false /* no_spills */,
                                shader_time_index);
    else
-      gs = new gen6_gs_visitor(compiler, log_data, c, shader_prog, shader,
-                               mem_ctx, false /* no_spills */,
+      gs = new gen6_gs_visitor(compiler, log_data, &c, prog_data, shader_prog,
+                               shader, mem_ctx, false /* no_spills */,
                                shader_time_index);
 
    if (!gs->run()) {
       if (error_str)
          *error_str = ralloc_strdup(mem_ctx, gs->fail_msg);
    } else {
-      vec4_generator g(compiler, log_data, &c->prog_data.base, mem_ctx,
+      vec4_generator g(compiler, log_data, &prog_data->base, mem_ctx,
                        INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
       ret = g.generate_assembly(gs->cfg, final_assembly_size, shader);
    }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
index c52552768c8..6ca83a9d9a3 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
@@ -41,6 +41,7 @@ public:
    vec4_gs_visitor(const struct brw_compiler *compiler,
                    void *log_data,
                    struct brw_gs_compile *c,
+                   struct brw_gs_prog_data *prog_data,
                    const nir_shader *shader,
                    void *mem_ctx,
                    bool no_spills,
@@ -70,6 +71,7 @@ protected:
    src_reg vertex_count;
    src_reg control_data_bits;
    const struct brw_gs_compile * const c;
+   struct brw_gs_prog_data * const gs_prog_data;
 };
 
 } /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 5be9c6a6b2d..6d155285820 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1222,6 +1222,9 @@ vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
 void
 vec4_visitor::emit_ndc_computation()
 {
+   if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
+      return;
+
    /* Get the position */
    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
 
@@ -1287,7 +1290,8 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg)
        * Later, clipping will detect ucp[6] and ensure the primitive is
        * clipped against all fixed planes.
        */
-      if (devinfo->has_negative_rhw_bug) {
+      if (devinfo->has_negative_rhw_bug &&
+          output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
@@ -1335,8 +1339,10 @@ vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
    assert(varying < VARYING_SLOT_MAX);
    assert(output_reg[varying].type == reg.type);
    current_annotation = output_reg_annotation[varying];
-   /* Copy the register, saturating if necessary */
-   return emit(MOV(reg, src_reg(output_reg[varying])));
+   if (output_reg[varying].file != BAD_FILE)
+      return emit(MOV(reg, src_reg(output_reg[varying])));
+   else
+      return NULL;
 }
 
 void
@@ -1355,11 +1361,13 @@ vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
    }
    case BRW_VARYING_SLOT_NDC:
       current_annotation = "NDC";
-      emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
+      if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
+         emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
       break;
    case VARYING_SLOT_POS:
       current_annotation = "gl_Position";
-      emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
+      if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
+         emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
       break;
    case VARYING_SLOT_EDGE:
       /* This is present when doing unfilled polygons.  We're supposed to copy
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
index 485a80ee2fc..5dd4f98cecc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
@@ -217,7 +217,7 @@ vec4_vs_visitor::emit_urb_slot(dst_reg reg, int varying)
        * shader.
        */
       vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
-      if (key->clamp_vertex_color)
+      if (inst && key->clamp_vertex_color)
          inst->saturate = true;
       break;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index ba680a98f7e..5db4b3a86af 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -312,7 +312,7 @@ brw_vs_populate_key(struct brw_context *brw,
 
    if (ctx->Transform.ClipPlanesEnabled != 0 &&
        ctx->API == API_OPENGL_COMPAT &&
-       !vp->program.Base.UsesClipDistanceOut) {
+       vp->program.Base.ClipDistanceArraySize == 0) {
       key->nr_userclip_plane_consts =
          _mesa_logbase2(ctx->Transform.ClipPlanesEnabled) + 1;
    }
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
index 671a535a5bd..2fef188c17e 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
@@ -63,7 +63,7 @@ gen6_gs_visitor::emit_prolog()
    this->vertex_output = src_reg(this,
                                  glsl_type::uint_type,
                                  (prog_data->vue_map.num_slots + 1) *
-                                 c->gp->program.VerticesOut);
+                                 nir->info.gs.vertices_out);
    this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
    emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));
 
@@ -95,7 +95,7 @@ gen6_gs_visitor::emit_prolog()
    this->prim_count = src_reg(this, glsl_type::uint_type);
    emit(MOV(dst_reg(this->prim_count), 0u));
 
-   if (c->prog_data.gen6_xfb_enabled) {
+   if (gs_prog_data->gen6_xfb_enabled) {
       /* Create a virtual register to hold destination indices in SOL */
       this->destination_indices = src_reg(this, glsl_type::uvec4_type);
       /* Create a virtual register to hold number of written primitives */
@@ -128,7 +128,7 @@ gen6_gs_visitor::emit_prolog()
     * in the 3DSTATE_GS state packet. That information can be obtained by other
     * means though, so we can safely use r1 for this purpose.
     */
-   if (c->prog_data.include_primitive_id) {
+   if (gs_prog_data->include_primitive_id) {
       this->primitive_id =
          src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
       emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
@@ -177,7 +177,7 @@ gen6_gs_visitor::gs_emit_vertex(int stream_id)
    dst_reg dst(this->vertex_output);
    dst.reladdr = ralloc(mem_ctx, src_reg);
    memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
-   if (c->gp->program.OutputType == GL_POINTS) {
+   if (nir->info.gs.output_primitive == GL_POINTS) {
       /* If we are outputting points, then every vertex has PrimStart and
        * PrimEnd set.
        */
@@ -191,7 +191,7 @@ gen6_gs_visitor::gs_emit_vertex(int stream_id)
        * vertex.
        */
       emit(OR(dst, this->first_vertex,
-              (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
+              (gs_prog_data->output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
       emit(MOV(dst_reg(this->first_vertex), 0u));
    }
    emit(ADD(dst_reg(this->vertex_output_offset),
@@ -205,7 +205,7 @@ gen6_gs_visitor::gs_end_primitive()
    /* Calling EndPrimitive() is optional for point output. In this case we set
     * the PrimEnd flag when we process EmitVertex().
     */
-   if (c->gp->program.OutputType == GL_POINTS)
+   if (nir->info.gs.output_primitive == GL_POINTS)
       return;
 
    /* Otherwise we know that the last vertex we have processed was the last
@@ -217,7 +217,7 @@ gen6_gs_visitor::gs_end_primitive()
     * comparison below (hence the num_output_vertices + 1 in the comparison
     * below).
     */
-   unsigned num_output_vertices = c->gp->program.VerticesOut;
+   unsigned num_output_vertices = nir->info.gs.vertices_out;
    emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
             BRW_CONDITIONAL_L));
    vec4_instruction *inst = emit(CMP(dst_null_d(),
@@ -320,7 +320,7 @@ gen6_gs_visitor::emit_thread_end()
     * first_vertex is not zero. This is only relevant for outputs other than
     * points because in the point case we set PrimEnd on all vertices.
     */
-   if (c->gp->program.OutputType != GL_POINTS) {
+   if (nir->info.gs.output_primitive != GL_POINTS) {
       emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
       emit(IF(BRW_PREDICATE_NORMAL));
       gs_end_primitive();
@@ -353,7 +353,7 @@ gen6_gs_visitor::emit_thread_end()
       this->current_annotation = "gen6 thread end: ff_sync";
 
       vec4_instruction *inst;
-      if (c->prog_data.gen6_xfb_enabled) {
+      if (gs_prog_data->gen6_xfb_enabled) {
          src_reg sol_temp(this, glsl_type::uvec4_type);
          emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
               dst_reg(this->svbi),
@@ -443,7 +443,7 @@ gen6_gs_visitor::emit_thread_end()
       }
       emit(BRW_OPCODE_WHILE);
 
-      if (c->prog_data.gen6_xfb_enabled)
+      if (gs_prog_data->gen6_xfb_enabled)
          xfb_write();
    }
    emit(BRW_OPCODE_ENDIF);
@@ -465,7 +465,7 @@ gen6_gs_visitor::emit_thread_end()
     */
    this->current_annotation = "gen6 thread end: EOT";
 
-   if (c->prog_data.gen6_xfb_enabled) {
+   if (gs_prog_data->gen6_xfb_enabled) {
       /* When emitting EOT, set SONumPrimsWritten Increment Value. */
       src_reg data(this, glsl_type::uint_type);
       emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
@@ -507,7 +507,7 @@ gen6_gs_visitor::setup_payload()
     * information (and move the original value to a virtual register if
     * necessary).
     */
-   if (c->prog_data.include_primitive_id)
+   if (gs_prog_data->include_primitive_id)
       attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
    reg++;
 
@@ -530,9 +530,6 @@ gen6_gs_visitor::xfb_setup()
       BRW_SWIZZLE4(3, 3, 3, 3)
    };
 
-   struct brw_gs_prog_data *prog_data =
-      (struct brw_gs_prog_data *) &c->prog_data;
-
    const struct gl_transform_feedback_info *linked_xfb_info =
       &this->shader_prog->LinkedTransformFeedback;
    int i;
@@ -548,11 +545,11 @@ gen6_gs_visitor::xfb_setup()
     */
    assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
 
-   prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
-   for (i = 0; i < prog_data->num_transform_feedback_bindings; i++) {
-      prog_data->transform_feedback_bindings[i] =
+   gs_prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
+   for (i = 0; i < gs_prog_data->num_transform_feedback_bindings; i++) {
+      gs_prog_data->transform_feedback_bindings[i] =
          linked_xfb_info->Outputs[i].OutputRegister;
-      prog_data->transform_feedback_swizzles[i] =
+      gs_prog_data->transform_feedback_swizzles[i] =
          swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
    }
 }
@@ -561,13 +558,11 @@ void
 gen6_gs_visitor::xfb_write()
 {
    unsigned num_verts;
-   struct brw_gs_prog_data *prog_data =
-      (struct brw_gs_prog_data *) &c->prog_data;
 
-   if (!prog_data->num_transform_feedback_bindings)
+   if (!gs_prog_data->num_transform_feedback_bindings)
       return;
 
-   switch (c->prog_data.output_topology) {
+   switch (gs_prog_data->output_topology) {
    case _3DPRIM_POINTLIST:
       num_verts = 1;
       break;
@@ -627,7 +622,7 @@ gen6_gs_visitor::xfb_write()
    emit(BRW_OPCODE_ENDIF);
 
    /* Write transform feedback data for all processed vertices. */
-   for (int i = 0; i < c->gp->program.VerticesOut; i++) {
+   for (int i = 0; i < (int)nir->info.gs.vertices_out; i++) {
       emit(MOV(dst_reg(sol_temp), i));
       emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
                BRW_CONDITIONAL_L));
@@ -642,10 +637,8 @@ gen6_gs_visitor::xfb_write()
 void
 gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
 {
-   struct brw_gs_prog_data *prog_data =
-      (struct brw_gs_prog_data *) &c->prog_data;
    unsigned binding;
-   unsigned num_bindings = prog_data->num_transform_feedback_bindings;
+   unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings;
    src_reg sol_temp(this, glsl_type::uvec4_type);
 
    /* Check for buffer overflow: we need room to write the complete primitive
@@ -666,7 +659,7 @@ gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
        */
       for (binding = 0; binding < num_bindings; ++binding) {
          unsigned char varying =
-            prog_data->transform_feedback_bindings[binding];
+            gs_prog_data->transform_feedback_bindings[binding];
 
          /* Set up the correct destination index for this vertex */
          vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
@@ -704,7 +697,7 @@ gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
          else if (varying == VARYING_SLOT_VIEWPORT)
             data.swizzle = BRW_SWIZZLE_ZZZZ;
          else
-            data.swizzle = prog_data->transform_feedback_swizzles[binding];
+            data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];
 
          /* Write data */
          inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
index d02c67d8a74..311cf06833c 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
@@ -38,12 +38,13 @@ public:
    gen6_gs_visitor(const struct brw_compiler *comp,
                    void *log_data,
                    struct brw_gs_compile *c,
+                   struct brw_gs_prog_data *prog_data,
                    struct gl_shader_program *prog,
                    const nir_shader *shader,
                    void *mem_ctx,
                    bool no_spills,
                    int shader_time_index) :
-      vec4_gs_visitor(comp, log_data, c, shader, mem_ctx, no_spills,
+      vec4_gs_visitor(comp, log_data, c, prog_data, shader, mem_ctx, no_spills,
                       shader_time_index),
       shader_prog(prog)
       {
diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c
index d766ca7bebf..6738e85eaba 100644
--- a/src/mesa/drivers/dri/i965/gen8_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c
@@ -68,6 +68,8 @@ gen8_upload_gs_state(struct brw_context *brw)
                  GEN7_GS_OUTPUT_VERTEX_SIZE_SHIFT) |
                 (brw->gs.prog_data->output_topology <<
                  GEN7_GS_OUTPUT_TOPOLOGY_SHIFT) |
+                (prog_data->include_vue_handles ?
+                 GEN7_GS_INCLUDE_VERTEX_HANDLES : 0) |
                 (prog_data->urb_read_length <<
                  GEN6_GS_URB_READ_LENGTH_SHIFT) |
                 (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT) |
diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index e8059c7b260..2b65b2ea949 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -1400,7 +1400,7 @@ save_BlendFunci(GLuint buf, GLenum sfactor, GLenum dfactor)
    GET_CURRENT_CONTEXT(ctx);
    Node *n;
    ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
-   n = alloc_instruction(ctx, OPCODE_BLEND_FUNC_SEPARATE_I, 3);
+   n = alloc_instruction(ctx, OPCODE_BLEND_FUNC_I, 3);
    if (n) {
       n[1].ui = buf;
       n[2].e = sfactor;
@@ -9741,6 +9741,46 @@ print_list(struct gl_context *ctx, GLuint list, const char *fname)
                    n[3].f, n[4].f, n[5].f, n[6].f,
                    get_pointer(&n[7]));
             break;
+         case OPCODE_BLEND_COLOR:
+            fprintf(f, "BlendColor %f, %f, %f, %f\n",
+                    n[1].f, n[2].f, n[3].f, n[4].f);
+            break;
+         case OPCODE_BLEND_EQUATION:
+            fprintf(f, "BlendEquation %s\n",
+                    enum_string(n[1].e));
+            break;
+         case OPCODE_BLEND_EQUATION_SEPARATE:
+            fprintf(f, "BlendEquationSeparate %s, %s\n",
+                    enum_string(n[1].e),
+                    enum_string(n[2].e));
+            break;
+         case OPCODE_BLEND_FUNC_SEPARATE:
+            fprintf(f, "BlendFuncSeparate %s, %s, %s, %s\n",
+                    enum_string(n[1].e),
+                    enum_string(n[2].e),
+                    enum_string(n[3].e),
+                    enum_string(n[4].e));
+            break;
+         case OPCODE_BLEND_EQUATION_I:
+            fprintf(f, "BlendEquationi %u, %s\n",
+                    n[1].ui, enum_string(n[2].e));
+            break;
+         case OPCODE_BLEND_EQUATION_SEPARATE_I:
+            fprintf(f, "BlendEquationSeparatei %u, %s, %s\n",
+                    n[1].ui, enum_string(n[2].e), enum_string(n[3].e));
+            break;
+         case OPCODE_BLEND_FUNC_I:
+            fprintf(f, "BlendFunci %u, %s, %s\n",
+                    n[1].ui, enum_string(n[2].e), enum_string(n[3].e));
+            break;
+         case OPCODE_BLEND_FUNC_SEPARATE_I:
+            fprintf(f, "BlendFuncSeparatei %u, %s, %s, %s, %s\n",
+                    n[1].ui,
+                    enum_string(n[2].e),
+                    enum_string(n[3].e),
+                    enum_string(n[4].e),
+                    enum_string(n[5].e));
+            break;
          case OPCODE_CALL_LIST:
             fprintf(f, "CallList %d\n", (int) n[1].ui);
             break;
@@ -9761,6 +9801,9 @@ print_list(struct gl_context *ctx, GLuint list, const char *fname)
          case OPCODE_LINE_STIPPLE:
             fprintf(f, "LineStipple %d %x\n", n[1].i, (int) n[2].us);
             break;
+         case OPCODE_LINE_WIDTH:
+            fprintf(f, "LineWidth %f\n", n[1].f);
+            break;
          case OPCODE_LOAD_IDENTITY:
             fprintf(f, "LoadIdentity\n");
             break;
@@ -9790,6 +9833,9 @@ print_list(struct gl_context *ctx, GLuint list, const char *fname)
             fprintf(f, "Ortho %g %g %g %g %g %g\n",
                          n[1].f, n[2].f, n[3].f, n[4].f, n[5].f, n[6].f);
             break;
+         case OPCODE_POINT_SIZE:
+            fprintf(f, "PointSize %f\n", n[1].f);
+            break;
          case OPCODE_POP_ATTRIB:
             fprintf(f, "PopAttrib\n");
             break;
diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index faa63825380..2ed42eaffdd 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -2275,45 +2275,16 @@ _mesa_base_tex_format(const struct gl_context *ctx, GLint internalFormat)
       ; /* fallthrough */
    }
 
-   if (ctx->Extensions.TDFX_texture_compression_FXT1) {
-      switch (internalFormat) {
-      case GL_COMPRESSED_RGB_FXT1_3DFX:
-         return GL_RGB;
-      case GL_COMPRESSED_RGBA_FXT1_3DFX:
-         return GL_RGBA;
-      default:
-         ; /* fallthrough */
-      }
+   if (_mesa_is_compressed_format(ctx, internalFormat)) {
+      GLenum base_compressed =
+         _mesa_gl_compressed_format_base_format(internalFormat);
+      if (base_compressed)
+            return base_compressed;
    }
 
-   /* Assume that the ANGLE flag will always be set if the EXT flag is set.
-    */
-   if (ctx->Extensions.ANGLE_texture_compression_dxt) {
-      switch (internalFormat) {
-      case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
-         return GL_RGB;
-      case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
-      case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
-      case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
-         return GL_RGBA;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
-   if (_mesa_is_desktop_gl(ctx)
-       && ctx->Extensions.ANGLE_texture_compression_dxt) {
-      switch (internalFormat) {
-      case GL_RGB_S3TC:
-      case GL_RGB4_S3TC:
-         return GL_RGB;
-      case GL_RGBA_S3TC:
-      case GL_RGBA4_S3TC:
-         return GL_RGBA;
-      default:
-         ; /* fallthrough */
-      }
-   }
+   if (ctx->Extensions.KHR_texture_compression_astc_ldr &&
+      _mesa_is_astc_format(internalFormat))
+        return GL_RGBA;
 
    if (ctx->Extensions.MESA_ycbcr_texture) {
       if (internalFormat == GL_YCBCR_MESA)
@@ -2390,16 +2361,10 @@ _mesa_base_tex_format(const struct gl_context *ctx, GLint internalFormat)
       case GL_SRGB8_EXT:
       case GL_COMPRESSED_SRGB_EXT:
          return GL_RGB;
-      case GL_COMPRESSED_SRGB_S3TC_DXT1_EXT:
-         return ctx->Extensions.EXT_texture_compression_s3tc ? GL_RGB : -1;
       case GL_SRGB_ALPHA_EXT:
       case GL_SRGB8_ALPHA8_EXT:
       case GL_COMPRESSED_SRGB_ALPHA_EXT:
          return GL_RGBA;
-      case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT:
-      case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT:
-      case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT:
-         return ctx->Extensions.EXT_texture_compression_s3tc ? GL_RGBA : -1;
       case GL_SLUMINANCE_ALPHA_EXT:
       case GL_SLUMINANCE8_ALPHA8_EXT:
       case GL_COMPRESSED_SLUMINANCE_ALPHA_EXT:
@@ -2544,104 +2509,6 @@ _mesa_base_tex_format(const struct gl_context *ctx, GLint internalFormat)
       }
    }
 
-   if (ctx->Extensions.ARB_texture_compression_rgtc) {
-      switch (internalFormat) {
-      case GL_COMPRESSED_RED_RGTC1:
-      case GL_COMPRESSED_SIGNED_RED_RGTC1:
-         return GL_RED;
-      case GL_COMPRESSED_RG_RGTC2:
-      case GL_COMPRESSED_SIGNED_RG_RGTC2:
-         return GL_RG;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
-   if (ctx->Extensions.EXT_texture_compression_latc) {
-      switch (internalFormat) {
-      case GL_COMPRESSED_LUMINANCE_LATC1_EXT:
-      case GL_COMPRESSED_SIGNED_LUMINANCE_LATC1_EXT:
-         return GL_LUMINANCE;
-      case GL_COMPRESSED_LUMINANCE_ALPHA_LATC2_EXT:
-      case GL_COMPRESSED_SIGNED_LUMINANCE_ALPHA_LATC2_EXT:
-         return GL_LUMINANCE_ALPHA;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
-   if (ctx->Extensions.ATI_texture_compression_3dc) {
-      switch (internalFormat) {
-      case GL_COMPRESSED_LUMINANCE_ALPHA_3DC_ATI:
-         return GL_LUMINANCE_ALPHA;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
-   if (ctx->Extensions.OES_compressed_ETC1_RGB8_texture) {
-      switch (internalFormat) {
-      case GL_ETC1_RGB8_OES:
-         return GL_RGB;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
-   if (_mesa_is_gles3(ctx) || ctx->Extensions.ARB_ES3_compatibility) {
-      switch (internalFormat) {
-      case GL_COMPRESSED_RGB8_ETC2:
-      case GL_COMPRESSED_SRGB8_ETC2:
-         return GL_RGB;
-      case GL_COMPRESSED_RGBA8_ETC2_EAC:
-      case GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC:
-      case GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
-      case GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
-         return GL_RGBA;
-      case GL_COMPRESSED_R11_EAC:
-      case GL_COMPRESSED_SIGNED_R11_EAC:
-         return GL_RED;
-      case GL_COMPRESSED_RG11_EAC:
-      case GL_COMPRESSED_SIGNED_RG11_EAC:
-         return GL_RG;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
-   if (_mesa_is_desktop_gl(ctx) &&
-       ctx->Extensions.ARB_texture_compression_bptc) {
-      switch (internalFormat) {
-      case GL_COMPRESSED_RGBA_BPTC_UNORM:
-      case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM:
-         return GL_RGBA;
-      case GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT:
-      case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT:
-         return GL_RGB;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
-   if (ctx->API == API_OPENGLES) {
-      switch (internalFormat) {
-      case GL_PALETTE4_RGB8_OES:
-      case GL_PALETTE4_R5_G6_B5_OES:
-      case GL_PALETTE8_RGB8_OES:
-      case GL_PALETTE8_R5_G6_B5_OES:
-	 return GL_RGB;
-      case GL_PALETTE4_RGBA8_OES:
-      case GL_PALETTE8_RGB5_A1_OES:
-      case GL_PALETTE4_RGBA4_OES:
-      case GL_PALETTE4_RGB5_A1_OES:
-      case GL_PALETTE8_RGBA8_OES:
-      case GL_PALETTE8_RGBA4_OES:
-	 return GL_RGBA;
-      default:
-         ; /* fallthrough */
-      }
-   }
-
    return -1; /* error */
 }
 
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index e57b98a412d..ab4fa083672 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1891,7 +1891,7 @@ struct gl_program
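-    * For vertex and geometry shaders, true if the program uses the
-    * gl_ClipDistance output.  Ignored for fragment shaders.
+    * For vertex and geometry shaders, the size of the gl_ClipDistance
+    * output array, or 0 if not present.  Ignored for fragment shaders.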
     */
-   GLboolean UsesClipDistanceOut;
+   unsigned ClipDistanceArraySize;
 
 
    /** Named parameters, constants, etc. from program text */
@@ -2619,7 +2619,6 @@ struct gl_shader_program
        * True if gl_ClipDistance is written to.  Copied into
        * gl_tess_eval_program by _mesa_copy_linked_program_data().
        */
-      GLboolean UsesClipDistance;
       GLuint ClipDistanceArraySize; /**< Size of the gl_ClipDistance array, or
                                          0 if not present. */
    } TessEval;
@@ -2642,7 +2641,6 @@ struct gl_shader_program
        * True if gl_ClipDistance is written to.  Copied into
        * gl_geometry_program by _mesa_copy_linked_program_data().
        */
-      GLboolean UsesClipDistance;
       GLuint ClipDistanceArraySize; /**< Size of the gl_ClipDistance array, or
                                          0 if not present. */
       bool UsesEndPrimitive;
@@ -2655,7 +2653,6 @@ struct gl_shader_program
        * True if gl_ClipDistance is written to.  Copied into gl_vertex_program
        * by _mesa_copy_linked_program_data().
        */
-      GLboolean UsesClipDistance;
       GLuint ClipDistanceArraySize; /**< Size of the gl_ClipDistance array, or
                                          0 if not present. */
    } Vert;
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 18e463d4ccc..765602e50db 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -2068,7 +2068,7 @@ _mesa_copy_linked_program_data(gl_shader_stage type,
 {
    switch (type) {
    case MESA_SHADER_VERTEX:
-      dst->UsesClipDistanceOut = src->Vert.UsesClipDistance;
+      dst->ClipDistanceArraySize = src->Vert.ClipDistanceArraySize;
       break;
    case MESA_SHADER_TESS_CTRL: {
       struct gl_tess_ctrl_program *dst_tcp =
@@ -2083,7 +2083,7 @@ _mesa_copy_linked_program_data(gl_shader_stage type,
       dst_tep->Spacing = src->TessEval.Spacing;
       dst_tep->VertexOrder = src->TessEval.VertexOrder;
       dst_tep->PointMode = src->TessEval.PointMode;
-      dst->UsesClipDistanceOut = src->TessEval.UsesClipDistance;
+      dst->ClipDistanceArraySize = src->TessEval.ClipDistanceArraySize;
       break;
    }
    case MESA_SHADER_GEOMETRY: {
@@ -2093,7 +2093,7 @@ _mesa_copy_linked_program_data(gl_shader_stage type,
       dst_gp->Invocations = src->Geom.Invocations;
       dst_gp->InputType = src->Geom.InputType;
       dst_gp->OutputType = src->Geom.OutputType;
-      dst->UsesClipDistanceOut = src->Geom.UsesClipDistance;
+      dst->ClipDistanceArraySize = src->Geom.ClipDistanceArraySize;
       dst_gp->UsesEndPrimitive = src->Geom.UsesEndPrimitive;
       dst_gp->UsesStreams = src->Geom.UsesStreams;
       break;
diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c
index e50964e79e4..d7671738b18 100644
--- a/src/mesa/main/texstore.c
+++ b/src/mesa/main/texstore.c
@@ -97,16 +97,16 @@ static const GLubyte map_1032[6] = { 1, 0, 3, 2, ZERO, ONE };
  * No pixel transfer operations or special texel encodings allowed.
  * 1D, 2D and 3D images supported.
  */
-static void
-memcpy_texture(struct gl_context *ctx,
-	       GLuint dimensions,
-               mesa_format dstFormat,
-               GLint dstRowStride,
-               GLubyte **dstSlices,
-               GLint srcWidth, GLint srcHeight, GLint srcDepth,
-               GLenum srcFormat, GLenum srcType,
-               const GLvoid *srcAddr,
-               const struct gl_pixelstore_attrib *srcPacking)
+void
+_mesa_memcpy_texture(struct gl_context *ctx,
+                     GLuint dimensions,
+                     mesa_format dstFormat,
+                     GLint dstRowStride,
+                     GLubyte **dstSlices,
+                     GLint srcWidth, GLint srcHeight, GLint srcDepth,
+                     GLenum srcFormat, GLenum srcType,
+                     const GLvoid *srcAddr,
+                     const struct gl_pixelstore_attrib *srcPacking)
 {
    const GLint srcRowStride = _mesa_image_row_stride(srcPacking, srcWidth,
                                                      srcFormat, srcType);
@@ -296,11 +296,11 @@ _mesa_texstore_ycbcr(TEXSTORE_PARAMS)
    assert(baseInternalFormat == GL_YCBCR_MESA);
 
    /* always just memcpy since no pixel transfer ops apply */
-   memcpy_texture(ctx, dims,
-                  dstFormat,
-                  dstRowStride, dstSlices,
-                  srcWidth, srcHeight, srcDepth, srcFormat, srcType,
-                  srcAddr, srcPacking);
+   _mesa_memcpy_texture(ctx, dims,
+                        dstFormat,
+                        dstRowStride, dstSlices,
+                        srcWidth, srcHeight, srcDepth, srcFormat, srcType,
+                        srcAddr, srcPacking);
 
    /* Check if we need byte swapping */
    /* XXX the logic here _might_ be wrong */
@@ -899,13 +899,15 @@ _mesa_texstore_memcpy(TEXSTORE_PARAMS)
       return GL_FALSE;
    }
 
-   memcpy_texture(ctx, dims,
-                  dstFormat,
-                  dstRowStride, dstSlices,
-                  srcWidth, srcHeight, srcDepth, srcFormat, srcType,
-                  srcAddr, srcPacking);
+   _mesa_memcpy_texture(ctx, dims,
+                        dstFormat,
+                        dstRowStride, dstSlices,
+                        srcWidth, srcHeight, srcDepth, srcFormat, srcType,
+                        srcAddr, srcPacking);
    return GL_TRUE;
 }
+
+
 /**
  * Store user data into texture memory.
  * Called via glTex[Sub]Image1/2/3D()
diff --git a/src/mesa/main/texstore.h b/src/mesa/main/texstore.h
index 2c974f74afb..f08dc08edde 100644
--- a/src/mesa/main/texstore.h
+++ b/src/mesa/main/texstore.h
@@ -74,6 +74,17 @@ _mesa_texstore_needs_transfer_ops(struct gl_context *ctx,
                                   GLenum baseInternalFormat,
                                   mesa_format dstFormat);
 
+extern void
+_mesa_memcpy_texture(struct gl_context *ctx,
+                     GLuint dimensions,
+                     mesa_format dstFormat,
+                     GLint dstRowStride,
+                     GLubyte **dstSlices,
+                     GLint srcWidth, GLint srcHeight, GLint srcDepth,
+                     GLenum srcFormat, GLenum srcType,
+                     const GLvoid *srcAddr,
+                     const struct gl_pixelstore_attrib *srcPacking);
+
 extern GLboolean
 _mesa_texstore_can_use_memcpy(struct gl_context *ctx,
                               GLenum baseInternalFormat, mesa_format dstFormat,
diff --git a/src/mesa/state_tracker/st_atom_rasterizer.c b/src/mesa/state_tracker/st_atom_rasterizer.c
index 0f01e9939de..55d5e66243c 100644
--- a/src/mesa/state_tracker/st_atom_rasterizer.c
+++ b/src/mesa/state_tracker/st_atom_rasterizer.c
@@ -239,7 +239,7 @@ static void update_raster_state( struct st_context *st )
 
    /* _NEW_MULTISAMPLE | _NEW_BUFFERS */
    raster->force_persample_interp =
-         st->can_force_persample_interp &&
+         !st->force_persample_in_shader &&
          ctx->Multisample._Enabled &&
          ctx->Multisample.SampleShading &&
          ctx->Multisample.MinSampleShadingValue *
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 1e880a107c0..0f9ea101889 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -64,7 +64,7 @@ update_fp( struct st_context *st )
    assert(stfp->Base.Base.Target == GL_FRAGMENT_PROGRAM_ARB);
 
    memset(&key, 0, sizeof(key));
-   key.st = st;
+   key.st = st->has_shareable_shaders ? NULL : st;
 
    /* _NEW_FRAG_CLAMP */
    key.clamp_color = st->clamp_frag_color_in_shader &&
@@ -76,7 +76,7 @@ update_fp( struct st_context *st )
     * Ignore sample qualifier while computing this flag.
     */
    key.persample_shading =
-      !st->can_force_persample_interp &&
+      st->force_persample_in_shader &&
       !(stfp->Base.Base.SystemValuesRead & (SYSTEM_BIT_SAMPLE_ID |
                                             SYSTEM_BIT_SAMPLE_POS)) &&
       _mesa_get_min_invocations_per_fragment(st->ctx, &stfp->Base, true) > 1;
@@ -119,7 +119,7 @@ update_vp( struct st_context *st )
    assert(stvp->Base.Base.Target == GL_VERTEX_PROGRAM_ARB);
 
    memset(&key, 0, sizeof key);
-   key.st = st;  /* variants are per-context */
+   key.st = st->has_shareable_shaders ? NULL : st;
 
    /* When this is true, we will add an extra input to the vertex
     * shader translation (for edgeflags), an extra output with
@@ -174,7 +174,7 @@ update_gp( struct st_context *st )
    assert(stgp->Base.Base.Target == GL_GEOMETRY_PROGRAM_NV);
 
    memset(&key, 0, sizeof(key));
-   key.st = st;
+   key.st = st->has_shareable_shaders ? NULL : st;
 
    st->gp_variant = st_get_gp_variant(st, stgp, &key);
 
@@ -210,7 +210,7 @@ update_tcp( struct st_context *st )
    assert(sttcp->Base.Base.Target == GL_TESS_CONTROL_PROGRAM_NV);
 
    memset(&key, 0, sizeof(key));
-   key.st = st;
+   key.st = st->has_shareable_shaders ? NULL : st;
 
    st->tcp_variant = st_get_tcp_variant(st, sttcp, &key);
 
@@ -246,7 +246,7 @@ update_tep( struct st_context *st )
    assert(sttep->Base.Base.Target == GL_TESS_EVALUATION_PROGRAM_NV);
 
    memset(&key, 0, sizeof(key));
-   key.st = st;
+   key.st = st->has_shareable_shaders ? NULL : st;
 
    st->tep_variant = st_get_tep_variant(st, sttep, &key);
 
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index bb6dfe85644..cbc6845d771 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -269,7 +269,7 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    struct pipe_resource *vbuf = NULL;
 
    memset(&key, 0, sizeof(key));
-   key.st = st;
+   key.st = st->has_shareable_shaders ? NULL : st;
    key.bitmap = GL_TRUE;
    key.clamp_color = st->clamp_frag_color_in_shader &&
                      st->ctx->Color._ClampFragmentColor;
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 7e8633edc1a..262ad809c58 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -395,15 +395,35 @@ make_texture(struct st_context *st,
        * Note that the image is actually going to be upside down in
        * the texture.  We deal with that with texcoords.
        */
-      success = _mesa_texstore(ctx, 2,           /* dims */
-                               baseInternalFormat, /* baseInternalFormat */
-                               mformat,          /* mesa_format */
-                               transfer->stride, /* dstRowStride, bytes */
-                               &dest,            /* destSlices */
-                               width, height, 1, /* size */
-                               format, type,     /* src format/type */
-                               pixels,           /* data source */
-                               unpack);
+      if ((format == GL_RGBA || format == GL_BGRA)
+          && type == GL_UNSIGNED_BYTE) {
+         /* Use a memcpy-based texstore to avoid software pixel swizzling.
+          * We'll do the necessary swizzling with the pipe_sampler_view to
+          * give much better performance.
+          * XXX in the future, expand this to accommodate more format and
+          * type combinations.
+          */
+         _mesa_memcpy_texture(ctx, 2,
+                              mformat,          /* mesa_format */
+                              transfer->stride, /* dstRowStride, bytes */
+                              &dest,            /* destSlices */
+                              width, height, 1, /* size */
+                              format, type,     /* src format/type */
+                              pixels,           /* data source */
+                              unpack);
+         success = GL_TRUE;
+      }
+      else {
+         success = _mesa_texstore(ctx, 2,           /* dims */
+                                  baseInternalFormat, /* baseInternalFormat */
+                                  mformat,          /* mesa_format */
+                                  transfer->stride, /* dstRowStride, bytes */
+                                  &dest,            /* destSlices */
+                                  width, height, 1, /* size */
+                                  format, type,     /* src format/type */
+                                  pixels,           /* data source */
+                                  unpack);
+      }
 
       /* unmap */
       pipe_transfer_unmap(pipe, transfer);
@@ -667,7 +687,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    /* user textures, plus the drawpix textures */
    if (fpv) {
       struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS];
-      uint num = MAX2(MAX2(fpv->drawpix_sampler, fpv->pixelmap_sampler) + 1,
+      uint num = MAX3(fpv->drawpix_sampler + 1,
+                      fpv->pixelmap_sampler + 1,
                       st->state.num_sampler_views[PIPE_SHADER_FRAGMENT]);
 
       memcpy(sampler_views, st->state.sampler_views[PIPE_SHADER_FRAGMENT],
@@ -914,7 +935,7 @@ get_color_fp_variant(struct st_context *st)
 
    memset(&key, 0, sizeof(key));
 
-   key.st = st;
+   key.st = st->has_shareable_shaders ? NULL : st;
    key.drawpixels = 1;
    key.scaleAndBias = (ctx->Pixel.RedBias != 0.0 ||
                        ctx->Pixel.RedScale != 1.0 ||
@@ -956,6 +977,69 @@ clamp_size(struct pipe_context *pipe, GLsizei *width, GLsizei *height,
 }
 
 
+/**
+ * Search the array of 4 swizzle components for the named component and return
+ * its position.  Asserts if it is not found; the fallback return value is 0.
+ */
+static unsigned
+search_swizzle(const unsigned char swizzle[4], unsigned component)
+{
+   unsigned i;
+   for (i = 0; i < 4; i++) {
+      if (swizzle[i] == component)
+         return i;
+   }
+   assert(!"search_swizzle() failed");
+   return 0;
+}
+
+
+/**
+ * Set the sampler view's swizzle terms.  This is used to handle RGBA
+ * swizzling when the incoming image format isn't an exact match for
+ * the actual texture format.  For example, if we have glDrawPixels(
+ * GL_RGBA, GL_UNSIGNED_BYTE) and we chose the texture format
+ * PIPE_FORMAT_B8G8R8A8 then we can use the sampler view swizzle to
+ * avoid swizzling all the pixels in software in the texstore code.
+ */
+static void
+setup_sampler_swizzle(struct pipe_sampler_view *sv, GLenum format, GLenum type)
+{
+   if ((format == GL_RGBA || format == GL_BGRA) && type == GL_UNSIGNED_BYTE) {
+      const struct util_format_description *desc =
+         util_format_description(sv->texture->format);
+      unsigned c0, c1, c2, c3;
+
+      /* Every gallium driver supports at least one 32-bit packed RGBA format.
+       * We must have chosen one for (GL_RGBA, GL_UNSIGNED_BYTE).
+       */
+      assert(desc->block.bits == 32);
+
+      /* invert the format's swizzle to set up the sampler's swizzle */
+      if (format == GL_RGBA) {
+         c0 = UTIL_FORMAT_SWIZZLE_X;
+         c1 = UTIL_FORMAT_SWIZZLE_Y;
+         c2 = UTIL_FORMAT_SWIZZLE_Z;
+         c3 = UTIL_FORMAT_SWIZZLE_W;
+      }
+      else {
+         assert(format == GL_BGRA);
+         c0 = UTIL_FORMAT_SWIZZLE_Z;
+         c1 = UTIL_FORMAT_SWIZZLE_Y;
+         c2 = UTIL_FORMAT_SWIZZLE_X;
+         c3 = UTIL_FORMAT_SWIZZLE_W;
+      }
+      sv->swizzle_r = search_swizzle(desc->swizzle, c0);
+      sv->swizzle_g = search_swizzle(desc->swizzle, c1);
+      sv->swizzle_b = search_swizzle(desc->swizzle, c2);
+      sv->swizzle_a = search_swizzle(desc->swizzle, c3);
+   }
+   else {
+      /* other format/type combinations keep the default sampler swizzle */
+   }
+}
+
+
 /**
  * Called via ctx->Driver.DrawPixels()
  */
@@ -974,6 +1058,7 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
    int num_sampler_view = 1;
    struct gl_pixelstore_attrib clippedUnpack;
    struct st_fp_variant *fpv = NULL;
+   struct pipe_resource *pt;
 
    /* Mesa state should be up to date by now */
    assert(ctx->NewState == 0x0);
@@ -1029,42 +1114,56 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
       st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT);
    }
 
-   /* draw with textured quad */
-   {
-      struct pipe_resource *pt
-         = make_texture(st, width, height, format, type, unpack, pixels);
-      if (pt) {
-         sv[0] = st_create_texture_sampler_view(st->pipe, pt);
-
-         if (sv[0]) {
-            /* Create a second sampler view to read stencil.
-             * The stencil is written using the shader stencil export
-             * functionality. */
-            if (write_stencil) {
-               enum pipe_format stencil_format =
-                     util_format_stencil_only(pt->format);
-               /* we should not be doing pixel map/transfer (see above) */
-               assert(num_sampler_view == 1);
-               sv[1] = st_create_texture_sampler_view_format(st->pipe, pt,
-                                                             stencil_format);
-               num_sampler_view++;
-            }
-
-            draw_textured_quad(ctx, x, y, ctx->Current.RasterPos[2],
-                               width, height,
-                               ctx->Pixel.ZoomX, ctx->Pixel.ZoomY,
-                               sv,
-                               num_sampler_view,
-                               driver_vp,
-                               driver_fp, fpv,
-                               color, GL_FALSE, write_depth, write_stencil);
-            pipe_sampler_view_reference(&sv[0], NULL);
-            if (num_sampler_view > 1)
-               pipe_sampler_view_reference(&sv[1], NULL);
-         }
-         pipe_resource_reference(&pt, NULL);
-      }
+   /* Put glDrawPixels image into a texture */
+   pt = make_texture(st, width, height, format, type, unpack, pixels);
+   if (!pt) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glDrawPixels");
+      return;
    }
+
+   /* create sampler view for the image */
+   sv[0] = st_create_texture_sampler_view(st->pipe, pt);
+   if (!sv[0]) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glDrawPixels");
+      pipe_resource_reference(&pt, NULL);
+      return;
+   }
+
+   /* Set up the sampler view's swizzle */
+   setup_sampler_swizzle(sv[0], format, type);
+
+   /* Create a second sampler view to read stencil.  The stencil is
+    * written using the shader stencil export functionality.
+    */
+   if (write_stencil) {
+      enum pipe_format stencil_format =
+         util_format_stencil_only(pt->format);
+      /* we should not be doing pixel map/transfer (see above) */
+      assert(num_sampler_view == 1);
+      sv[1] = st_create_texture_sampler_view_format(st->pipe, pt,
+                                                    stencil_format);
+      if (!sv[1]) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glDrawPixels");
+         pipe_resource_reference(&pt, NULL);
+         pipe_sampler_view_reference(&sv[0], NULL);
+         return;
+      }
+      num_sampler_view++;
+   }
+
+   draw_textured_quad(ctx, x, y, ctx->Current.RasterPos[2],
+                      width, height,
+                      ctx->Pixel.ZoomX, ctx->Pixel.ZoomY,
+                      sv,
+                      num_sampler_view,
+                      driver_vp,
+                      driver_fp, fpv,
+                      color, GL_FALSE, write_depth, write_stencil);
+   pipe_sampler_view_reference(&sv[0], NULL);
+   if (num_sampler_view > 1)
+      pipe_sampler_view_reference(&sv[1], NULL);
+
+   pipe_resource_reference(&pt, NULL);
 }
 
 
diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c
index 708bdf5011e..2c4eccf1e06 100644
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -224,6 +224,7 @@ st_program_string_notify( struct gl_context *ctx,
                                            struct gl_program *prog )
 {
    struct st_context *st = st_context(ctx);
+   gl_shader_stage stage = _mesa_program_enum_to_shader_stage(target);
 
    if (target == GL_FRAGMENT_PROGRAM_ARB) {
       struct st_fragment_program *stfp = (struct st_fragment_program *) prog;
@@ -278,10 +279,10 @@ st_program_string_notify( struct gl_context *ctx,
          st->dirty.st |= ST_NEW_TESSEVAL_PROGRAM;
    }
 
-   if (ST_DEBUG & DEBUG_PRECOMPILE)
+   if (ST_DEBUG & DEBUG_PRECOMPILE ||
+       st->shader_has_one_variant[stage])
       st_precompile_shader_variant(st, prog);
 
-   /* XXX check if program is legal, within limits */
    return GL_TRUE;
 }
 
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index bef7307bb27..5abb17385c2 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -235,9 +235,11 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
                                               PIPE_BIND_SAMPLER_VIEW);
    st->prefer_blit_based_texture_transfer = screen->get_param(screen,
                               PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER);
-   st->can_force_persample_interp = screen->get_param(screen,
-                                          PIPE_CAP_FORCE_PERSAMPLE_INTERP);
-
+   st->force_persample_in_shader =
+      screen->get_param(screen, PIPE_CAP_SAMPLE_SHADING) &&
+      !screen->get_param(screen, PIPE_CAP_FORCE_PERSAMPLE_INTERP);
+   st->has_shareable_shaders = screen->get_param(screen,
+                                                 PIPE_CAP_SHAREABLE_SHADERS);
    st->needs_texcoord_semantic =
       screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
    st->apply_texture_swizzle_to_border_color =
@@ -292,6 +294,20 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
          ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectSampler = true;
    }
 
+   /* Set which shader types can be compiled at link time. */
+   st->shader_has_one_variant[MESA_SHADER_VERTEX] =
+         st->has_shareable_shaders &&
+         !st->clamp_vert_color_in_shader;
+
+   st->shader_has_one_variant[MESA_SHADER_FRAGMENT] =
+         st->has_shareable_shaders &&
+         !st->clamp_frag_color_in_shader &&
+         !st->force_persample_in_shader;
+
+   st->shader_has_one_variant[MESA_SHADER_TESS_CTRL] = st->has_shareable_shaders;
+   st->shader_has_one_variant[MESA_SHADER_TESS_EVAL] = st->has_shareable_shaders;
+   st->shader_has_one_variant[MESA_SHADER_GEOMETRY] = st->has_shareable_shaders;
+
    _mesa_compute_version(ctx);
 
    if (ctx->Version == 0) {
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index f187d82449b..c243f5cd966 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -98,7 +98,15 @@ struct st_context
    boolean has_etc1;
    boolean has_etc2;
    boolean prefer_blit_based_texture_transfer;
-   boolean can_force_persample_interp;
+   boolean force_persample_in_shader;
+   boolean has_shareable_shaders;
+
+   /**
+    * If a shader can be created when we get its source.
+    * This means it has only 1 variant, not counting glBitmap and
+    * glDrawPixels.
+    */
+   boolean shader_has_one_variant[MESA_SHADER_STAGES];
 
    boolean needs_texcoord_semantic;
    boolean apply_texture_swizzle_to_border_color;
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index e2902923cb7..d4724b46e0a 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -249,6 +249,9 @@ void st_init_limits(struct pipe_screen *screen,
 
       if (options->EmitNoLoops)
          options->MaxUnrollIterations = MIN2(screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INSTRUCTIONS), 65536);
+      else
+         options->MaxUnrollIterations = screen->get_shader_param(screen, sh,
+                                      PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT);
 
       options->LowerClipDistance = true;
    }
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 6a69ba7aa26..75ccaf2f26b 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -395,6 +395,10 @@ st_translate_vertex_program(struct st_context *st,
    if (ureg == NULL)
       return false;
 
+   if (stvp->Base.Base.ClipDistanceArraySize)
+      ureg_property(ureg, TGSI_PROPERTY_NUM_CLIPDIST_ENABLED,
+                    stvp->Base.Base.ClipDistanceArraySize);
+
    if (ST_DEBUG & DEBUG_MESA) {
       _mesa_print_program(&stvp->Base.Base);
       _mesa_print_program_parameters(st->ctx, &stvp->Base.Base);
@@ -1049,6 +1053,10 @@ st_translate_program_common(struct st_context *st,
    memset(outputMapping, 0, sizeof(outputMapping));
    memset(out_state, 0, sizeof(*out_state));
 
+   if (prog->ClipDistanceArraySize)
+      ureg_property(ureg, TGSI_PROPERTY_NUM_CLIPDIST_ENABLED,
+                    prog->ClipDistanceArraySize);
+
    /*
     * Convert Mesa program inputs to TGSI input register semantics.
     */
@@ -1728,6 +1736,12 @@ destroy_program_variants_cb(GLuint key, void *data, void *userData)
 void
 st_destroy_program_variants(struct st_context *st)
 {
+   /* If shaders can be shared with other contexts, the last context will
+    * call DeleteProgram on all shaders, releasing everything.
+    */
+   if (st->has_shareable_shaders)
+      return;
+
    /* ARB vert/frag program */
    _mesa_HashWalk(st->ctx->Shared->Programs,
                   destroy_program_variants_cb, st);
@@ -1774,7 +1788,7 @@ st_precompile_shader_variant(struct st_context *st,
       struct st_vp_variant_key key;
 
       memset(&key, 0, sizeof(key));
-      key.st = st;
+      key.st = st->has_shareable_shaders ? NULL : st;
       st_get_vp_variant(st, p, &key);
       break;
    }
@@ -1784,7 +1798,7 @@ st_precompile_shader_variant(struct st_context *st,
       struct st_tcp_variant_key key;
 
       memset(&key, 0, sizeof(key));
-      key.st = st;
+      key.st = st->has_shareable_shaders ? NULL : st;
       st_get_tcp_variant(st, p, &key);
       break;
    }
@@ -1794,7 +1808,7 @@ st_precompile_shader_variant(struct st_context *st,
       struct st_tep_variant_key key;
 
       memset(&key, 0, sizeof(key));
-      key.st = st;
+      key.st = st->has_shareable_shaders ? NULL : st;
       st_get_tep_variant(st, p, &key);
       break;
    }
@@ -1804,7 +1818,7 @@ st_precompile_shader_variant(struct st_context *st,
       struct st_gp_variant_key key;
 
       memset(&key, 0, sizeof(key));
-      key.st = st;
+      key.st = st->has_shareable_shaders ? NULL : st;
       st_get_gp_variant(st, p, &key);
       break;
    }
@@ -1814,7 +1828,7 @@ st_precompile_shader_variant(struct st_context *st,
       struct st_fp_variant_key key;
 
       memset(&key, 0, sizeof(key));
-      key.st = st;
+      key.st = st->has_shareable_shaders ? NULL : st;
       st_get_fp_variant(st, p, &key);
       break;
    }
diff --git a/src/mesa/tnl/t_vb_rendertmp.h b/src/mesa/tnl/t_vb_rendertmp.h
index 44dee763594..4bfc6b15d3b 100644
--- a/src/mesa/tnl/t_vb_rendertmp.h
+++ b/src/mesa/tnl/t_vb_rendertmp.h
@@ -124,19 +124,19 @@ static void TAG(render_line_loop)( struct gl_context *ctx,
    GLuint i;
    LOCAL_VARS;
 
-   (void) flags;
-
    INIT(GL_LINE_LOOP);
 
    if (start+1 < count) {
       if (TEST_PRIM_BEGIN(flags)) {
 	 RESET_STIPPLE;
+         /* draw the first line from v[0] to v[1] */
          if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION_EXT)
             RENDER_LINE( ELT(start), ELT(start+1) );
          else
             RENDER_LINE( ELT(start+1), ELT(start) );
       }
 
+      /* draw lines from v[1] to v[n-1] */
       for ( i = start+2 ; i < count ; i++) {
          if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION_EXT)
             RENDER_LINE( ELT(i-1), ELT(i) );
@@ -145,6 +145,7 @@ static void TAG(render_line_loop)( struct gl_context *ctx,
       }
 
       if ( TEST_PRIM_END(flags)) {
+         /* draw final line from v[n-1] to v[0] (the very first vertex) */
          if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION_EXT)
             RENDER_LINE( ELT(count-1), ELT(start) );
          else
diff --git a/src/mesa/vbo/vbo_context.h b/src/mesa/vbo/vbo_context.h
index a376efe34a7..e6b9d890d5f 100644
--- a/src/mesa/vbo/vbo_context.h
+++ b/src/mesa/vbo/vbo_context.h
@@ -196,6 +196,26 @@ vbo_get_default_vals_as_union(GLenum format)
    }
 }
 
+
+/**
+ * Compute the max number of vertices which can be stored in
+ * a vertex buffer, given the current vertex size, and the amount
+ * of space already used.  One slot is always held back for the extra
+ * vertex needed by the GL_LINE_LOOP -> GL_LINE_STRIP conversion.
+ * \return the usable vertex count, or 0 when no room is left.
+ */
+static inline unsigned
+vbo_compute_max_verts(const struct vbo_exec_context *exec)
+{
+   unsigned n = (VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) /
+      (exec->vtx.vertex_size * sizeof(GLfloat));
+   /* n is unsigned: decrementing a zero count would wrap to UINT_MAX
+    * (an assert(n > 0) vanishes in NDEBUG builds), so clamp at 0.
+    */
+   return n > 0 ? n - 1 : 0;
+}
+
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/src/mesa/vbo/vbo_exec.h b/src/mesa/vbo/vbo_exec.h
index 00378eb7984..a80b2c908d1 100644
--- a/src/mesa/vbo/vbo_exec.h
+++ b/src/mesa/vbo/vbo_exec.h
@@ -160,8 +160,6 @@ void vbo_exec_vtx_flush( struct vbo_exec_context *exec, GLboolean unmap );
 void vbo_exec_vtx_map( struct vbo_exec_context *exec );
 
 
-void vbo_exec_vtx_wrap( struct vbo_exec_context *exec );
-
 void vbo_exec_eval_update( struct vbo_exec_context *exec );
 
 void vbo_exec_do_EvalCoord2f( struct vbo_exec_context *exec, 
diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index 7ae08fe3062..a23d5aa08aa 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -61,7 +61,8 @@ static void reset_attrfv( struct vbo_exec_context *exec );
 
 /**
  * Close off the last primitive, execute the buffer, restart the
- * primitive.  
+ * primitive.  This is called when we fill a vertex buffer before
+ * hitting glEnd.
  */
 static void vbo_exec_wrap_buffers( struct vbo_exec_context *exec )
 {
@@ -71,17 +72,31 @@ static void vbo_exec_wrap_buffers( struct vbo_exec_context *exec )
       exec->vtx.buffer_ptr = exec->vtx.buffer_map;
    }
    else {
-      GLuint last_begin = exec->vtx.prim[exec->vtx.prim_count-1].begin;
+      struct _mesa_prim *last_prim = &exec->vtx.prim[exec->vtx.prim_count - 1];
+      const GLuint last_begin = last_prim->begin;
       GLuint last_count;
 
       if (_mesa_inside_begin_end(exec->ctx)) {
-	 GLint i = exec->vtx.prim_count - 1;
-	 assert(i >= 0);
-	 exec->vtx.prim[i].count = (exec->vtx.vert_count - 
-				    exec->vtx.prim[i].start);
+	 last_prim->count = exec->vtx.vert_count - last_prim->start;
       }
 
-      last_count = exec->vtx.prim[exec->vtx.prim_count-1].count;
+      last_count = last_prim->count;
+
+      /* Special handling for wrapping GL_LINE_LOOP */
+      if (last_prim->mode == GL_LINE_LOOP &&
+          last_count > 0 &&
+          !last_prim->end) {
+         /* draw this section of the incomplete line loop as a line strip */
+         last_prim->mode = GL_LINE_STRIP;
+         if (!last_prim->begin) {
+            /* This is not the first section of the line loop, so don't
+             * draw the 0th vertex.  We're saving it until we draw the
+             * very last section of the loop.
+             */
+            last_prim->start++;
+            last_prim->count--;
+         }
+      }
 
       /* Execute the buffer and save copied vertices.
        */
@@ -98,6 +113,7 @@ static void vbo_exec_wrap_buffers( struct vbo_exec_context *exec )
 
       if (_mesa_inside_begin_end(exec->ctx)) {
 	 exec->vtx.prim[0].mode = exec->ctx->Driver.CurrentExecPrimitive;
+	 exec->vtx.prim[0].begin = 0;
 	 exec->vtx.prim[0].start = 0;
 	 exec->vtx.prim[0].count = 0;
 	 exec->vtx.prim_count++;
@@ -113,7 +129,8 @@ static void vbo_exec_wrap_buffers( struct vbo_exec_context *exec )
  * Deal with buffer wrapping where provoked by the vertex buffer
  * filling up, as opposed to upgrade_vertex().
  */
-void vbo_exec_vtx_wrap( struct vbo_exec_context *exec )
+static void
+vbo_exec_vtx_wrap(struct vbo_exec_context *exec)
 {
    fi_type *data = exec->vtx.copied.buffer;
    GLuint i;
@@ -292,8 +309,7 @@ vbo_exec_wrap_upgrade_vertex(struct vbo_exec_context *exec,
     */
    exec->vtx.attrsz[attr] = newSize;
    exec->vtx.vertex_size += newSize - oldSize;
-   exec->vtx.max_vert = ((VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) / 
-                         (exec->vtx.vertex_size * sizeof(GLfloat)));
+   exec->vtx.max_vert = vbo_compute_max_verts(exec);
    exec->vtx.vert_count = 0;
    exec->vtx.buffer_ptr = exec->vtx.buffer_map;
 
@@ -446,10 +462,6 @@ do {									\
                                                                         \
    assert(sz == 1 || sz == 2);                                          \
                                                                         \
-   if (unlikely(!(ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT))) {     \
-      vbo_exec_begin_vertices(ctx);					\
-   }									\
-                                                                        \
    /* check if attribute size or type is changing */                    \
    if (unlikely(exec->vtx.active_sz[A] != N * sz) ||                    \
        unlikely(exec->vtx.attrtype[A] != T)) {                          \
@@ -470,6 +482,15 @@ do {									\
       /* This is a glVertex call */					\
       GLuint i;								\
 									\
+      if (unlikely((ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT) == 0)) { \
+         vbo_exec_begin_vertices(ctx);                                  \
+      }                                                                 \
+                                                                        \
+      if (unlikely(!exec->vtx.buffer_ptr)) {                            \
+         vbo_exec_vtx_map(exec);                                        \
+      }                                                                 \
+      assert(exec->vtx.buffer_ptr);                                     \
+                                                                        \
       /* copy 32-bit words */                                           \
       for (i = 0; i < exec->vtx.vertex_size; i++)			\
 	 exec->vtx.buffer_ptr[i] = exec->vtx.vertex[i];			\
@@ -482,7 +503,10 @@ do {									\
 									\
       if (++exec->vtx.vert_count >= exec->vtx.max_vert)			\
 	 vbo_exec_vtx_wrap( exec );					\
-   }									\
+   } else {                                                             \
+      /* we now have accumulated per-vertex attributes */               \
+      ctx->Driver.NeedFlush |= FLUSH_UPDATE_CURRENT;                    \
+   }                                                                    \
 } while (0)
 
 #define ERROR(err) _mesa_error( ctx, err, __func__ )
@@ -814,11 +838,28 @@ static void GLAPIENTRY vbo_exec_End( void )
 
    if (exec->vtx.prim_count > 0) {
       /* close off current primitive */
-      int idx = exec->vtx.vert_count;
-      int i = exec->vtx.prim_count - 1;
+      struct _mesa_prim *last_prim = &exec->vtx.prim[exec->vtx.prim_count - 1];
 
-      exec->vtx.prim[i].end = 1;
-      exec->vtx.prim[i].count = idx - exec->vtx.prim[i].start;
+      last_prim->end = 1;
+      last_prim->count = exec->vtx.vert_count - last_prim->start;
+
+      /* Special handling for GL_LINE_LOOP */
+      if (last_prim->mode == GL_LINE_LOOP && last_prim->begin == 0) {
+         /* We're finishing drawing a line loop.  Append 0th vertex onto
+          * end of vertex buffer so we can draw it as a line strip.
+          */
+         const fi_type *src = exec->vtx.buffer_map;
+         fi_type *dst = exec->vtx.buffer_map +
+            exec->vtx.vert_count * exec->vtx.vertex_size;
+
+         /* copy 0th vertex to end of buffer */
+         memcpy(dst, src, exec->vtx.vertex_size * sizeof(fi_type));
+
+         assert(last_prim->start == 0);
+         last_prim->start++;  /* skip vertex0 */
+         /* note that last_prim->count stays unchanged */
+         last_prim->mode = GL_LINE_STRIP;
+      }
 
       try_vbo_merge(exec);
    }
diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index 174cbc37c26..ed5d9e947b0 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -64,20 +64,23 @@ vbo_exec_debug_verts( struct vbo_exec_context *exec )
 }
 
 
-/*
- * NOTE: Need to have calculated primitives by this point -- do it on the fly.
- * NOTE: Old 'parity' issue is gone.
+/**
+ * Copy zero, one or two vertices from the current vertex buffer into
+ * the temporary "copy" buffer.
+ * This is used when a single primitive overflows a vertex buffer and
+ * we need to continue the primitive in a new vertex buffer.
+ * The temporary "copy" buffer holds the vertices which need to get
+ * copied from the old buffer to the new one.
  */
 static GLuint
 vbo_copy_vertices( struct vbo_exec_context *exec )
 {
-   GLuint nr = exec->vtx.prim[exec->vtx.prim_count-1].count;
+   struct _mesa_prim *last_prim = &exec->vtx.prim[exec->vtx.prim_count - 1];
+   const GLuint nr = last_prim->count;
    GLuint ovf, i;
-   GLuint sz = exec->vtx.vertex_size;
+   const GLuint sz = exec->vtx.vertex_size;
    fi_type *dst = exec->vtx.copied.buffer;
-   const fi_type *src = (exec->vtx.buffer_map +
-                         exec->vtx.prim[exec->vtx.prim_count-1].start * 
-                         exec->vtx.vertex_size);
+   const fi_type *src = exec->vtx.buffer_map + last_prim->start * sz;
 
    switch (exec->ctx->Driver.CurrentExecPrimitive) {
    case GL_POINTS:
@@ -106,6 +109,17 @@ vbo_copy_vertices( struct vbo_exec_context *exec )
 	 return 1;
       }
    case GL_LINE_LOOP:
+      if (last_prim->begin == 0) {
+         /* We're dealing with the second or later section of a split/wrapped
+          * GL_LINE_LOOP.  Since we're converting line loops to line strips,
+          * we've already incremented the last_prim->start counter by one to
+          * skip the 0th vertex in the loop.  We need to undo that (effectively
+          * subtract one from last_prim->start) so that we copy the 0th vertex
+          * to the next vertex buffer.
+          */
+         src -= sz;
+      }
+      /* fall-through */
    case GL_TRIANGLE_FAN:
    case GL_POLYGON:
       if (nr == 0) {
@@ -123,7 +137,7 @@ vbo_copy_vertices( struct vbo_exec_context *exec )
    case GL_TRIANGLE_STRIP:
       /* no parity issue, but need to make sure the tri is not drawn twice */
       if (nr & 1) {
-	 exec->vtx.prim[exec->vtx.prim_count-1].count--;
+	 last_prim->count--;
       }
       /* fallthrough */
    case GL_QUAD_STRIP:
@@ -432,8 +446,7 @@ vbo_exec_vtx_flush(struct vbo_exec_context *exec, GLboolean keepUnmapped)
    if (keepUnmapped || exec->vtx.vertex_size == 0)
       exec->vtx.max_vert = 0;
    else
-      exec->vtx.max_vert = ((VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) /
-                            (exec->vtx.vertex_size * sizeof(GLfloat)));
+      exec->vtx.max_vert = vbo_compute_max_verts(exec);
 
    exec->vtx.buffer_ptr = exec->vtx.buffer_map;
    exec->vtx.prim_count = 0;
diff --git a/src/mesa/vbo/vbo_save_api.c b/src/mesa/vbo/vbo_save_api.c
index fdc677f9a07..d49aa15b1b7 100644
--- a/src/mesa/vbo/vbo_save_api.c
+++ b/src/mesa/vbo/vbo_save_api.c
@@ -330,8 +330,7 @@ _save_reset_counters(struct gl_context *ctx)
  * previous prim.
  */
 static void
-merge_prims(struct gl_context *ctx,
-            struct _mesa_prim *prim_list,
+merge_prims(struct _mesa_prim *prim_list,
             GLuint *prim_count)
 {
    GLuint i;
@@ -361,6 +360,51 @@ merge_prims(struct gl_context *ctx,
    *prim_count = prev_prim - prim_list + 1;
 }
 
+
+/**
+ * Rewrite the last primitive of \p node, which must be a GL_LINE_LOOP,
+ * as a GL_LINE_STRIP so that drivers don't have to worry about handling
+ * the _mesa_prim::begin/end flags.  The vertex and buffer counters in
+ * \p save and \p node are updated when a closing vertex is appended.
+ * See https://bugs.freedesktop.org/show_bug.cgi?id=81174
+ */
+static void
+convert_line_loop_to_strip(struct vbo_save_context *save,
+                           struct vbo_save_vertex_list *node)
+{
+   struct _mesa_prim *loop = &node->prim[node->prim_count - 1];
+   const GLuint vsize = save->vertex_size;
+
+   assert(loop->mode == GL_LINE_LOOP);
+
+   if (loop->end) {
+      /* The loop is being closed: duplicate its 0th vertex just past
+       * its last vertex and account for the extra vertex everywhere.
+       */
+      const fi_type *first = save->buffer + loop->start * vsize;
+      fi_type *past_last = save->buffer + (loop->start + loop->count) * vsize;
+
+      memcpy(past_last, first, vsize * sizeof(float));
+
+      loop->count += 1;
+      node->count += 1;
+      save->vert_count += 1;
+      save->buffer_ptr += vsize;
+      save->vertex_store->used += vsize;
+   }
+
+   if (loop->begin == 0) {
+      /* Second or later section of a long line loop:
+       * skip the 0th vertex.
+       */
+      loop->start += 1;
+      loop->count -= 1;
+   }
+
+   loop->mode = GL_LINE_STRIP;
+}
+
+
 /**
  * Insert the active immediate struct onto the display list currently
  * being built.
@@ -442,7 +486,11 @@ _save_compile_vertex_list(struct gl_context *ctx)
     */
    save->copied.nr = _save_copy_vertices(ctx, node, save->buffer);
 
-   merge_prims(ctx, node->prim, &node->prim_count);
+   if (node->prim[node->prim_count - 1].mode == GL_LINE_LOOP) {
+      convert_line_loop_to_strip(save, node);
+   }
+
+   merge_prims(node->prim, &node->prim_count);
 
    /* Deal with GL_COMPILE_AND_EXECUTE:
     */
@@ -483,6 +531,10 @@ _save_compile_vertex_list(struct gl_context *ctx)
       save->buffer_ptr = vbo_save_map_vertex_store(ctx, save->vertex_store);
       save->out_of_memory = save->buffer_ptr == NULL;
    }
+   else {
+      /* update buffer_ptr for next vertex */
+      save->buffer_ptr = save->vertex_store->buffer + save->vertex_store->used;
+   }
 
    if (save->prim_store->used > VBO_SAVE_PRIM_SIZE - 6) {
       save->prim_store->refcount--;