diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h index 08be8e6d061..227f16ef240 100644 --- a/src/gallium/drivers/svga/svga_context.h +++ b/src/gallium/drivers/svga/svga_context.h @@ -98,7 +98,6 @@ enum svga_hud { #define SVGA_MAX_CONST_BUF_SIZE (4096 * 4 * sizeof(int)) #define CONST0_UPLOAD_ALIGNMENT 256 - #define SVGA_MAX_IMAGES SVGA3D_MAX_UAVIEWS #define SVGA_MAX_SHADER_BUFFERS SVGA3D_MAX_UAVIEWS #define SVGA_MAX_ATOMIC_BUFFERS SVGA3D_MAX_UAVIEWS @@ -624,6 +623,9 @@ struct svga_context /** bitmasks of which const buffers are changed */ unsigned dirty_constbufs[PIPE_SHADER_TYPES]; + /** bitmasks of which const buffers to be bound as raw buffers */ + unsigned raw_constbufs[PIPE_SHADER_TYPES]; + unsigned texture_timestamp; unsigned uav_timestamp[2]; @@ -967,6 +969,21 @@ svga_rects_equal(const SVGA3dRect *r1, const SVGA3dRect *r2) return memcmp(r1, r2, sizeof(*r1)) == 0; } + +/* A helper function to return TRUE if sampler state mapping is + * to be used. Sampler state mapping is used in GL43 context + * if the number of sampler states exceeds the SVGA device limit or + * the sampler state mapping environment variable is set. + */ +static inline boolean +svga_use_sampler_state_mapping(const struct svga_context *svga, + unsigned num_sampler_states) +{ + return svga_have_gl43(svga) && + (svga_screen(svga->pipe.screen)->debug.sampler_state_mapping || + num_sampler_states > SVGA3D_DX_MAX_SAMPLERS); +} + /** * If the Gallium HUD is enabled, this will return the current time. * Otherwise, just return zero. diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index 76368fb927f..9bdea1cf85e 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -639,13 +639,13 @@ vgpu10_get_shader_param(struct pipe_screen *screen, if (shader == PIPE_SHADER_FRAGMENT) return VGPU10_MAX_FS_INPUTS; else if (shader == PIPE_SHADER_GEOMETRY) - return VGPU10_MAX_GS_INPUTS; + return svgascreen->max_gs_inputs; else if (shader == PIPE_SHADER_TESS_CTRL) return VGPU11_MAX_HS_INPUT_CONTROL_POINTS; else if (shader == PIPE_SHADER_TESS_EVAL) return VGPU11_MAX_DS_INPUT_CONTROL_POINTS; else - return VGPU10_MAX_VS_INPUTS; + return svgascreen->max_vs_inputs; case PIPE_SHADER_CAP_MAX_OUTPUTS: if (shader == PIPE_SHADER_FRAGMENT) return VGPU10_MAX_FS_OUTPUTS; @@ -656,7 +656,8 @@ vgpu10_get_shader_param(struct pipe_screen *screen, else if (shader == PIPE_SHADER_TESS_EVAL) return VGPU11_MAX_DS_OUTPUTS; else - return VGPU10_MAX_VS_OUTPUTS; + return svgascreen->max_vs_outputs; + case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: return VGPU10_MAX_CONSTANT_BUFFER_ELEMENT_COUNT * sizeof(float[4]); case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: @@ -973,6 +974,9 @@ svga_screen_create(struct svga_winsys_screen *sws) goto error2; } + svgascreen->debug.sampler_state_mapping = + debug_get_bool_option("SVGA_SAMPLER_STATE_MAPPING", FALSE); + debug_printf("%s enabled\n", sws->have_sm5 ? "SM5" : sws->have_sm4_1 ? 
"SM4_1" : @@ -1060,6 +1064,18 @@ svga_screen_create(struct svga_winsys_screen *sws) screen->is_format_supported = svga_is_dx_format_supported; svgascreen->max_viewports = SVGA3D_DX_MAX_VIEWPORTS; + + /* Shader limits */ + if (sws->have_sm4_1) { + svgascreen->max_vs_inputs = VGPU10_1_MAX_VS_INPUTS; + svgascreen->max_vs_outputs = VGPU10_1_MAX_VS_OUTPUTS; + svgascreen->max_gs_inputs = VGPU10_1_MAX_GS_INPUTS; + } + else { + svgascreen->max_vs_inputs = VGPU10_MAX_VS_INPUTS; + svgascreen->max_vs_outputs = VGPU10_MAX_VS_OUTPUTS; + svgascreen->max_gs_inputs = VGPU10_MAX_GS_INPUTS; + } } else { /* VGPU9 */ @@ -1097,6 +1113,11 @@ svga_screen_create(struct svga_winsys_screen *sws) /* Only one viewport */ svgascreen->max_viewports = 1; + + /* Shader limits */ + svgascreen->max_vs_inputs = 16; + svgascreen->max_vs_outputs = 10; + svgascreen->max_gs_inputs = 0; } /* common VGPU9 / VGPU10 caps */ diff --git a/src/gallium/drivers/svga/svga_screen.h b/src/gallium/drivers/svga/svga_screen.h index aa0001b11e5..c48b0f97dad 100644 --- a/src/gallium/drivers/svga/svga_screen.h +++ b/src/gallium/drivers/svga/svga_screen.h @@ -58,14 +58,20 @@ struct svga_screen unsigned max_const_buffers; unsigned max_viewports; unsigned ms_samples; + unsigned max_vs_inputs; + unsigned max_vs_outputs; + unsigned max_gs_inputs; struct { - boolean force_level_surface_view; - boolean force_surface_view; - boolean no_surface_view; - boolean force_sampler_view; - boolean no_sampler_view; - boolean no_cache_index_buffers; + unsigned force_level_surface_view:1; + unsigned force_surface_view:1; + unsigned no_surface_view:1; + unsigned force_sampler_view:1; + unsigned no_sampler_view:1; + unsigned no_cache_index_buffers:1; + unsigned tessellation:1; + unsigned sampler_state_mapping:1; + unsigned pad:24; } debug; unsigned texture_timestamp; diff --git a/src/gallium/drivers/svga/svga_shader.c b/src/gallium/drivers/svga/svga_shader.c index 3c48d6724f4..0253cd6931e 100644 --- a/src/gallium/drivers/svga/svga_shader.c +++ b/src/gallium/drivers/svga/svga_shader.c @@ -223,6 +223,16 @@ static const enum pipe_swizzle set_XXXY[PIPE_SWIZZLE_MAX] = { PIPE_SWIZZLE_NONE }; +static const enum pipe_swizzle set_YYYY[PIPE_SWIZZLE_MAX] = { + PIPE_SWIZZLE_Y, + PIPE_SWIZZLE_Y, + PIPE_SWIZZLE_Y, + PIPE_SWIZZLE_Y, + PIPE_SWIZZLE_0, + PIPE_SWIZZLE_1, + PIPE_SWIZZLE_NONE +}; + static VGPU10_RESOURCE_RETURN_TYPE vgpu10_return_type(enum pipe_format format) @@ -242,6 +252,17 @@ vgpu10_return_type(enum pipe_format format) } +/** + * A helper function to return TRUE if the specified format + * is a supported format for sample_c instruction. + */ +static bool +isValidSampleCFormat(enum pipe_format format) +{ + return util_format_is_depth_or_stencil(format); +} + + /** * Initialize the shader-neutral fields of svga_compile_key from context * state. This is basically the texture-related state. @@ -253,15 +274,28 @@ svga_init_shader_key_common(const struct svga_context *svga, struct svga_compile_key *key) { unsigned i, idx = 0; + unsigned sampler_slots = 0; assert(shader_type < ARRAY_SIZE(svga->curr.num_sampler_views)); /* In case the number of samplers and sampler_views doesn't match, - * loop over the lower of the two counts. + * loop over the upper of the two counts. */ key->num_textures = MAX2(svga->curr.num_sampler_views[shader_type], svga->curr.num_samplers[shader_type]); + key->num_samplers = 0; + + /* Set sampler_state_mapping only if GL43 is supported and + * the number of samplers exceeds SVGA limit or the sampler state + * mapping env is set. 
+    */
+   boolean sampler_state_mapping =
+      svga_use_sampler_state_mapping(svga, svga->curr.num_samplers[shader_type]);
+
+   key->sampler_state_mapping =
+      key->num_textures && sampler_state_mapping ? 1 : 0;
+
   for (i = 0; i < key->num_textures; i++) {
      struct pipe_sampler_view *view = svga->curr.sampler_views[shader_type][i];
      const struct svga_sampler_state
@@ -269,22 +303,21 @@ svga_init_shader_key_common(const struct svga_context *svga,
      if (view) {
         assert(view->texture);
-        assert(view->texture->target < (1 << 4)); /* texture_target:4 */
         enum pipe_texture_target target = view->target;
+        assert(target < (1 << 4)); /* texture_target:4 */
         key->tex[i].target = target;
         key->tex[i].sampler_return_type = vgpu10_return_type(view->format);
         key->tex[i].sampler_view = 1;
-
         /* 1D/2D array textures with one slice and cube map array textures
          * with one cube are treated as non-arrays by the SVGA3D device.
          * Set the is_array flag only if we know that we have more than 1
          * element. This will be used to select shader instruction/resource
          * types during shader translation.
          */
-        switch (view->texture->target) {
+        switch (target) {
         case PIPE_TEXTURE_1D_ARRAY:
         case PIPE_TEXTURE_2D_ARRAY:
            key->tex[i].is_array = view->texture->array_size > 1;
@@ -300,10 +333,12 @@ svga_init_shader_key_common(const struct svga_context *svga,
         key->tex[i].num_samples = view->texture->nr_samples;
         const enum pipe_swizzle *swizzle_tab;
-        if (view->texture->target == PIPE_BUFFER) {
+        if (target == PIPE_BUFFER) {
           SVGA3dSurfaceFormat svga_format;
           unsigned tf_flags;
+
+          assert(view->texture->target == PIPE_BUFFER);
+
           /* Apply any special swizzle mask for the view format if needed */
           svga_translate_texture_buffer_view_format(view->format,
@@ -334,11 +369,24 @@ svga_init_shader_key_common(const struct svga_context *svga,
               view->texture->format == PIPE_FORMAT_DXT1_SRGB)
              swizzle_tab = set_alpha;
+          if (view->format == PIPE_FORMAT_X24S8_UINT ||
+              view->format == PIPE_FORMAT_X32_S8X24_UINT)
+             swizzle_tab = set_YYYY;
+
           /* Save the compare function as we need to handle
            * depth compare in the shader.
            */
           key->tex[i].compare_mode = sampler->compare_mode;
           key->tex[i].compare_func = sampler->compare_func;
+
+          /* Set the compare_in_shader bit if the view format
+           * is not a supported format for shadow compare.
+           * In this case, we'll do the comparison in the shader.
+           */
+          if ((sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) &&
+              !isValidSampleCFormat(view->format)) {
+             key->tex[i].compare_in_shader = TRUE;
+          }
        }
        key->tex[i].swizzle_r = swizzle_tab[view->swizzle_r];
@@ -364,6 +412,139 @@ svga_init_shader_key_common(const struct svga_context *svga,
           key->tex[i].texel_bias = TRUE;
        }
     }
+
+      if (!sampler_state_mapping) {
+         /* Use the same index if sampler state mapping is not supported */
+         key->tex[i].sampler_index = i;
+         key->num_samplers = i + 1;
+      }
+      else {
+
+         /* The current samplers list can have redundant entries.
+          * In order to keep the number of bound samplers within the
+          * max limit supported by SVGA, we'll recreate the list with
+          * unique sampler state objects only.
+          */
+
+         /* Check to see if this sampler is already on the list.
+          * If so, reuse the sampler index assigned to that entry.
+          */
+         for (unsigned j = 0; j <= i; j++) {
+            if (svga->curr.sampler[shader_type][j] == sampler) {
+
+               if (!(sampler_slots & (1 << j))) {
+
+                  /* If this sampler has not been added to the new list yet,
+                   * set its sampler index to the next sampler index,
+                   * increment the sampler count, and mark this
+                   * sampler as added to the list.
+ */ + + unsigned next_index = + MIN2(key->num_samplers, SVGA3D_DX_MAX_SAMPLERS-1); + + key->tex[i].sampler_index = next_index; + key->num_samplers = next_index + 1; + + if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + /* reserve one slot for the alternate sampler */ + key->num_samplers++; + } + + sampler_slots |= (1 << j); + } + else { + key->tex[i].sampler_index = key->tex[j].sampler_index; + } + break; + } + } + } + } + } + + if (svga_have_gl43(svga)) { + if (shader->info.images_declared || + shader->info.shader_buffers_declared) { + + /* Save the uavSpliceIndex which is the index used for the first uav + * in the draw pipeline. For compute, uavSpliceIndex is always 0. + */ + if (shader_type != PIPE_SHADER_COMPUTE) + key->uav_splice_index = svga->state.hw_draw.uavSpliceIndex; + + unsigned uav_splice_index = key->uav_splice_index; + + /* Also get the texture data type to be used in the uav declaration */ + struct svga_image_view *cur_image_view = + &svga->curr.image_views[shader_type][0]; + + for (unsigned i = 0; i < ARRAY_SIZE(svga->curr.image_views[shader_type]); + i++, cur_image_view++) { + + struct pipe_resource *resource = cur_image_view->desc.resource; + + if (resource) { + key->images[i].return_type = + svga_get_texture_datatype(cur_image_view->desc.format); + + key->images[i].is_array = resource->array_size > 1; + + /* Save the image resource target in the shader key because + * for single layer image view, the resource target in the + * tgsi shader is changed to a different texture target. + */ + key->images[i].resource_target = resource->target; + if (resource->target == PIPE_TEXTURE_3D || + resource->target == PIPE_TEXTURE_1D_ARRAY || + resource->target == PIPE_TEXTURE_2D_ARRAY || + resource->target == PIPE_TEXTURE_CUBE || + resource->target == PIPE_TEXTURE_CUBE_ARRAY) { + key->images[i].is_single_layer = + cur_image_view->desc.u.tex.first_layer == + cur_image_view->desc.u.tex.last_layer; + } + + key->images[i].uav_index = cur_image_view->uav_index + uav_splice_index; + } + else + key->images[i].uav_index = SVGA3D_INVALID_ID; + } + + struct svga_shader_buffer *cur_sbuf = + &svga->curr.shader_buffers[shader_type][0]; + + for (unsigned i = 0; i < ARRAY_SIZE(svga->curr.shader_buffers[shader_type]); + i++, cur_sbuf++) { + + if (cur_sbuf->resource) + key->shader_buf_uav_index[i] = cur_sbuf->uav_index + uav_splice_index; + else + key->shader_buf_uav_index[i] = SVGA3D_INVALID_ID; + } + + struct svga_shader_buffer *cur_buf = &svga->curr.atomic_buffers[0]; + + for (unsigned i = 0; i < ARRAY_SIZE(svga->curr.atomic_buffers); + i++, cur_buf++) { + + if (cur_buf->resource) + key->atomic_buf_uav_index[i] = cur_buf->uav_index + uav_splice_index; + else + key->atomic_buf_uav_index[i] = SVGA3D_INVALID_ID; + } + } + + /* Save info about which constant buffers are to be viewed + * as raw buffers in the shader key. 
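
Aside: the deduplication pass above is easier to see in a freestanding form. The sketch below captures the same bookkeeping with a plain pointer array standing in for svga->curr.sampler[]; map_unique_samplers, MAX_SAMPLERS, and needs_alternate are invented names for illustration (this is not driver code), and count is assumed to be at most 32 so a bitmask suffices:

    #include <stdbool.h>

    #define MAX_SAMPLERS 16   /* stand-in for SVGA3D_DX_MAX_SAMPLERS */

    /* Map each (possibly duplicated) sampler-state pointer to a unique
     * slot index, reserving an extra slot after any sampler that needs
     * an alternate (shadow-compare) state. Returns the slot count used.
     */
    static unsigned
    map_unique_samplers(const void *samplers[], const bool needs_alternate[],
                        unsigned count, unsigned slot_out[])
    {
       unsigned seen = 0;       /* bitmask of samplers already placed */
       unsigned num_slots = 0;

       for (unsigned i = 0; i < count; i++) {
          for (unsigned j = 0; j <= i; j++) {
             if (samplers[j] == samplers[i]) {
                if (!(seen & (1u << j))) {
                   /* first time we see this state: take the next slot */
                   unsigned next = num_slots < MAX_SAMPLERS - 1
                                      ? num_slots : MAX_SAMPLERS - 1;
                   slot_out[i] = next;
                   num_slots = next + 1;
                   if (needs_alternate[j])
                      num_slots++;   /* reserve slot for alternate state */
                   seen |= 1u << j;
                } else {
                   slot_out[i] = slot_out[j];   /* reuse the earlier slot */
                }
                break;
             }
          }
       }
       return num_slots;
    }

Four views sharing one sampler state all map to slot 0 and consume a single slot; a shadow-compare sampler additionally reserves the slot right after its own for the alternate state, matching the key->num_samplers++ above.
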
+ */ + if (shader->info.const_buffers_declared & + svga->state.raw_constbufs[shader_type]) { + key->raw_buffers = svga->state.raw_constbufs[shader_type]; + + /* beginning index for srv for raw buffers */ + key->srv_raw_buf_index = PIPE_MAX_SAMPLERS; } } diff --git a/src/gallium/drivers/svga/svga_shader.h b/src/gallium/drivers/svga/svga_shader.h index dafcbc021e1..fe44101c6ba 100644 --- a/src/gallium/drivers/svga/svga_shader.h +++ b/src/gallium/drivers/svga/svga_shader.h @@ -121,15 +121,18 @@ struct svga_compile_key /* any shader type */ int8_t generic_remap_table[MAX_GENERIC_VARYING]; unsigned num_textures:8; + unsigned num_samplers:8; unsigned num_unnormalized_coords:8; unsigned clip_plane_enable:PIPE_MAX_CLIP_PLANES; unsigned last_vertex_stage:1; unsigned clamp_vertex_color:1; + unsigned sampler_state_mapping:1; /* Set if use sampler state mapping */ unsigned sprite_origin_lower_left:1; uint16_t sprite_coord_enable; struct { unsigned compare_mode:1; unsigned compare_func:3; + unsigned compare_in_shader:1; unsigned unnormalized:1; unsigned texel_bias:1; unsigned width_height_idx:5; /**< texture unit */ @@ -142,10 +145,25 @@ struct svga_compile_key unsigned target:4; unsigned sampler_return_type:4; unsigned sampler_view:1; + unsigned sampler_index:5; } tex[PIPE_MAX_SAMPLERS]; - /* Note: svga_compile_keys_equal() depends on the variable-size - * tex[] array being at the end of this structure. - */ + + unsigned uav_splice_index:4; /* starting uav index */ + unsigned srv_raw_buf_index:8; /* start index for srv raw buffers */ + unsigned image_size_used:1; + + uint16_t raw_buffers; /* bitmask of raw buffers */ + + struct { + enum tgsi_return_type return_type; + enum pipe_texture_target resource_target; + unsigned is_array:1; + unsigned is_single_layer:1; + unsigned uav_index:7; + } images[PIPE_MAX_SHADER_IMAGES]; + + uint16_t shader_buf_uav_index[PIPE_MAX_SHADER_BUFFERS]; + uint16_t atomic_buf_uav_index[PIPE_MAX_HW_ATOMIC_BUFFERS]; }; /* A key for a variant of token string of a shader */ @@ -223,7 +241,8 @@ struct svga_fs_variant unsigned fs_shadow_compare_units; /** For FS-based polygon stipple */ - unsigned pstipple_sampler_unit; + unsigned pstipple_sampler_unit:8; + unsigned pstipple_sampler_state_index:8; }; @@ -368,8 +387,7 @@ static inline boolean svga_compile_keys_equal(const struct svga_compile_key *a, const struct svga_compile_key *b) { - unsigned key_size = - (const char *) &a->tex[a->num_textures] - (const char *) a; + unsigned key_size = sizeof(*a); return memcmp(a, b, key_size) == 0; } diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c index dd424fcb55f..2cba50def28 100644 --- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c +++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c @@ -66,7 +66,6 @@ (VGPU10_MAX_IMMEDIATE_CONSTANT_BUFFER_ELEMENT_COUNT/4) #define MAX_TEMP_ARRAYS 64 /* Enough? */ - /** * Clipping is complicated. 
There's four different cases which we * handle during VS/GS shader translation: @@ -181,6 +180,18 @@ map_tgsi_semantic_to_sgn_name(enum tgsi_semantic name) return tgsi_semantic_to_sgn_name[name]; } +enum reemit_mode { + REEMIT_FALSE = 0, + REEMIT_TRUE = 1, + REEMIT_IN_PROGRESS = 2 +}; + +struct svga_raw_buf_tmp { + bool indirect; + unsigned buffer_index:8; + unsigned element_index:8; + unsigned element_rel:8; +}; struct svga_shader_emitter_v10 { @@ -193,18 +204,21 @@ struct svga_shader_emitter_v10 struct svga_compile_key key; struct tgsi_shader_info info; unsigned unit; - unsigned version; /**< Either 40 or 41 at this time */ + unsigned version; /**< Either 40, 41, 50 or 51 at this time */ unsigned cur_tgsi_token; /**< current tgsi token position */ unsigned inst_start_token; boolean discard_instruction; /**< throw away current instruction? */ boolean reemit_instruction; /**< reemit current instruction */ + boolean reemit_tgsi_instruction; /**< reemit current tgsi instruction */ boolean skip_instruction; /**< skip current instruction */ + boolean use_sampler_state_mapping; /* use sampler state mapping */ + enum reemit_mode reemit_rawbuf_instruction; union tgsi_immediate_data immediates[MAX_IMMEDIATE_COUNT][4]; double (*immediates_dbl)[2]; unsigned num_immediates; /**< Number of immediates emitted */ - unsigned common_immediate_pos[10]; /**< literals for common immediates */ + unsigned common_immediate_pos[20]; /**< literals for common immediates */ unsigned num_common_immediates; boolean immediates_emitted; @@ -235,12 +249,36 @@ struct svga_shader_emitter_v10 */ unsigned num_shader_consts[SVGA_MAX_CONST_BUFS]; + /* Raw constant buffers */ + unsigned raw_buf_srv_start_index; /* starting srv index for raw buffers */ + unsigned raw_bufs; /* raw buffers bitmask */ + unsigned raw_buf_tmp_index; /* starting temp index for raw buffers */ + unsigned raw_buf_cur_tmp_index; /* current temp index for raw buffers */ + struct svga_raw_buf_tmp raw_buf_tmp[3]; /* temporaries for raw buf source */ + /* Samplers */ unsigned num_samplers; boolean sampler_view[PIPE_MAX_SAMPLERS]; /**< True if sampler view exists*/ ubyte sampler_target[PIPE_MAX_SAMPLERS]; /**< TGSI_TEXTURE_x */ ubyte sampler_return_type[PIPE_MAX_SAMPLERS]; /**< TGSI_RETURN_TYPE_x */ + /* Images */ + unsigned num_images; + unsigned image_mask; + struct tgsi_declaration_image image[PIPE_MAX_SHADER_IMAGES]; + unsigned image_size_index; /* starting index to cbuf for image size */ + + /* Shader buffers */ + unsigned num_shader_bufs; + + /* HW atomic buffers */ + unsigned num_atomic_bufs; + unsigned atomic_bufs_mask; + unsigned max_atomic_counter_index; + VGPU10_OPCODE_TYPE cur_atomic_opcode; /* current atomic opcode */ + + boolean uav_declared; /* True if uav is declared */ + /* Index Range declaration */ struct { unsigned start_index; @@ -274,6 +312,11 @@ struct svga_shader_emitter_v10 unsigned have_prescale:1; } vposition; + /* Shader limits */ + unsigned max_vs_inputs; + unsigned max_vs_outputs; + unsigned max_gs_inputs; + /* For vertex shaders only */ struct { /* viewport constant */ @@ -299,6 +342,7 @@ struct svga_shader_emitter_v10 unsigned face_tmp_index; /**< temp face reg converted to -1 / +1 */ unsigned pstipple_sampler_unit; + unsigned pstipple_sampler_state_index; unsigned fragcoord_input_index; /**< real fragment position input reg */ unsigned fragcoord_tmp_index; /**< 1/w modified position temp reg */ @@ -311,12 +355,11 @@ struct svga_shader_emitter_v10 /** TGSI index of sample mask input sys value */ unsigned 
sample_mask_in_sys_index; - /** Which texture units are doing shadow comparison in the FS code */ - unsigned shadow_compare_units; - /* layer */ unsigned layer_input_index; /**< TGSI index of layer */ unsigned layer_imm_index; /**< immediate for default layer 0 */ + + boolean forceEarlyDepthStencil; /**< true if Early Depth stencil test is enabled */ } fs; /* For geometry shaders only */ @@ -383,6 +426,19 @@ struct svga_shader_emitter_v10 } outer; } tes; + struct { + unsigned block_width; /* thread group size in x dimension */ + unsigned block_height; /* thread group size in y dimension */ + unsigned block_depth; /* thread group size in z dimension */ + unsigned thread_id_index; /* thread id tgsi index */ + unsigned block_id_index; /* block id tgsi index */ + bool shared_memory_declared; /* set if shared memory is declared */ + struct { + unsigned tgsi_index; /* grid size tgsi index */ + unsigned imm_index; /* grid size imm index */ + } grid_size; + } cs; + /* For vertex or geometry shaders */ enum clipping_mode clip_mode; unsigned clip_dist_out_index; /**< clip distance output register index */ @@ -410,6 +466,9 @@ struct svga_shader_emitter_v10 /* For all shaders: const reg index for texture buffer size */ unsigned texture_buffer_size_index[PIPE_MAX_SAMPLERS]; + /** Which texture units are doing shadow comparison in the shader code */ + unsigned shadow_compare_units; + /* VS/TCS/TES/GS/FS Linkage info */ struct shader_linkage linkage; struct tgsi_shader_info *prevShaderInfo; @@ -435,6 +494,9 @@ static boolean emit_sampler_declarations(struct svga_shader_emitter_v10 *emit); static boolean emit_resource_declarations(struct svga_shader_emitter_v10 *emit); static boolean emit_vgpu10_immediates_block(struct svga_shader_emitter_v10 *emit); static boolean emit_index_range_declaration(struct svga_shader_emitter_v10 *emit); +static void emit_image_declarations(struct svga_shader_emitter_v10 *emit); +static void emit_shader_buf_declarations(struct svga_shader_emitter_v10 *emit); +static void emit_atomic_buf_declarations(struct svga_shader_emitter_v10 *emit); static void emit_temp_prescale_instructions(struct svga_shader_emitter_v10 *emit); static boolean @@ -459,6 +521,11 @@ emit_input_declaration(struct svga_shader_emitter_v10 *emit, boolean addSignature, SVGA3dDXSignatureSemanticName sgnName); +static boolean +emit_rawbuf_instruction(struct svga_shader_emitter_v10 *emit, + unsigned inst_number, + const struct tgsi_full_instruction *inst); + static void create_temp_array(struct svga_shader_emitter_v10 *emit, unsigned arrayID, unsigned first, unsigned count, @@ -602,9 +669,9 @@ check_register_index(struct svga_shader_emitter_v10 *emit, case VGPU10_OPCODE_DCL_INPUT_PS_SGV: case VGPU10_OPCODE_DCL_INPUT_PS_SIV: if ((emit->unit == PIPE_SHADER_VERTEX && - index >= VGPU10_MAX_VS_INPUTS) || + index >= emit->max_vs_inputs) || (emit->unit == PIPE_SHADER_GEOMETRY && - index >= VGPU10_MAX_GS_INPUTS) || + index >= emit->max_gs_inputs) || (emit->unit == PIPE_SHADER_FRAGMENT && index >= VGPU10_MAX_FS_INPUTS) || (emit->unit == PIPE_SHADER_TESS_CTRL && @@ -625,7 +692,7 @@ check_register_index(struct svga_shader_emitter_v10 *emit, * Index will never be >= index >= VGPU11_MAX_HS_OUTPUTS + 2 */ if ((emit->unit == PIPE_SHADER_VERTEX && - index >= VGPU10_MAX_VS_OUTPUTS) || + index >= emit->max_vs_outputs) || (emit->unit == PIPE_SHADER_GEOMETRY && index >= VGPU10_MAX_GS_OUTPUTS) || (emit->unit == PIPE_SHADER_FRAGMENT && @@ -655,6 +722,13 @@ check_register_index(struct svga_shader_emitter_v10 *emit, } break; case 
VGPU10_OPERAND_TYPE_OUTPUT_COVERAGE_MASK: + case VGPU10_OPERAND_TYPE_INPUT_GS_INSTANCE_ID: + case VGPU10_OPERAND_TYPE_OUTPUT_CONTROL_POINT_ID: + case VGPU10_OPERAND_TYPE_INPUT_CONTROL_POINT: + case VGPU10_OPERAND_TYPE_INPUT_DOMAIN_POINT: + case VGPU10_OPERAND_TYPE_INPUT_PATCH_CONSTANT: + case VGPU10_OPERAND_TYPE_INPUT_THREAD_GROUP_ID: + case VGPU10_OPERAND_TYPE_INPUT_THREAD_ID_IN_GROUP: /* nothing */ break; default: @@ -960,6 +1034,10 @@ translate_opcode(enum tgsi_opcode opcode) return VGPU10_OPCODE_EVAL_SAMPLE_INDEX; case TGSI_OPCODE_BARRIER: return VGPU10_OPCODE_SYNC; + case TGSI_OPCODE_DFMA: + return VGPU10_OPCODE_DFMA; + case TGSI_OPCODE_FMA: + return VGPU10_OPCODE_MAD; /* DX11.1 Opcodes */ case TGSI_OPCODE_DDIV: @@ -1474,7 +1552,7 @@ emit_src_register(struct svga_shader_emitter_v10 *emit, { enum tgsi_file_type file = reg->Register.File; unsigned index = reg->Register.Index; - const boolean indirect = reg->Register.Indirect; + boolean indirect = reg->Register.Indirect; unsigned tempArrayId = get_temp_array_id(emit, file, index); boolean index2d = (reg->Register.Dimension || tempArrayId > 0 || @@ -1528,11 +1606,8 @@ emit_src_register(struct svga_shader_emitter_v10 *emit, /* Emitted as vCoverage0.x */ /* According to GLSL spec, the gl_SampleMaskIn array has ceil(s / 32) * elements where s is the maximum number of color samples supported - * by the implementation. With current implementation, we should not - * have more than one element. So assert if Index != 0 + * by the implementation. */ - assert((!reg->Register.Indirect && reg->Register.Index == 0) || - reg->Register.Indirect); operand0.value = 0; operand0.operandType = VGPU10_OPERAND_TYPE_INPUT_COVERAGE_MASK; operand0.indexDimension = VGPU10_OPERAND_INDEX_0D; @@ -1749,12 +1824,93 @@ emit_src_register(struct svga_shader_emitter_v10 *emit, } } } + else if (emit->unit == PIPE_SHADER_COMPUTE) { + if (file == TGSI_FILE_SYSTEM_VALUE) { + if (index == emit->cs.thread_id_index) { + operand0.numComponents = VGPU10_OPERAND_4_COMPONENT; + operand0.operandType = VGPU10_OPERAND_TYPE_INPUT_THREAD_ID_IN_GROUP; + index = 0; + } else if (index == emit->cs.block_id_index) { + operand0.value = 0; + operand0.numComponents = VGPU10_OPERAND_4_COMPONENT; + operand0.operandType = VGPU10_OPERAND_TYPE_INPUT_THREAD_GROUP_ID; + operand0.indexDimension = VGPU10_OPERAND_INDEX_0D; + operand0.selectionMode = VGPU10_OPERAND_4_COMPONENT_SWIZZLE_MODE; + operand0.swizzleX = swizzleX; + operand0.swizzleY = swizzleY; + operand0.swizzleZ = swizzleZ; + operand0.swizzleW = swizzleW; + emit_dword(emit, operand0.value); + return; + } else if (index == emit->cs.grid_size.tgsi_index) { + file = TGSI_FILE_IMMEDIATE; + index = emit->cs.grid_size.imm_index; + } + } + } if (file == TGSI_FILE_ADDRESS) { index = emit->address_reg_index[index]; file = TGSI_FILE_TEMPORARY; } + if (file == TGSI_FILE_CONSTANT) { + /** + * If this constant buffer is to be bound as srv raw buffer, + * then we have to load the constant to a temp first before + * it can be used as a source in the instruction. + * This is accomplished in two passes. The first pass is to + * identify if there is any constbuf to rawbuf translation. + * If there isn't, emit the instruction as usual. + * If there is, then we save the constant buffer reference info, + * and then instead of emitting the instruction at the end + * of the instruction, it will trigger a second pass of parsing + * this instruction. Before it starts the parsing, it will + * load the referenced raw buffer elements to temporaries. 
+       * Then it will emit the instruction, replacing the
+       * constant buffer references with the corresponding temporaries.
+       */
+      if (emit->raw_bufs & (1 << index2)) {
+         if (emit->reemit_rawbuf_instruction != REEMIT_IN_PROGRESS) {
+            unsigned tmpIdx = emit->raw_buf_cur_tmp_index;
+
+            emit->raw_buf_tmp[tmpIdx].buffer_index = index2;
+
+            /* Save whether the element index is indirect indexing */
+            emit->raw_buf_tmp[tmpIdx].indirect = indirect;
+
+            /* If it is indirect index, save the temporary
+             * address index, otherwise, save the immediate index.
+             */
+            if (indirect) {
+               emit->raw_buf_tmp[tmpIdx].element_index =
+                  emit->address_reg_index[reg->Indirect.Index];
+               emit->raw_buf_tmp[tmpIdx].element_rel =
+                  reg->Register.Index;
+            }
+            else {
+               emit->raw_buf_tmp[tmpIdx].element_index = index;
+               emit->raw_buf_tmp[tmpIdx].element_rel = 0;
+            }
+
+            emit->raw_buf_cur_tmp_index++;
+            emit->reemit_rawbuf_instruction = REEMIT_TRUE;
+            emit->discard_instruction = TRUE;
+            emit->reemit_tgsi_instruction = TRUE;
+         }
+         else {
+            /* In the reemitting process, replace the constant buffer
+             * reference with a temporary.
+             */
+            file = TGSI_FILE_TEMPORARY;
+            index = emit->raw_buf_cur_tmp_index + emit->raw_buf_tmp_index;
+            index2d = FALSE;
+            indirect = FALSE;
+            emit->raw_buf_cur_tmp_index++;
+         }
+      }
+   }
+
   if (file == TGSI_FILE_TEMPORARY) {
      if (need_temp_reg_initialization(emit, index)) {
         emit->initialize_temp_index = index;
@@ -1801,6 +1957,8 @@ emit_src_register(struct svga_shader_emitter_v10 *emit,
      }
   }
+   check_register_index(emit, operand0.operandType, index);
+
   /* Emit the operand tokens */
   emit_dword(emit, operand0.value);
   if (operand0.extended)
@@ -1834,6 +1992,7 @@ emit_src_register(struct svga_shader_emitter_v10 *emit,
      emit_dword(emit, remap_temp_index(emit, file, index));
      if (indirect) {
+        assert(operand0.operandType != VGPU10_OPERAND_TYPE_TEMP);
         emit_indirect_register(emit, reg->Indirect.Index);
      }
   }
@@ -1873,9 +2032,15 @@ emit_resource_register(struct svga_shader_emitter_v10 *emit,
 */
static void
emit_sampler_register(struct svga_shader_emitter_v10 *emit,
-                      unsigned sampler_number)
+                      unsigned unit)
{
   VGPU10OperandToken0 operand0;
+   unsigned sampler_number;
+
+   sampler_number = emit->key.tex[unit].sampler_index;
+
+   if ((emit->shadow_compare_units & (1 << unit)) && emit->use_sampler_state_mapping)
+      sampler_number++;
   check_register_index(emit, VGPU10_OPERAND_TYPE_SAMPLER, sampler_number);
@@ -2861,6 +3026,22 @@ emit_vgpu10_property(struct svga_shader_emitter_v10 *emit,
      emit->tes.point_mode = prop->u[0].Data;
      break;
+   case TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH:
+      emit->cs.block_width = prop->u[0].Data;
+      break;
+
+   case TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT:
+      emit->cs.block_height = prop->u[0].Data;
+      break;
+
+   case TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH:
+      emit->cs.block_depth = prop->u[0].Data;
+      break;
+
+   case TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL:
+      emit->fs.forceEarlyDepthStencil = TRUE;
+      break;
+
   default:
      debug_printf("Unexpected TGSI property %s\n",
                   tgsi_property_names[prop->Property.PropertyName]);
@@ -2986,7 +3167,10 @@ alloc_common_immediates(struct svga_shader_emitter_v10 *emit)
   }
   emit->common_immediate_pos[n++] =
-      alloc_immediate_int4(emit, 0, 1, 0, -1);
+      alloc_immediate_int4(emit, 0, 1, 2, -1);
+
+   emit->common_immediate_pos[n++] =
+      alloc_immediate_int4(emit, 3, 4, 5, 6);
   if (emit->info.opcode_count[TGSI_OPCODE_IMSB] > 0 ||
       emit->info.opcode_count[TGSI_OPCODE_UMSB] > 0) {
@@ -3059,6 +3243,25 @@ alloc_common_immediates(struct svga_shader_emitter_v10 *emit)
      }
   }
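
The two-pass constbuf-to-rawbuf scheme described above amounts to: pass one rewrites constant-buffer sources into scratch temporaries and requests a re-emit; pass two emits the instruction with the rewritten sources. A compressed sketch over an invented instruction record (the real code operates on TGSI tokens and loads the raw buffer elements before the re-emit):

    #include <stdbool.h>

    enum src_file { FILE_CONSTANT, FILE_TEMPORARY };

    struct toy_src  { enum src_file file; unsigned buf, elem; };
    struct toy_inst { unsigned num_srcs; struct toy_src src[3]; };

    /* Scan the sources; for each constant-buffer reference whose buffer
     * is flagged raw, redirect it to the next scratch temporary.
     * Returns true if the instruction must be re-emitted (pass 2).
     */
    static bool
    lower_raw_buf_srcs(struct toy_inst *inst, unsigned raw_bufs_mask,
                       unsigned first_tmp)
    {
       unsigned next_tmp = first_tmp;
       bool reemit = false;

       for (unsigned i = 0; i < inst->num_srcs; i++) {
          struct toy_src *s = &inst->src[i];
          if (s->file == FILE_CONSTANT && (raw_bufs_mask & (1u << s->buf))) {
             /* the real emitter would load here, roughly:
              *   load_raw tmp[next_tmp], byte_offset(s->elem * 16), srv(s->buf)
              * (each constant element is one float4, i.e. 16 bytes)
              */
             s->file = FILE_TEMPORARY;
             s->buf = 0;
             s->elem = next_tmp++;
             reemit = true;
          }
       }
       return reemit;   /* if true, pass 2 re-emits the rewritten sources */
    }
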
+   /** TODO: allocate immediates for all possible
+    * element byte offsets?
+    */
+   if (emit->raw_bufs) {
+      unsigned i;
+      for (i = 7; i < 12; i+=4) {
+         emit->common_immediate_pos[n++] =
+            alloc_immediate_int4(emit, i, (i+1), (i+2), (i+3));
+      }
+   }
+
+   if (emit->info.indirect_files &
+       (1 << TGSI_FILE_IMAGE | 1 << TGSI_FILE_BUFFER)) {
+      unsigned i;
+      for (i = 7; i < 8; i+=4) {
+         emit->common_immediate_pos[n++] =
+            alloc_immediate_int4(emit, i, (i+1), (i+2), (i+3));
+      }
+   }
+
   assert(n <= ARRAY_SIZE(emit->common_immediate_pos));
   emit->num_common_immediates = n;
}
@@ -3139,6 +3342,8 @@ emit_hull_shader_declarations(struct svga_shader_emitter_v10 *emit)
   emit_dword(emit, opcode0.value);
   end_emit_instruction(emit);
+   alloc_common_immediates(emit);
+
   /* Declare constant registers */
   emit_constant_declaration(emit);
@@ -3146,7 +3351,14 @@ emit_hull_shader_declarations(struct svga_shader_emitter_v10 *emit)
   emit_sampler_declarations(emit);
   emit_resource_declarations(emit);
-   alloc_common_immediates(emit);
+   /* Declare images */
+   emit_image_declarations(emit);
+
+   /* Declare shader buffers */
+   emit_shader_buf_declarations(emit);
+
+   /* Declare atomic buffers */
+   emit_atomic_buf_declarations(emit);
   int nVertices = emit->key.tcs.vertices_per_patch;
   emit->tcs.imm_index =
@@ -3387,13 +3599,17 @@ emit_hull_shader_patch_constant_phase(struct svga_shader_emitter_v10 *emit,
   /* Usually this applies to TCS only. If shader is reading output of
    * patch constant in fork phase, we should reemit all instructions
-    * which are writting into ouput of patch constant in fork phase
+    * which are writing into output of patch constant in fork phase
    * to store results into temporaries.
    */
+   assert(!(emit->reemit_instruction && emit->reemit_rawbuf_instruction));
   if (emit->reemit_instruction) {
      assert(emit->unit == PIPE_SHADER_TESS_CTRL);
      ret = emit_vgpu10_instruction(emit, inst_number,
                                    &parse->FullToken.FullInstruction);
+   } else if (emit->reemit_rawbuf_instruction) {
+      ret = emit_rawbuf_instruction(emit, inst_number,
+                                    &parse->FullToken.FullInstruction);
   }
   if (!ret)
@@ -3404,6 +3620,25 @@
}
+/**
+ * Emit the thread group declaration for the compute shader.
+ */
+static void
+emit_compute_shader_declarations(struct svga_shader_emitter_v10 *emit)
+{
+   VGPU10OpcodeToken0 opcode0;
+
+   opcode0.value = 0;
+   opcode0.opcodeType = VGPU10_OPCODE_DCL_THREAD_GROUP;
+   begin_emit_instruction(emit);
+   emit_dword(emit, opcode0.value);
+   emit_dword(emit, emit->cs.block_width);
+   emit_dword(emit, emit->cs.block_height);
+   emit_dword(emit, emit->cs.block_depth);
+   end_emit_instruction(emit);
+}
+
+
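
Given the int4 packing that alloc_common_immediates() uses, (0,1,2,-1), then (3,4,5,6), then (7,8,9,10) and (11,12,13,14) when raw buffers are in use, locating a small integer k reduces to a slot/component computation. A sketch under that assumed layout; the real emitter records the allocated positions in common_immediate_pos[] instead of recomputing them:

    /* Locate a small integer k inside the packed int4 immediates.
     * Returns the relative immediate slot and writes the component
     * (0=x .. 3=w), or -1 if k is not a common immediate.
     */
    static int
    find_common_int(unsigned k, unsigned *comp)
    {
       if (k <= 2) {            /* packed as (0, 1, 2, -1) */
          *comp = k;
          return 0;
       }
       if (k <= 14) {           /* packed four to an int4 starting at 3 */
          *comp = (k - 3) % 4;
          return 1 + (k - 3) / 4;
       }
       return -1;
    }
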
 /**
  * Emit index range declaration.
  */
@@ -4557,6 +4792,41 @@ emit_system_value_declaration(struct svga_shader_emitter_v10 *emit,
                              map_tgsi_semantic_to_sgn_name(semantic_name));
      }
      break;
+   case TGSI_SEMANTIC_THREAD_ID:
+      assert(emit->unit >= PIPE_SHADER_COMPUTE);
+      assert(emit->version >= 50);
+      emit->cs.thread_id_index = index;
+      emit_input_declaration(emit, VGPU10_OPCODE_DCL_INPUT,
+                             VGPU10_OPERAND_TYPE_INPUT_THREAD_ID_IN_GROUP,
+                             VGPU10_OPERAND_INDEX_0D,
+                             index, 1,
+                             VGPU10_NAME_UNDEFINED,
+                             VGPU10_OPERAND_4_COMPONENT,
+                             VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
+                             VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
+                             VGPU10_INTERPOLATION_UNDEFINED, TRUE,
+                             map_tgsi_semantic_to_sgn_name(semantic_name));
+      break;
+   case TGSI_SEMANTIC_BLOCK_ID:
+      assert(emit->unit >= PIPE_SHADER_COMPUTE);
+      assert(emit->version >= 50);
+      emit->cs.block_id_index = index;
+      emit_input_declaration(emit, VGPU10_OPCODE_DCL_INPUT,
+                             VGPU10_OPERAND_TYPE_INPUT_THREAD_GROUP_ID,
+                             VGPU10_OPERAND_INDEX_0D,
+                             index, 1,
+                             VGPU10_NAME_UNDEFINED,
+                             VGPU10_OPERAND_4_COMPONENT,
+                             VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
+                             VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
+                             VGPU10_INTERPOLATION_UNDEFINED, TRUE,
+                             map_tgsi_semantic_to_sgn_name(semantic_name));
+      break;
+   case TGSI_SEMANTIC_GRID_SIZE:
+      assert(emit->unit == PIPE_SHADER_COMPUTE);
+      assert(emit->version >= 50);
+      emit->cs.grid_size.tgsi_index = index;
+      break;
   default:
      debug_printf("unexpected system value semantic index %u / %s\n",
                   semantic_name, tgsi_semantic_names[semantic_name]);
@@ -4623,6 +4893,7 @@ emit_vgpu10_declaration(struct svga_shader_emitter_v10 *emit,
                     " but [%u] is the limit.\n",
                     num_consts,
                     VGPU10_MAX_CONSTANT_BUFFER_ELEMENT_COUNT);
+         emit->register_overflow = TRUE;
      }
      /* The linker doesn't enforce the max UBO size so we clamp here */
      emit->num_shader_consts[constbuf] =
@@ -4669,6 +4940,43 @@ emit_vgpu10_declaration(struct svga_shader_emitter_v10 *emit,
      }
      return TRUE;
+   case TGSI_FILE_IMAGE:
+      {
+         unsigned unit = decl->Range.First;
+         assert(decl->Range.First == decl->Range.Last);
+         assert(unit < PIPE_MAX_SHADER_IMAGES);
+         emit->image[unit] = decl->Image;
+         emit->image_mask |= 1 << unit;
+         emit->num_images++;
+      }
+      return TRUE;
+
+   case TGSI_FILE_HW_ATOMIC:
+      /* Declare the atomic buffer if it is not already declared. */
+      if (!(emit->atomic_bufs_mask & (1 << decl->Dim.Index2D))) {
+         emit->num_atomic_bufs++;
+         emit->atomic_bufs_mask |= (1 << decl->Dim.Index2D);
+      }
+
+      /* Remember the maximum atomic counter index encountered */
+      emit->max_atomic_counter_index =
+         MAX2(emit->max_atomic_counter_index, decl->Range.Last);
+      return TRUE;
+
+   case TGSI_FILE_MEMORY:
+      /* Record that shared memory has been used. */
+      if (emit->unit == PIPE_SHADER_COMPUTE &&
+          decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED) {
+         emit->cs.shared_memory_declared = TRUE;
+      }
+
+      return TRUE;
+
+   case TGSI_FILE_BUFFER:
+      assert(emit->version >= 50);
+      emit->num_shader_bufs++;
+      return TRUE;
+
   default:
      assert(!"Unexpected type of declaration");
      return FALSE;
@@ -4676,7 +4984,6 @@
   }
}
-
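
The three compute system values handled above follow a fixed pattern: two of them map to dedicated VGPU10 input operand types, while the grid size has no hardware register and is satisfied from an immediate instead. A table-style sketch of that mapping, using toy enums in place of the VGPU10_OPERAND_TYPE_* constants used above:

    enum toy_sysval { SV_THREAD_ID, SV_BLOCK_ID, SV_GRID_SIZE };

    enum toy_operand {
       OP_THREAD_ID_IN_GROUP,  /* VGPU10_OPERAND_TYPE_INPUT_THREAD_ID_IN_GROUP */
       OP_THREAD_GROUP_ID,     /* VGPU10_OPERAND_TYPE_INPUT_THREAD_GROUP_ID */
       OP_IMMEDIATE            /* no HW register: the grid size is taken
                                * from an immediate instead */
    };

    static enum toy_operand
    map_cs_sysval(enum toy_sysval sv)
    {
       switch (sv) {
       case SV_THREAD_ID: return OP_THREAD_ID_IN_GROUP;
       case SV_BLOCK_ID:  return OP_THREAD_GROUP_ID;
       case SV_GRID_SIZE: return OP_IMMEDIATE;
       }
       return OP_IMMEDIATE;
    }
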
 /**
  * Emit input declarations for fragment shader.
  */
@@ -4928,27 +5235,6 @@ emit_tcs_input_declarations(struct svga_shader_emitter_v10 *emit)
   }
   if (emit->tcs.control_point_phase) {
-      if (emit->tcs.control_point_input_index == INVALID_INDEX) {
-
-         /* Add input control point declaration if it does not exist */
-         if ((indicesMask & (1 << emit->linkage.position_index)) == 0) {
-            emit->linkage.input_map[emit->linkage.num_inputs] =
-               emit->linkage.position_index;
-            emit->tcs.control_point_input_index = emit->linkage.num_inputs++;
-
-            emit_input_declaration(emit, VGPU10_OPCODE_DCL_INPUT,
-                                   VGPU10_OPERAND_TYPE_INPUT,
-                                   VGPU10_OPERAND_INDEX_2D,
-                                   emit->linkage.position_index,
-                                   emit->key.tcs.vertices_per_patch,
-                                   VGPU10_NAME_UNDEFINED,
-                                   VGPU10_OPERAND_4_COMPONENT,
-                                   VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
-                                   VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
-                                   VGPU10_INTERPOLATION_UNDEFINED, TRUE,
-                                   SVGADX_SIGNATURE_SEMANTIC_NAME_POSITION);
-         }
-      }
      /* Also add an address register for the indirection to the
       * input control points
@@ -5447,6 +5733,17 @@ emit_temporaries_declaration(struct svga_shader_emitter_v10 *emit)
      emit->tcs.invocation_id_tmp_index = total_temps++;
   }
+   if (emit->raw_bufs) {
+      /**
+       * Add 3 more temporaries if we need to translate a constant buffer
+       * to an srv raw buffer, since we need to load the value to a
+       * temporary before it can be used as a source. There can be three
+       * source registers in an instruction.
+       */
+      emit->raw_buf_tmp_index = total_temps;
+      total_temps+=3;
+   }
+
   for (i = 0; i < emit->num_address_regs; i++) {
      emit->address_reg_index[i] = total_temps++;
   }
@@ -5620,6 +5917,14 @@ emit_constant_declaration(struct svga_shader_emitter_v10 *emit)
   }
   if (total_consts > 0) {
+      if (total_consts > VGPU10_MAX_CONSTANT_BUFFER_ELEMENT_COUNT) {
+         debug_printf("Warning: Too many constants [%u] declared in constant"
+                      " buffer 0. 
%u is the limit.\n", + total_consts, + VGPU10_MAX_CONSTANT_BUFFER_ELEMENT_COUNT); + total_consts = VGPU10_MAX_CONSTANT_BUFFER_ELEMENT_COUNT; + emit->register_overflow = TRUE; + } begin_emit_instruction(emit); emit_dword(emit, opcode0.value); emit_dword(emit, operand0.value); @@ -5629,14 +5934,41 @@ emit_constant_declaration(struct svga_shader_emitter_v10 *emit) } /* Declare remaining constant buffers (UBOs) */ + for (i = 1; i < ARRAY_SIZE(emit->num_shader_consts); i++) { if (emit->num_shader_consts[i] > 0) { - begin_emit_instruction(emit); - emit_dword(emit, opcode0.value); - emit_dword(emit, operand0.value); - emit_dword(emit, i); /* which const buffer slot */ - emit_dword(emit, emit->num_shader_consts[i]); - end_emit_instruction(emit); + if (emit->raw_bufs & (1 << i)) { + /* UBO declared as srv raw buffer */ + + VGPU10OpcodeToken0 opcode1; + VGPU10OperandToken0 operand1; + + opcode1.value = 0; + opcode1.opcodeType = VGPU10_OPCODE_DCL_RESOURCE_RAW; + opcode1.resourceDimension = VGPU10_RESOURCE_DIMENSION_UNKNOWN; + + operand1.value = 0; + operand1.numComponents = VGPU10_OPERAND_0_COMPONENT; + operand1.operandType = VGPU10_OPERAND_TYPE_RESOURCE; + operand1.indexDimension = VGPU10_OPERAND_INDEX_1D; + operand1.index0Representation = VGPU10_OPERAND_INDEX_IMMEDIATE32; + + begin_emit_instruction(emit); + emit_dword(emit, opcode1.value); + emit_dword(emit, operand1.value); + emit_dword(emit, i + emit->raw_buf_srv_start_index); + end_emit_instruction(emit); + } + else { + + /* UBO declared as const buffer */ + begin_emit_instruction(emit); + emit_dword(emit, opcode0.value); + emit_dword(emit, operand0.value); + emit_dword(emit, i); /* which const buffer slot */ + emit_dword(emit, emit->num_shader_consts[i]); + end_emit_instruction(emit); + } } } @@ -5652,7 +5984,8 @@ emit_sampler_declarations(struct svga_shader_emitter_v10 *emit) { unsigned i; - for (i = 0; i < emit->num_samplers; i++) { + for (i = 0; i < emit->key.num_samplers; i++) { + VGPU10OpcodeToken0 opcode0; VGPU10OperandToken0 operand0; @@ -5683,7 +6016,8 @@ emit_sampler_declarations(struct svga_shader_emitter_v10 *emit) static unsigned pipe_texture_to_resource_dimension(enum tgsi_texture_type target, unsigned num_samples, - boolean is_array) + boolean is_array, + boolean is_uav) { switch (target) { case PIPE_BUFFER: @@ -5710,8 +6044,9 @@ pipe_texture_to_resource_dimension(enum tgsi_texture_type target, else return VGPU10_RESOURCE_DIMENSION_TEXTURE2D; case PIPE_TEXTURE_CUBE_ARRAY: - return is_array ? VGPU10_RESOURCE_DIMENSION_TEXTURECUBEARRAY : - VGPU10_RESOURCE_DIMENSION_TEXTURECUBE; + return is_uav ? VGPU10_RESOURCE_DIMENSION_TEXTURE2DARRAY : + (is_array ? VGPU10_RESOURCE_DIMENSION_TEXTURECUBEARRAY : + VGPU10_RESOURCE_DIMENSION_TEXTURECUBE); default: assert(!"Unexpected resource type"); return VGPU10_RESOURCE_DIMENSION_TEXTURE2D; @@ -5725,7 +6060,8 @@ pipe_texture_to_resource_dimension(enum tgsi_texture_type target, static unsigned tgsi_texture_to_resource_dimension(enum tgsi_texture_type target, unsigned num_samples, - boolean is_array) + boolean is_array, + boolean is_uav) { if (target == TGSI_TEXTURE_2D_MSAA && num_samples < 2) { target = TGSI_TEXTURE_2D; @@ -5746,7 +6082,8 @@ tgsi_texture_to_resource_dimension(enum tgsi_texture_type target, return VGPU10_RESOURCE_DIMENSION_TEXTURE3D; case TGSI_TEXTURE_CUBE: case TGSI_TEXTURE_SHADOWCUBE: - return VGPU10_RESOURCE_DIMENSION_TEXTURECUBE; + return is_uav ? 
VGPU10_RESOURCE_DIMENSION_TEXTURE2DARRAY : + VGPU10_RESOURCE_DIMENSION_TEXTURECUBE; case TGSI_TEXTURE_SHADOW1D: return VGPU10_RESOURCE_DIMENSION_TEXTURE1D; case TGSI_TEXTURE_SHADOW2D: @@ -5766,6 +6103,9 @@ tgsi_texture_to_resource_dimension(enum tgsi_texture_type target, return is_array ? VGPU10_RESOURCE_DIMENSION_TEXTURE2DMSARRAY : VGPU10_RESOURCE_DIMENSION_TEXTURE2DMS; case TGSI_TEXTURE_CUBE_ARRAY: + return is_uav ? VGPU10_RESOURCE_DIMENSION_TEXTURE2DARRAY : + (is_array ? VGPU10_RESOURCE_DIMENSION_TEXTURECUBEARRAY : + VGPU10_RESOURCE_DIMENSION_TEXTURECUBE); case TGSI_TEXTURE_SHADOWCUBE_ARRAY: return is_array ? VGPU10_RESOURCE_DIMENSION_TEXTURECUBEARRAY : VGPU10_RESOURCE_DIMENSION_TEXTURECUBE; @@ -5811,6 +6151,9 @@ emit_resource_declarations(struct svga_shader_emitter_v10 *emit) /* Emit resource decl for each sampler */ for (i = 0; i < emit->num_samplers; i++) { + if (!(emit->info.samplers_declared & (1 << i))) + continue; + VGPU10OpcodeToken0 opcode0; VGPU10OperandToken0 operand0; VGPU10ResourceReturnTypeToken return_type; @@ -5822,13 +6165,15 @@ emit_resource_declarations(struct svga_shader_emitter_v10 *emit) opcode0.resourceDimension = tgsi_texture_to_resource_dimension(emit->sampler_target[i], emit->key.tex[i].num_samples, - emit->key.tex[i].is_array); + emit->key.tex[i].is_array, + FALSE); } else { opcode0.resourceDimension = pipe_texture_to_resource_dimension(emit->key.tex[i].target, emit->key.tex[i].num_samples, - emit->key.tex[i].is_array); + emit->key.tex[i].is_array, + FALSE); } opcode0.sampleCount = emit->key.tex[i].num_samples; operand0.value = 0; @@ -5882,6 +6227,240 @@ emit_resource_declarations(struct svga_shader_emitter_v10 *emit) return TRUE; } + +/** + * Emit instruction to declare uav for the shader image + */ +static void +emit_image_declarations(struct svga_shader_emitter_v10 *emit) +{ + unsigned i = 0; + unsigned unit = 0; + unsigned uav_mask = 0; + + /* Emit uav decl for each image */ + for (i = 0; i < emit->num_images; i++, unit++) { + + /* Find the unit index of the next declared image. + */ + while (!(emit->image_mask & (1 << unit))) { + unit++; + } + + VGPU10OpcodeToken0 opcode0; + VGPU10OperandToken0 operand0; + VGPU10ResourceReturnTypeToken return_type; + + /* If the corresponding uav for the image is already declared, + * skip this image declaration. + */ + if (uav_mask & (1 << emit->key.images[unit].uav_index)) + continue; + + opcode0.value = 0; + opcode0.opcodeType = VGPU10_OPCODE_DCL_UAV_TYPED; + opcode0.uavResourceDimension = + tgsi_texture_to_resource_dimension(emit->image[unit].Resource, + 0, emit->key.images[unit].is_array, + TRUE); + + if (emit->key.images[unit].is_single_layer && + emit->key.images[unit].resource_target == PIPE_TEXTURE_3D) { + opcode0.uavResourceDimension = VGPU10_RESOURCE_DIMENSION_TEXTURE3D; + } + + /* Declare the uav as global coherent if the shader includes memory + * barrier instructions. + */ + opcode0.globallyCoherent = + (emit->info.opcode_count[TGSI_OPCODE_MEMBAR] > 0) ? 
1 : 0;
+
+      operand0.value = 0;
+      operand0.numComponents = VGPU10_OPERAND_0_COMPONENT;
+      operand0.operandType = VGPU10_OPERAND_TYPE_UAV;
+      operand0.indexDimension = VGPU10_OPERAND_INDEX_1D;
+      operand0.index0Representation = VGPU10_OPERAND_INDEX_IMMEDIATE32;
+
+      return_type.value = 0;
+      return_type.component0 =
+      return_type.component1 =
+      return_type.component2 =
+      return_type.component3 = emit->key.images[unit].return_type + 1;
+
+      assert(emit->key.images[unit].uav_index != SVGA3D_INVALID_ID);
+      begin_emit_instruction(emit);
+      emit_dword(emit, opcode0.value);
+      emit_dword(emit, operand0.value);
+      emit_dword(emit, emit->key.images[unit].uav_index);
+      emit_dword(emit, return_type.value);
+      end_emit_instruction(emit);
+
+      /* Mark the uav as already declared */
+      uav_mask |= 1 << emit->key.images[unit].uav_index;
+   }
+
+   emit->uav_declared |= uav_mask;
+}
+
+
+/**
+ * Emit instruction to declare uav for the shader buffer
+ */
+static void
+emit_shader_buf_declarations(struct svga_shader_emitter_v10 *emit)
+{
+   unsigned i;
+   unsigned uav_mask = 0;
+
+   /* Emit uav decl for each shader buffer */
+   for (i = 0; i < emit->num_shader_bufs; i++) {
+      VGPU10OpcodeToken0 opcode0;
+      VGPU10OperandToken0 operand0;
+
+      /* If the corresponding uav for the shader buf is already declared,
+       * skip this shader buffer declaration.
+       */
+      if (uav_mask & (1 << emit->key.shader_buf_uav_index[i]))
+         continue;
+
+      opcode0.value = 0;
+      opcode0.opcodeType = VGPU10_OPCODE_DCL_UAV_RAW;
+
+      /* Declare the uav as global coherent if the shader includes memory
+       * barrier instructions.
+       */
+      opcode0.globallyCoherent =
+         (emit->info.opcode_count[TGSI_OPCODE_MEMBAR] > 0) ? 1 : 0;
+
+      operand0.value = 0;
+      operand0.numComponents = VGPU10_OPERAND_0_COMPONENT;
+      operand0.operandType = VGPU10_OPERAND_TYPE_UAV;
+      operand0.indexDimension = VGPU10_OPERAND_INDEX_1D;
+      operand0.index0Representation = VGPU10_OPERAND_INDEX_IMMEDIATE32;
+
+      assert(emit->key.shader_buf_uav_index[i] != SVGA3D_INVALID_ID);
+      begin_emit_instruction(emit);
+      emit_dword(emit, opcode0.value);
+      emit_dword(emit, operand0.value);
+      emit_dword(emit, emit->key.shader_buf_uav_index[i]);
+      end_emit_instruction(emit);
+
+      /* Mark the uav as already declared */
+      uav_mask |= 1 << emit->key.shader_buf_uav_index[i];
+   }
+
+   emit->uav_declared |= uav_mask;
+}
+
+
+/**
+ * Emit instruction to declare thread group shared memory (TGSM).
+ */
+static void
+emit_memory_declarations(struct svga_shader_emitter_v10 *emit)
+{
+   if (emit->cs.shared_memory_declared) {
+      VGPU10OpcodeToken0 opcode0;
+      VGPU10OperandToken0 operand0;
+
+      opcode0.value = 0;
+      opcode0.opcodeType = VGPU10_OPCODE_DCL_TGSM_RAW;
+
+      /* Declare the uav as global coherent if the shader includes memory
+       * barrier instructions.
+       */
+      opcode0.globallyCoherent =
+         (emit->info.opcode_count[TGSI_OPCODE_MEMBAR] > 0) ? 1 : 0;
+
+      operand0.value = 0;
+      operand0.numComponents = VGPU10_OPERAND_0_COMPONENT;
+      operand0.operandType = VGPU10_OPERAND_TYPE_THREAD_GROUP_SHARED_MEMORY;
+      operand0.indexDimension = VGPU10_OPERAND_INDEX_1D;
+      operand0.index0Representation = VGPU10_OPERAND_INDEX_IMMEDIATE32;
+
+      begin_emit_instruction(emit);
+      emit_dword(emit, opcode0.value);
+      emit_dword(emit, operand0.value);
+
+      /* In the current state tracker, a TGSI shader declares only one
+       * shared memory block.
+       * TODO: Fix the TGSI shader in the state tracker to get all shared
+       * memory declarations and then fix the following indexing. For now,
+       * the default index is 1, as per the translated TGSI shader.
+       */
+      emit_dword(emit, 1);
+      emit_dword(emit, emit->key.cs.mem_size); /* byte count */
+      end_emit_instruction(emit);
+   }
+}
+
+
+/**
+ * Emit instruction to declare uav for atomic buffers
+ */
+static void
+emit_atomic_buf_declarations(struct svga_shader_emitter_v10 *emit)
+{
+   unsigned atomic_bufs_mask = emit->atomic_bufs_mask;
+   unsigned uav_mask = 0;
+
+   /* Emit uav decl for each atomic buffer */
+   while (atomic_bufs_mask) {
+      unsigned buf_index = u_bit_scan(&atomic_bufs_mask);
+      unsigned uav_index = emit->key.atomic_buf_uav_index[buf_index];
+
+      /* If the corresponding uav for the atomic buffer is already declared,
+       * skip this atomic buffer declaration.
+       */
+      if (uav_mask & (1 << uav_index))
+         continue;
+
+      VGPU10OpcodeToken0 opcode0;
+      VGPU10OperandToken0 operand0;
+
+      assert(uav_index != SVGA3D_INVALID_ID);
+
+      opcode0.value = 0;
+      opcode0.opcodeType = VGPU10_OPCODE_DCL_UAV_RAW;
+      opcode0.uavResourceDimension = VGPU10_RESOURCE_DIMENSION_BUFFER;
+
+      /* Declare the uav as global coherent if the shader includes memory
+       * barrier instructions.
+       */
+      opcode0.globallyCoherent =
+         (emit->info.opcode_count[TGSI_OPCODE_MEMBAR] > 0) ? 1 : 0;
+      opcode0.uavHasCounter = 1;
+
+      operand0.value = 0;
+      operand0.numComponents = VGPU10_OPERAND_0_COMPONENT;
+      operand0.operandType = VGPU10_OPERAND_TYPE_UAV;
+      operand0.indexDimension = VGPU10_OPERAND_INDEX_1D;
+      operand0.index0Representation = VGPU10_OPERAND_INDEX_IMMEDIATE32;
+
+      begin_emit_instruction(emit);
+      emit_dword(emit, opcode0.value);
+      emit_dword(emit, operand0.value);
+      emit_dword(emit, uav_index);
+      end_emit_instruction(emit);
+
+      /* Mark the uav as already declared */
+      uav_mask |= 1 << uav_index;
+   }
+
+   emit->uav_declared |= uav_mask;
+
+   /* Allocate immediates to be used for indexing the atomic buffers */
+   unsigned j = 0;
+   for (unsigned i = 0; i <= emit->num_atomic_bufs / 4; i++, j+=4) {
+      alloc_immediate_int4(emit, j+0, j+1, j+2, j+3);
+   }
+
+   /* Allocate immediates for the atomic counter index */
+   for (; j <= emit->max_atomic_counter_index; j+=4) {
+      alloc_immediate_int4(emit, j+0, j+1, j+2, j+3);
+   }
+}
+
+
 /**
  * Emit instruction with n=1, 2 or 3 source registers.
  */
@@ -6751,12 +7330,12 @@ emit_if(struct svga_shader_emitter_v10 *emit,
 /**
- * Emit code for TGSI_OPCODE_KILL_IF instruction (kill fragment if any of
+ * Emit code for conditional discard instruction (discard fragment if any of
  * the register components are negative).
  */
 static boolean
-emit_kill_if(struct svga_shader_emitter_v10 *emit,
-             const struct tgsi_full_instruction *inst)
+emit_cond_discard(struct svga_shader_emitter_v10 *emit,
+                  const struct tgsi_full_instruction *inst)
 {
    unsigned tmp = get_temp_index(emit);
    struct tgsi_full_src_register tmp_src = make_src_temp_reg(tmp);
@@ -6774,8 +7353,9 @@
    if (!same_swizzle_terms(&inst->Src[0])) {
       /* If the swizzle is not XXXX, YYYY, ZZZZ or WWWW we need to
-       * logically OR the swizzle terms. Most uses of KILL_IF only
-       * test one channel so it's good to avoid these extra steps.
+       * logically OR the swizzle terms. Most uses of this conditional
+       * discard instruction only test one channel so it's good to
+       * avoid these extra steps.
        */
       struct tgsi_full_src_register tmp_src_yyyy =
         scalar_src(&tmp_src, TGSI_SWIZZLE_Y);
@@ -6804,11 +7384,11 @@
 /**
- * Emit code for TGSI_OPCODE_KILL instruction (unconditional discard).
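
All three UAV declaration helpers above share the same idiom: several shader resources may alias one device UAV index, and each distinct index must be declared exactly once. Stripped of the token emission, the idiom looks like this (declare_unique_uavs and declare_uav are invented names standing in for the DCL_UAV_TYPED/DCL_UAV_RAW emission):

    /* Declare each distinct UAV index exactly once, no matter how many
     * resources map to it.
     */
    static unsigned
    declare_unique_uavs(const unsigned uav_index[], unsigned count,
                        void (*declare_uav)(unsigned index))
    {
       unsigned uav_mask = 0;

       for (unsigned i = 0; i < count; i++) {
          if (uav_mask & (1u << uav_index[i]))
             continue;                  /* this UAV is already declared */
          declare_uav(uav_index[i]);
          uav_mask |= 1u << uav_index[i];
       }
       return uav_mask;   /* caller ORs this into emit->uav_declared */
    }
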
+ * Emit code for the unconditional discard instruction. */ static boolean -emit_kill(struct svga_shader_emitter_v10 *emit, - const struct tgsi_full_instruction *inst) +emit_discard(struct svga_shader_emitter_v10 *emit, + const struct tgsi_full_instruction *inst) { struct tgsi_full_src_register zero = make_immediate_reg_float(emit, 0.0f); @@ -7834,7 +8414,7 @@ begin_tex_swizzle(struct svga_shader_emitter_v10 *emit, swz->inst_dst = &inst->Dst[0]; swz->coord_src = &inst->Src[0]; - emit->fs.shadow_compare_units |= shadow_compare << unit; + emit->shadow_compare_units |= shadow_compare << unit; } @@ -8045,12 +8625,16 @@ emit_tex(struct svga_shader_emitter_v10 *emit, struct tgsi_full_src_register coord; int offsets[3]; struct tex_swizzle_info swz_info; + boolean compare_in_shader; /* check that the sampler returns a float */ if (!is_valid_tex_instruction(emit, inst)) return TRUE; - begin_tex_swizzle(emit, unit, inst, FALSE, &swz_info); + compare_in_shader = tgsi_is_shadow_target(target) && + emit->key.tex[unit].compare_in_shader; + + begin_tex_swizzle(emit, unit, inst, compare_in_shader, &swz_info); get_texel_offsets(emit, inst, offsets); @@ -8059,7 +8643,7 @@ emit_tex(struct svga_shader_emitter_v10 *emit, /* SAMPLE dst, coord(s0), resource, sampler */ begin_emit_instruction(emit); - if (tgsi_is_shadow_target(target)) + if (tgsi_is_shadow_target(target) && !compare_in_shader) opcode = VGPU10_OPCODE_SAMPLE_C; else opcode = VGPU10_OPCODE_SAMPLE; @@ -8182,7 +8766,8 @@ emit_tg4(struct svga_shader_emitter_v10 *emit, emit_resource_register(emit, unit); /* sampler */ - sampler = make_src_reg(TGSI_FILE_SAMPLER, unit); + sampler = make_src_reg(TGSI_FILE_SAMPLER, + emit->key.tex[unit].sampler_index); sampler.Register.SwizzleX = sampler.Register.SwizzleY = sampler.Register.SwizzleZ = @@ -8222,7 +8807,8 @@ emit_tg4(struct svga_shader_emitter_v10 *emit, emit_resource_register(emit, unit); /* sampler */ - sampler = make_src_reg(TGSI_FILE_SAMPLER, unit); + sampler = make_src_reg(TGSI_FILE_SAMPLER, + emit->key.tex[unit].sampler_index); sampler.Register.SwizzleX = sampler.Register.SwizzleY = sampler.Register.SwizzleZ = @@ -8263,12 +8849,20 @@ emit_tex2(struct svga_shader_emitter_v10 *emit, struct tgsi_full_src_register coord, ref; int offsets[3]; struct tex_swizzle_info swz_info; + VGPU10_OPCODE_TYPE opcode; + boolean compare_in_shader; /* check that the sampler returns a float */ if (!is_valid_tex_instruction(emit, inst)) return TRUE; - begin_tex_swizzle(emit, unit, inst, FALSE, &swz_info); + compare_in_shader = emit->key.tex[unit].compare_in_shader; + if (compare_in_shader) + opcode = VGPU10_OPCODE_SAMPLE; + else + opcode = VGPU10_OPCODE_SAMPLE_C; + + begin_tex_swizzle(emit, unit, inst, compare_in_shader, &swz_info); get_texel_offsets(emit, inst, offsets); @@ -8277,13 +8871,15 @@ emit_tex2(struct svga_shader_emitter_v10 *emit, /* SAMPLE_C dst, coord, resource, sampler, ref */ begin_emit_instruction(emit); - emit_sample_opcode(emit, VGPU10_OPCODE_SAMPLE_C, + emit_sample_opcode(emit, opcode, inst->Instruction.Saturate, offsets); emit_dst_register(emit, get_tex_swizzle_dst(&swz_info)); emit_src_register(emit, &coord); emit_resource_register(emit, unit); emit_sampler_register(emit, unit); - emit_tex_compare_refcoord(emit, target, &ref); + if (opcode == VGPU10_OPCODE_SAMPLE_C) { + emit_tex_compare_refcoord(emit, target, &ref); + } end_emit_instruction(emit); end_tex_swizzle(emit, &swz_info); @@ -8312,12 +8908,16 @@ emit_txp(struct svga_shader_emitter_v10 *emit, scalar_src(&inst->Src[0], TGSI_SWIZZLE_W); struct 
tgsi_full_src_register coord; struct tex_swizzle_info swz_info; + boolean compare_in_shader; /* check that the sampler returns a float */ if (!is_valid_tex_instruction(emit, inst)) return TRUE; - begin_tex_swizzle(emit, unit, inst, FALSE, &swz_info); + compare_in_shader = tgsi_is_shadow_target(target) && + emit->key.tex[unit].compare_in_shader; + + begin_tex_swizzle(emit, unit, inst, compare_in_shader, &swz_info); get_texel_offsets(emit, inst, offsets); @@ -8330,7 +8930,7 @@ emit_txp(struct svga_shader_emitter_v10 *emit, /* SAMPLE dst, coord(tmp), resource, sampler */ begin_emit_instruction(emit); - if (tgsi_is_shadow_target(target)) + if (tgsi_is_shadow_target(target) && !compare_in_shader) /* NOTE: for non-fragment shaders, we should use * VGPU10_OPCODE_SAMPLE_C_LZ, but our virtual GPU accepts this as-is. */ @@ -8604,6 +9204,7 @@ opcode_has_dbl_dst(unsigned opcode) case TGSI_OPCODE_DNEG: case TGSI_OPCODE_I2D: case TGSI_OPCODE_U2D: + case TGSI_OPCODE_DFMA: // XXX more TBD return true; default: @@ -8629,6 +9230,7 @@ opcode_has_dbl_src(unsigned opcode) case TGSI_OPCODE_DNEG: case TGSI_OPCODE_D2I: case TGSI_OPCODE_D2U: + case TGSI_OPCODE_DFMA: // XXX more TBD return true; default: @@ -8676,6 +9278,7 @@ check_double_dst_writemask(const struct tgsi_full_instruction *inst) case TGSI_OPCODE_DRCP: case TGSI_OPCODE_DSQRT: case TGSI_OPCODE_F2D: + case TGSI_OPCODE_DFMA: assert(writemask == TGSI_WRITEMASK_XYZW || writemask == TGSI_WRITEMASK_XY || writemask == TGSI_WRITEMASK_ZW); @@ -9263,14 +9866,870 @@ emit_vmware(struct svga_shader_emitter_v10 *emit, return TRUE; } +/** + * Emit a memory register + */ + +typedef enum { + MEM_STORE = 0, + MEM_LOAD = 1, + MEM_ATOMIC_COUNTER +} memory_op; + +static void +emit_memory_register(struct svga_shader_emitter_v10 *emit, + memory_op mem_op, + const struct tgsi_full_instruction *inst, + unsigned regIndex, unsigned writemask) +{ + VGPU10OperandToken0 operand0; + unsigned resIndex = 0; + + operand0.value = 0; + operand0.operandType = VGPU10_OPERAND_TYPE_THREAD_GROUP_SHARED_MEMORY; + operand0.indexDimension = VGPU10_OPERAND_INDEX_1D; + operand0.numComponents = VGPU10_OPERAND_4_COMPONENT; + + switch (mem_op) { + case MEM_ATOMIC_COUNTER: + { + operand0.numComponents = VGPU10_OPERAND_0_COMPONENT; + resIndex = inst->Src[regIndex].Register.Index; + break; + } + case MEM_STORE: + { + const struct tgsi_full_dst_register *reg = &inst->Dst[regIndex]; + + operand0.selectionMode = VGPU10_OPERAND_4_COMPONENT_MASK_MODE; + operand0.mask = writemask; + resIndex = reg->Register.Index; + break; + } + case MEM_LOAD: + { + const struct tgsi_full_src_register *reg = &inst->Src[regIndex]; + + operand0.selectionMode = VGPU10_OPERAND_4_COMPONENT_SWIZZLE_MODE; + operand0.swizzleX = reg->Register.SwizzleX; + operand0.swizzleY = reg->Register.SwizzleY; + operand0.swizzleZ = reg->Register.SwizzleZ; + operand0.swizzleW = reg->Register.SwizzleW; + resIndex = reg->Register.Index; + break; + } + default: + assert(!"Unexpected memory opcode"); + break; + } + + emit_dword(emit, operand0.value); + emit_dword(emit, resIndex); +} + + +typedef enum { + UAV_STORE = 0, + UAV_LOAD = 1, + UAV_ATOMIC = 2, + UAV_RESQ = 3, +} UAV_OP; + /** - * Translate a single TGSI instruction to VGPU10. 
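
When compare_in_shader is set, the texture is read with a plain SAMPLE instead of SAMPLE_C, and the comparison that SAMPLE_C would have done is emitted as ALU code in the swizzle epilogue. The comparison itself follows the usual GL semantics: test the reference value against the fetched depth. A scalar C sketch of those semantics (toy_func mirrors the PIPE_FUNC_* values; the driver emits the equivalent as shader instructions):

    /* Returns 1.0f when the reference passes the compare function
     * against the fetched depth, else 0.0f.
     */
    enum toy_func { FUNC_NEVER, FUNC_LESS, FUNC_EQUAL, FUNC_LEQUAL,
                    FUNC_GREATER, FUNC_NOTEQUAL, FUNC_GEQUAL, FUNC_ALWAYS };

    static float
    shadow_compare(enum toy_func func, float ref, float depth)
    {
       switch (func) {
       case FUNC_NEVER:    return 0.0f;
       case FUNC_LESS:     return ref <  depth ? 1.0f : 0.0f;
       case FUNC_EQUAL:    return ref == depth ? 1.0f : 0.0f;
       case FUNC_LEQUAL:   return ref <= depth ? 1.0f : 0.0f;
       case FUNC_GREATER:  return ref >  depth ? 1.0f : 0.0f;
       case FUNC_NOTEQUAL: return ref != depth ? 1.0f : 0.0f;
       case FUNC_GEQUAL:   return ref >= depth ? 1.0f : 0.0f;
       case FUNC_ALWAYS:   return 1.0f;
       }
       return 0.0f;
    }
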
+ * Emit a uav register + * \param res_index index of the resource register + * \param uav_op UAV_STORE / UAV_LOAD / UAV_ATOMIC / UAV_RESQ depending on the opcode + * \param resourceType resource file type + * \param writemask resource writemask + */ + +static void +emit_uav_register(struct svga_shader_emitter_v10 *emit, + unsigned res_index, UAV_OP uav_op, + enum tgsi_file_type resourceType, unsigned writemask) +{ + VGPU10OperandToken0 operand0; + unsigned uav_index = INVALID_INDEX; + + operand0.value = 0; + operand0.operandType = VGPU10_OPERAND_TYPE_UAV; + operand0.indexDimension = VGPU10_OPERAND_INDEX_1D; + operand0.numComponents = VGPU10_OPERAND_4_COMPONENT; + + switch (resourceType) { + case TGSI_FILE_IMAGE: + uav_index = emit->key.images[res_index].uav_index; + break; + case TGSI_FILE_BUFFER: + uav_index = emit->key.shader_buf_uav_index[res_index]; + break; + case TGSI_FILE_HW_ATOMIC: + uav_index = emit->key.atomic_buf_uav_index[res_index]; + break; + default: + assert(0); + } + + switch (uav_op) { + case UAV_ATOMIC: + operand0.numComponents = VGPU10_OPERAND_0_COMPONENT; + break; + + case UAV_STORE: + operand0.selectionMode = VGPU10_OPERAND_4_COMPONENT_MASK_MODE; + operand0.mask = writemask; + break; + + case UAV_LOAD: + case UAV_RESQ: + operand0.selectionMode = VGPU10_OPERAND_4_COMPONENT_SWIZZLE_MODE; + operand0.swizzleX = VGPU10_COMPONENT_X; + operand0.swizzleY = VGPU10_COMPONENT_Y; + operand0.swizzleZ = VGPU10_COMPONENT_Z; + operand0.swizzleW = VGPU10_COMPONENT_W; + break; + + default: + break; + } + + emit_dword(emit, operand0.value); + emit_dword(emit, uav_index); +} + + +/** + * A helper function to emit the uav address. + * For memory, buffer, and image resources, it is set to the specified address. + * For a HW atomic counter, the address is the sum of the address offset and the + * offset into the HW atomic buffer as specified by the register index. + * It is also possible to specify the counter index as an indirect address. + * In this case, the uav address will be the sum of the address offset and the + * counter index specified in the indirect address. + */ +static +struct tgsi_full_src_register +emit_uav_addr_offset(struct svga_shader_emitter_v10 *emit, + enum tgsi_file_type resourceType, + unsigned resourceIndex, + unsigned resourceIndirect, + unsigned resourceIndirectIndex, + const struct tgsi_full_src_register *addr_reg) +{ + unsigned addr_tmp; + struct tgsi_full_dst_register addr_dst; + struct tgsi_full_src_register addr_src; + struct tgsi_full_src_register two = make_immediate_reg_int(emit, 2); + + addr_tmp = get_temp_index(emit); + addr_dst = make_dst_temp_reg(addr_tmp); + addr_src = make_src_temp_reg(addr_tmp); + + /* specified address offset */ + emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &addr_dst, addr_reg); + + /* For a HW atomic counter, we need to find the index into the + * HW atomic buffer. + */
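To make the counter address arithmetic that follows concrete: the helper in this sketch is hypothetical, but it mirrors the emitted IADD + ISHL sequence, turning a dword counter index plus an address offset into a byte offset into the raw HW atomic buffer.

#include <assert.h>

/* Hypothetical helper mirroring the emitted IADD + ISHL(2) sequence:
 * a dword counter index becomes a byte offset (1 dword = 4 bytes).
 */
static unsigned
hw_atomic_byte_offset(unsigned counter_index, unsigned addr_offset)
{
   unsigned dword_index = counter_index + addr_offset;  /* IADD */
   return dword_index << 2;                             /* ISHL by 2 == *4 */
}

int
main(void)
{
   /* counter 3 with address offset 1 lives at byte 16 of the raw buffer */
   assert(hw_atomic_byte_offset(3, 1) == 16);
   return 0;
}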
+ if (resourceType == TGSI_FILE_HW_ATOMIC) { + if (resourceIndirect) { + + /** + * uav addr offset = counter layout offset + + * counter indirect index address + address offset + */ + + /* counter layout offset */ + struct tgsi_full_src_register layout_offset; + layout_offset = + make_immediate_reg_int(emit, resourceIndex); + + /* counter layout offset + address offset */ + emit_instruction_op2(emit, VGPU10_OPCODE_IADD, &addr_dst, + &addr_src, &layout_offset); + + /* counter indirect index address */ + unsigned indirect_addr = + emit->address_reg_index[resourceIndirectIndex]; + + struct tgsi_full_src_register indirect_addr_src = + make_src_temp_reg(indirect_addr); + + indirect_addr_src = scalar_src(&indirect_addr_src, TGSI_SWIZZLE_X); + + /* counter layout offset + address offset + counter indirect address */ + emit_instruction_op2(emit, VGPU10_OPCODE_IADD, &addr_dst, + &addr_src, &indirect_addr_src); + + } else { + struct tgsi_full_src_register index_src; + + index_src = make_immediate_reg_int(emit, resourceIndex); + + /* uav addr offset = counter index address + address offset */ + emit_instruction_op2(emit, VGPU10_OPCODE_ADD, &addr_dst, + &addr_src, &index_src); + } + + /* The HW atomic buffer is declared as a raw buffer, so the buffer + * address is a byte offset; hence we need to multiply the counter addr + * offset by 4. + */ + emit_instruction_op2(emit, VGPU10_OPCODE_ISHL, &addr_dst, + &addr_src, &two); + } + else if (resourceType == TGSI_FILE_IMAGE) { + if ((emit->key.images[resourceIndex].resource_target == PIPE_TEXTURE_3D) + && emit->key.images[resourceIndex].is_single_layer) { + + struct tgsi_full_dst_register addr_dst_z = + writemask_dst(&addr_dst, TGSI_WRITEMASK_Z); + struct tgsi_full_src_register zero = make_immediate_reg_int(emit, 0); + + /* For a non-layered 3D texture image view, we have to make sure the z + * component of the address offset is set to 0. + */ + emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &addr_dst_z, + &zero); + } + } + + return addr_src; +} + + + +/** + * A helper function to expand indirect indexing of a uav resource + * by looping through the resource array, comparing the indirect index, and + * emitting the instruction for the matching resource in the array. + */ +static void +loop_instruction(unsigned index, unsigned count, + struct tgsi_full_src_register *addr_index, + void (*fb)(struct svga_shader_emitter_v10 *, + const struct tgsi_full_instruction *, unsigned), + struct svga_shader_emitter_v10 *emit, + const struct tgsi_full_instruction *inst) +{ + if (count == 0) + return; + + if (index > 0) { + /* ELSE */ + emit_instruction_op0(emit, VGPU10_OPCODE_ELSE); + } + + struct tgsi_full_src_register index_src = + make_immediate_reg_int(emit, index); + + unsigned tmp_index = get_temp_index(emit); + struct tgsi_full_src_register tmp_src = make_src_temp_reg(tmp_index); + struct tgsi_full_src_register tmp_src_x = + scalar_src(&tmp_src, TGSI_SWIZZLE_X); + struct tgsi_full_dst_register tmp_dst = make_dst_temp_reg(tmp_index); + + /* IEQ tmp, addr_tmp_index, index */ + emit_instruction_op2(emit, VGPU10_OPCODE_IEQ, &tmp_dst, + addr_index, &index_src); + + /* IF tmp */ + emit_if(emit, &tmp_src_x); + + free_temp_indexes(emit); + + (*fb)(emit, inst, index); + + loop_instruction(index+1, count-1, addr_index, fb, emit, inst); + + /* ENDIF */ + emit_instruction_op0(emit, VGPU10_OPCODE_ENDIF); +}
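The if/else cascade produced by loop_instruction() is easiest to see with a mock emitter. In this stand-alone model, printf stands in for the token emitter; nothing here is driver API.

#include <stdio.h>

static void
expand(unsigned index, unsigned count)
{
   if (count == 0)
      return;
   if (index > 0)
      printf("ELSE\n");                      /* chain onto the previous IF */
   printf("IEQ tmp, addr, %u\n", index);     /* compare the indirect index */
   printf("IF tmp\n");
   printf("   <instruction for resource %u>\n", index);
   expand(index + 1, count - 1);             /* nest the rest in the ELSE */
   printf("ENDIF\n");
}

int
main(void)
{
   expand(0, 3);   /* three UAVs -> three-way cascade */
   return 0;
}

Each recursive call nests inside the previous ELSE, so exactly one resource's instruction executes, and the trailing ENDIFs unwind in reverse order, just as the recursion above emits them.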
+ +/** + * A helper function to emit the load instruction. + */ +static void +emit_load_instruction(struct svga_shader_emitter_v10 *emit, + const struct tgsi_full_instruction *inst, + unsigned resourceIndex) +{ + VGPU10OpcodeToken0 token0; + struct tgsi_full_src_register addr_src; + enum tgsi_file_type resourceType = inst->Src[0].Register.File; + + /* Resolve the resource address for this resource first */ + addr_src = emit_uav_addr_offset(emit, resourceType, resourceIndex, + inst->Src[0].Register.Indirect, + inst->Src[0].Indirect.Index, + &inst->Src[1]); + + /* LOAD resource, address, src */ + begin_emit_instruction(emit); + + token0.value = 0; + + if (resourceType == TGSI_FILE_MEMORY || + resourceType == TGSI_FILE_BUFFER || + resourceType == TGSI_FILE_HW_ATOMIC) { + token0.opcodeType = VGPU10_OPCODE_LD_RAW; + addr_src = scalar_src(&addr_src, TGSI_SWIZZLE_X); + } + else { + token0.opcodeType = VGPU10_OPCODE_LD_UAV_TYPED; + } + + token0.saturate = inst->Instruction.Saturate; + emit_dword(emit, token0.value); + + emit_dst_register(emit, &inst->Dst[0]); + emit_src_register(emit, &addr_src); + + if (resourceType == TGSI_FILE_MEMORY) { + emit_memory_register(emit, MEM_LOAD, inst, 0, 0); + } else if (resourceType == TGSI_FILE_HW_ATOMIC) { + emit_uav_register(emit, inst->Src[0].Dimension.Index, + UAV_LOAD, inst->Src[0].Register.File, 0); + } else { + emit_uav_register(emit, resourceIndex, + UAV_LOAD, inst->Src[0].Register.File, 0); + } + + end_emit_instruction(emit); + + free_temp_indexes(emit); +} + + +/** + * Emit uav / memory load instruction */ static boolean -emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit, - unsigned inst_number, - const struct tgsi_full_instruction *inst) +emit_load(struct svga_shader_emitter_v10 *emit, + const struct tgsi_full_instruction *inst) +{ + enum tgsi_file_type resourceType = inst->Src[0].Register.File; + unsigned resourceIndex = inst->Src[0].Register.Index; + + /* If the resource register has an indirect index, we will need + * to expand it since the SM5 device does not support indirect indexing + * for uav. + */ + if (inst->Src[0].Register.Indirect && + (resourceType == TGSI_FILE_BUFFER || resourceType == TGSI_FILE_IMAGE)) { + + unsigned indirect_index = inst->Src[0].Indirect.Index; + unsigned num_resources = + resourceType == TGSI_FILE_BUFFER ? emit->num_shader_bufs : + emit->num_images; + + /* indirect index tmp register */ + unsigned indirect_addr = emit->address_reg_index[indirect_index]; + struct tgsi_full_src_register indirect_addr_src = + make_src_temp_reg(indirect_addr); + indirect_addr_src = scalar_src(&indirect_addr_src, TGSI_SWIZZLE_X); + + /* Add offset to the indirect index */ + if (inst->Src[0].Register.Index != 0) { + struct tgsi_full_src_register offset = + make_immediate_reg_int(emit, inst->Src[0].Register.Index); + struct tgsi_full_dst_register indirect_addr_dst = + make_dst_temp_reg(indirect_addr); + emit_instruction_op2(emit, VGPU10_OPCODE_IADD, &indirect_addr_dst, + &indirect_addr_src, &offset); + } + + /* Loop through the resource array to find which resource to use. + */ + loop_instruction(0, num_resources, &indirect_addr_src, + emit_load_instruction, emit, inst); + } + else { + emit_load_instruction(emit, inst, resourceIndex); + } + + free_temp_indexes(emit); + + return TRUE; +}
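A condensed model of the opcode selection in emit_load_instruction(): raw files (memory, shader buffers, HW atomic counters) are fetched with LD_RAW from a scalar byte address, while typed image UAVs use LD_UAV_TYPED. The enum and strings below are placeholders for illustration.

#include <stdio.h>

/* Placeholder file kinds standing in for the TGSI file types */
enum file_kind { KIND_MEMORY, KIND_BUFFER, KIND_HW_ATOMIC, KIND_IMAGE };

/* Raw files load with LD_RAW from a scalar byte offset; typed image
 * UAVs load with LD_UAV_TYPED from a multi-component coordinate.
 */
static const char *
load_opcode(enum file_kind kind)
{
   switch (kind) {
   case KIND_MEMORY:
   case KIND_BUFFER:
   case KIND_HW_ATOMIC:
      return "LD_RAW";
   default:
      return "LD_UAV_TYPED";
   }
}

int
main(void)
{
   printf("%s\n", load_opcode(KIND_BUFFER));   /* LD_RAW */
   printf("%s\n", load_opcode(KIND_IMAGE));    /* LD_UAV_TYPED */
   return 0;
}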
+ + +/** + * A helper function to emit a store instruction. + */ +static void +emit_store_instruction(struct svga_shader_emitter_v10 *emit, + const struct tgsi_full_instruction *inst, + unsigned resourceIndex) +{ + VGPU10OpcodeToken0 token0; + enum tgsi_file_type resourceType = inst->Dst[0].Register.File; + unsigned writemask = inst->Dst[0].Register.WriteMask; + struct tgsi_full_src_register addr_src; + + unsigned tmp_index = get_temp_index(emit); + struct tgsi_full_src_register tmp_src = make_src_temp_reg(tmp_index); + struct tgsi_full_dst_register tmp_dst_xyzw = make_dst_temp_reg(tmp_index); + struct tgsi_full_dst_register tmp_dst; + + struct tgsi_full_src_register src = inst->Src[1]; + struct tgsi_full_src_register four = make_immediate_reg_int(emit, 4); + + boolean needLoad = FALSE; + boolean needPerComponentStore = FALSE; + unsigned swizzles = 0; + + /* Resolve the resource address for this resource first */ + addr_src = emit_uav_addr_offset(emit, resourceType, + inst->Dst[0].Register.Index, + inst->Dst[0].Register.Indirect, + inst->Dst[0].Indirect.Index, + &inst->Src[0]); + + /* First check the writemask to see if it can be supported + * by the store instruction. + * store_raw only allows .x, .xy, .xyz, .xyzw. For typeless memory, + * we can adjust the address offset, and do a per-component store. + * store_uav_typed only allows .xyzw. In this case, we need to + * do a load first, update the temporary and then issue the + * store. This does have a small risk that if different threads + * update different components of the same address, data might not be + * in sync. + */ + if (resourceType == TGSI_FILE_IMAGE) { + needLoad = (writemask == TGSI_WRITEMASK_XYZW) ? FALSE : TRUE; + } + else if (resourceType == TGSI_FILE_BUFFER || + resourceType == TGSI_FILE_MEMORY) { + if (!(writemask == TGSI_WRITEMASK_X || writemask == TGSI_WRITEMASK_XY || + writemask == TGSI_WRITEMASK_XYZ || + writemask == TGSI_WRITEMASK_XYZW)) { + needPerComponentStore = TRUE; + } + } + + if (needLoad) { + assert(resourceType == TGSI_FILE_IMAGE); + + /* LOAD resource, address, src */ + begin_emit_instruction(emit); + + token0.value = 0; + token0.opcodeType = VGPU10_OPCODE_LD_UAV_TYPED; + token0.saturate = inst->Instruction.Saturate; + emit_dword(emit, token0.value); + + emit_dst_register(emit, &tmp_dst_xyzw); + emit_src_register(emit, &addr_src); + emit_uav_register(emit, resourceIndex, UAV_LOAD, resourceType, 0); + + end_emit_instruction(emit); + + /* MOV tmp(writemask) src */ + tmp_dst = writemask_dst(&tmp_dst_xyzw, writemask); + emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &tmp_dst, &inst->Src[1]); + + /* Now set the writemask to xyzw for the store_uav_typed instruction */ + writemask = TGSI_WRITEMASK_XYZW; + } + else if (needPerComponentStore) { + /* Save the src swizzles */ + swizzles = src.Register.SwizzleX | + src.Register.SwizzleY << 2 | + src.Register.SwizzleZ << 4 | + src.Register.SwizzleW << 6; + } + + boolean storeDone = FALSE; + unsigned perComponentWritemask = writemask; + unsigned shift = 0; + struct tgsi_full_src_register shift_src; + + while (!storeDone) { + + if (needPerComponentStore) { + assert(perComponentWritemask); + while (!(perComponentWritemask & TGSI_WRITEMASK_X)) { + shift++; + perComponentWritemask >>= 1; + } + + /* First adjust the addr_src to the next component */ + if (shift != 0) { + struct tgsi_full_dst_register addr_dst = + make_dst_temp_reg(addr_src.Register.Index); + shift_src = make_immediate_reg_int(emit, shift); + emit_instruction_op3(emit, VGPU10_OPCODE_UMAD, &addr_dst, &four, + &shift_src, &addr_src); + + /* Adjust the src swizzle 
as well */ + swizzles >>= (shift * 2); + } + + /* Now that the address offset is set to the next component, + * we can set the writemask to .x and make sure to set + * the src swizzle as well. + */ + src.Register.SwizzleX = swizzles & 0x3; + writemask = TGSI_WRITEMASK_X; + + /* Shift for the next component check */ + perComponentWritemask >>= 1; + shift = 1; + } + + /* STORE resource, address, src */ + begin_emit_instruction(emit); + + token0.value = 0; + token0.saturate = inst->Instruction.Saturate; + + if (resourceType == TGSI_FILE_MEMORY) { + token0.opcodeType = VGPU10_OPCODE_STORE_RAW; + addr_src = scalar_src(&addr_src, TGSI_SWIZZLE_X); + emit_dword(emit, token0.value); + emit_memory_register(emit, MEM_STORE, inst, 0, writemask); + } + else if (resourceType == TGSI_FILE_BUFFER || + resourceType == TGSI_FILE_HW_ATOMIC) { + token0.opcodeType = VGPU10_OPCODE_STORE_RAW; + addr_src = scalar_src(&addr_src, TGSI_SWIZZLE_X); + emit_dword(emit, token0.value); + emit_uav_register(emit, resourceIndex, UAV_STORE, + resourceType, writemask); + } + else { + token0.opcodeType = VGPU10_OPCODE_STORE_UAV_TYPED; + emit_dword(emit, token0.value); + emit_uav_register(emit, resourceIndex, UAV_STORE, + resourceType, writemask); + } + + emit_src_register(emit, &addr_src); + + if (needLoad) + emit_src_register(emit, &tmp_src); + else + emit_src_register(emit, &src); + + end_emit_instruction(emit); + + if (!needPerComponentStore || !perComponentWritemask) + storeDone = TRUE; + } + + free_temp_indexes(emit); +} + + +/** + * Emit uav / memory store instruction + */ +static boolean +emit_store(struct svga_shader_emitter_v10 *emit, + const struct tgsi_full_instruction *inst) +{ + enum tgsi_file_type resourceType = inst->Dst[0].Register.File; + unsigned resourceIndex = inst->Dst[0].Register.Index; + + /* If the resource register has an indirect index, we will need + * to expand it since the SM5 device does not support indirect indexing + * for uav. + */ + if (inst->Dst[0].Register.Indirect && + (resourceType == TGSI_FILE_BUFFER || resourceType == TGSI_FILE_IMAGE)) { + + unsigned indirect_index = inst->Dst[0].Indirect.Index; + unsigned num_resources = + resourceType == TGSI_FILE_BUFFER ? emit->num_shader_bufs : + emit->num_images; + + /* Indirect index tmp register */ + unsigned indirect_addr = emit->address_reg_index[indirect_index]; + struct tgsi_full_src_register indirect_addr_src = + make_src_temp_reg(indirect_addr); + indirect_addr_src = scalar_src(&indirect_addr_src, TGSI_SWIZZLE_X); + + /* Add offset to the indirect index */ + if (inst->Dst[0].Register.Index != 0) { + struct tgsi_full_src_register offset = + make_immediate_reg_int(emit, inst->Dst[0].Register.Index); + struct tgsi_full_dst_register indirect_addr_dst = + make_dst_temp_reg(indirect_addr); + emit_instruction_op2(emit, VGPU10_OPCODE_IADD, &indirect_addr_dst, + &indirect_addr_src, &offset); + } + + /* Loop through the resource array to find which resource to use. + */ + loop_instruction(0, num_resources, &indirect_addr_src, + emit_store_instruction, emit, inst); + } + else { + emit_store_instruction(emit, inst, resourceIndex); + } + + free_temp_indexes(emit); + + return TRUE; +}
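The per-component store fallback above is the subtlest path; this stand-alone walk-through (printf in place of the token emitter, values chosen arbitrarily) processes a .yw writemask the same way: skip to the next written component, bump the byte address by 4 per skipped component via the UMAD, realign the saved source swizzles, and emit one .x store at a time.

#include <stdio.h>

int
main(void)
{
   unsigned writemask = 0xa;   /* .yw: bits 1 and 3 */
   unsigned swizzles  = 0xe4;  /* identity swizzle xyzw, 2 bits each */
   unsigned addr = 0;          /* byte address held in the temp */
   unsigned shift = 0;

   while (writemask) {
      while (!(writemask & 1)) {  /* skip components not written */
         shift++;
         writemask >>= 1;
      }
      addr += 4 * shift;          /* the emitted UMAD: addr = 4*shift + addr */
      swizzles >>= shift * 2;     /* realign the saved source swizzles */
      printf("STORE_RAW [byte %u] <- src component %u\n",
             addr, swizzles & 0x3);
      writemask >>= 1;            /* consume this component... */
      shift = 1;                  /* ...and account for it next round */
   }
   return 0;
}

Running it prints a store at byte 4 from source component 1 (.y) and a store at byte 12 from component 3 (.w), i.e. two single-component STORE_RAW instructions.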
+ + +/** + * A helper function to emit an atomic instruction. + */ + +static void +emit_atomic_instruction(struct svga_shader_emitter_v10 *emit, + const struct tgsi_full_instruction *inst, + unsigned resourceIndex) +{ + VGPU10OpcodeToken0 token0; + enum tgsi_file_type resourceType = inst->Src[0].Register.File; + struct tgsi_full_src_register addr_src; + VGPU10_OPCODE_TYPE opcode = emit->cur_atomic_opcode; + + /* Resolve the resource address */ + addr_src = emit_uav_addr_offset(emit, resourceType, + inst->Src[0].Register.Index, + inst->Src[0].Register.Indirect, + inst->Src[0].Indirect.Index, + &inst->Src[1]); + + /* Emit the atomic operation */ + begin_emit_instruction(emit); + + token0.value = 0; + token0.opcodeType = opcode; + token0.saturate = inst->Instruction.Saturate; + emit_dword(emit, token0.value); + + emit_dst_register(emit, &inst->Dst[0]); + + if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) { + emit_memory_register(emit, MEM_ATOMIC_COUNTER, inst, 0, 0); + } else if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC) { + assert(inst->Src[0].Register.Dimension == 1); + emit_uav_register(emit, inst->Src[0].Dimension.Index, + UAV_ATOMIC, inst->Src[0].Register.File, 0); + } else { + emit_uav_register(emit, resourceIndex, + UAV_ATOMIC, inst->Src[0].Register.File, 0); + } + + /* resource address offset */ + emit_src_register(emit, &addr_src); + + struct tgsi_full_src_register src0_x = + swizzle_src(&inst->Src[2], TGSI_SWIZZLE_X, TGSI_SWIZZLE_X, + TGSI_SWIZZLE_X, TGSI_SWIZZLE_X); + emit_src_register(emit, &src0_x); + + if (opcode == VGPU10_OPCODE_IMM_ATOMIC_CMP_EXCH) { + struct tgsi_full_src_register src1_x = + swizzle_src(&inst->Src[3], TGSI_SWIZZLE_X, TGSI_SWIZZLE_X, + TGSI_SWIZZLE_X, TGSI_SWIZZLE_X); + + emit_src_register(emit, &src1_x); + } + + end_emit_instruction(emit); + + free_temp_indexes(emit); +} + + +/** + * Emit atomic instruction + */ +static boolean +emit_atomic(struct svga_shader_emitter_v10 *emit, + const struct tgsi_full_instruction *inst, + VGPU10_OPCODE_TYPE opcode) +{ + enum tgsi_file_type resourceType = inst->Src[0].Register.File; + unsigned resourceIndex = inst->Src[0].Register.Index; + + emit->cur_atomic_opcode = opcode; + + /* If the resource register has an indirect index, we will need + * to expand it since the SM5 device does not support indirect indexing + * for uav. + */ + if (inst->Dst[0].Register.Indirect && + (resourceType == TGSI_FILE_BUFFER || resourceType == TGSI_FILE_IMAGE)) { + + unsigned indirect_index = inst->Dst[0].Indirect.Index; + unsigned num_resources = + resourceType == TGSI_FILE_BUFFER ? emit->num_shader_bufs : + emit->num_images; + + /* indirect index tmp register */ + unsigned indirect_addr = emit->address_reg_index[indirect_index]; + struct tgsi_full_src_register indirect_addr_src = + make_src_temp_reg(indirect_addr); + indirect_addr_src = scalar_src(&indirect_addr_src, TGSI_SWIZZLE_X); + + /* Loop through the resource array to find which resource to use. + */ + loop_instruction(0, num_resources, &indirect_addr_src, + emit_atomic_instruction, emit, inst); + } + else { + emit_atomic_instruction(emit, inst, resourceIndex); + } + + free_temp_indexes(emit); + + return TRUE; +} + + +/** + * Emit barrier instruction + */ +static boolean +emit_barrier(struct svga_shader_emitter_v10 *emit, + const struct tgsi_full_instruction *inst) +{ + VGPU10OpcodeToken0 token0; + + assert(emit->version >= 50); + + token0.value = 0; + token0.opcodeType = VGPU10_OPCODE_SYNC; + + if (emit->unit == PIPE_SHADER_TESS_CTRL && emit->version == 50) { + /* SM5 device doesn't support BARRIER in tcs. 
If a barrier is used + * in the shader, don't do anything for this opcode and continue with the + * rest of the shader translation. + */ + pipe_debug_message(&emit->svga_debug_callback, INFO, + "barrier instruction is not supported in tessellation control shader\n"); + return TRUE; + } + else if (emit->unit == PIPE_SHADER_COMPUTE) { + if (emit->cs.shared_memory_declared) + token0.syncThreadGroupShared = 1; + + if (emit->uav_declared) + token0.syncUAVMemoryGroup = 1; + + token0.syncThreadsInGroup = 1; + } else { + token0.syncUAVMemoryGlobal = 1; + } + + assert(token0.syncUAVMemoryGlobal || token0.syncUAVMemoryGroup || + token0.syncThreadGroupShared); + + begin_emit_instruction(emit); + emit_dword(emit, token0.value); + end_emit_instruction(emit); + + return TRUE; +} + +/** + * Emit memory barrier instruction + */ +static boolean +emit_memory_barrier(struct svga_shader_emitter_v10 *emit, + const struct tgsi_full_instruction *inst) +{ + unsigned index = inst->Src[0].Register.Index; + unsigned swizzle = inst->Src[0].Register.SwizzleX; + unsigned bartype = emit->immediates[index][swizzle].Int; + VGPU10OpcodeToken0 token0; + + token0.value = 0; + token0.opcodeType = VGPU10_OPCODE_SYNC; + + if (emit->unit == PIPE_SHADER_COMPUTE) { + + /* For a compute shader, issue the sync opcode with different options + * depending on the memory barrier type. + * + * Bit 0: Shader storage buffers + * Bit 1: Atomic buffers + * Bit 2: Images + * Bit 3: Shared memory + * Bit 4: Thread group + */ + + if (bartype & (TGSI_MEMBAR_SHADER_BUFFER | TGSI_MEMBAR_ATOMIC_BUFFER | + TGSI_MEMBAR_SHADER_IMAGE)) + token0.syncUAVMemoryGlobal = 1; + else if (bartype & TGSI_MEMBAR_THREAD_GROUP) + token0.syncUAVMemoryGroup = 1; + + if (bartype & TGSI_MEMBAR_SHARED) + token0.syncThreadGroupShared = 1; + } + else { + /** + * For graphics stages, only sync_uglobal is available. + */ + if (bartype & (TGSI_MEMBAR_SHADER_BUFFER | TGSI_MEMBAR_ATOMIC_BUFFER | + TGSI_MEMBAR_SHADER_IMAGE)) + token0.syncUAVMemoryGlobal = 1; + } + + assert(token0.syncUAVMemoryGlobal || token0.syncUAVMemoryGroup || + token0.syncThreadGroupShared); + + begin_emit_instruction(emit); + emit_dword(emit, token0.value); + end_emit_instruction(emit); + + return TRUE; +} + + +/** + * Emit code for TGSI_OPCODE_RESQ (image size) instruction. 
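The barrier-type mapping above reduces to a small truth table. A stand-alone model follows, with the TGSI_MEMBAR_* values re-declared locally so the sketch compiles on its own (the real definitions live in the TGSI headers; the bit assignments match the comment table above).

#include <stdio.h>

#define MEMBAR_SHADER_BUFFER (1 << 0)
#define MEMBAR_ATOMIC_BUFFER (1 << 1)
#define MEMBAR_SHADER_IMAGE  (1 << 2)
#define MEMBAR_SHARED        (1 << 3)
#define MEMBAR_THREAD_GROUP  (1 << 4)

int
main(void)
{
   unsigned bartype = MEMBAR_SHADER_BUFFER | MEMBAR_SHARED;
   unsigned sync_uglobal = 0, sync_ugroup = 0, sync_g = 0;

   /* compute-shader path of the mapping above */
   if (bartype & (MEMBAR_SHADER_BUFFER | MEMBAR_ATOMIC_BUFFER |
                  MEMBAR_SHADER_IMAGE))
      sync_uglobal = 1;
   else if (bartype & MEMBAR_THREAD_GROUP)
      sync_ugroup = 1;
   if (bartype & MEMBAR_SHARED)
      sync_g = 1;

   printf("uglobal=%u ugroup=%u tgsm=%u\n",
          sync_uglobal, sync_ugroup, sync_g);   /* prints: 1 0 1 */
   return 0;
}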
+ */ +static boolean +emit_resq(struct svga_shader_emitter_v10 *emit, + const struct tgsi_full_instruction *inst) +{ + struct tgsi_full_src_register zero = + make_immediate_reg_int(emit, 0); + + unsigned uav_resource = emit->image[inst->Src[0].Register.Index].Resource; + + if (uav_resource == TGSI_TEXTURE_CUBE_ARRAY) { + struct tgsi_full_src_register image_src; + + image_src = make_src_const_reg(emit->image_size_index + inst->Src[0].Register.Index); + + emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0], &image_src); + return TRUE; + } + + begin_emit_instruction(emit); + if (uav_resource == TGSI_TEXTURE_BUFFER) { + emit_opcode(emit, VGPU10_OPCODE_BUFINFO, FALSE); + emit_dst_register(emit, &inst->Dst[0]); + } + else { + emit_opcode_resinfo(emit, VGPU10_RESINFO_RETURN_UINT); + emit_dst_register(emit, &inst->Dst[0]); + emit_src_register(emit, &zero); + } + emit_uav_register(emit, inst->Src[0].Register.Index, + UAV_RESQ, inst->Src[0].Register.File, 0); + end_emit_instruction(emit); + + return TRUE; +} + + +static boolean +emit_instruction(struct svga_shader_emitter_v10 *emit, + unsigned inst_number, + const struct tgsi_full_instruction *inst) { const enum tgsi_opcode opcode = inst->Instruction.Opcode; @@ -9397,9 +10856,9 @@ emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit, case TGSI_OPCODE_IF: return emit_if(emit, &inst->Src[0]); case TGSI_OPCODE_KILL: - return emit_kill(emit, inst); + return emit_discard(emit, inst); case TGSI_OPCODE_KILL_IF: - return emit_kill_if(emit, inst); + return emit_cond_discard(emit, inst); case TGSI_OPCODE_LG2: return emit_lg2(emit, inst); case TGSI_OPCODE_LIT: @@ -9496,12 +10955,14 @@ emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit, return emit_simple(emit, inst); case TGSI_OPCODE_INTERP_OFFSET: return emit_interp_offset(emit, inst); + case TGSI_OPCODE_FMA: + case TGSI_OPCODE_DFMA: + return emit_simple(emit, inst); /* The following opcodes should never be seen here. We return zero * for all the PIPE_CAP_TGSI_DROUND_SUPPORTED, DFRACEXP_DLDEXP_SUPPORTED, * FMA_SUPPORTED, LDEXP_SUPPORTED queries. */ - case TGSI_OPCODE_FMA: case TGSI_OPCODE_LDEXP: case TGSI_OPCODE_DSSG: case TGSI_OPCODE_DFRACEXP: @@ -9515,31 +10976,49 @@ emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit, return FALSE; case TGSI_OPCODE_LOAD: + return emit_load(emit, inst); + case TGSI_OPCODE_STORE: + return emit_store(emit, inst); + case TGSI_OPCODE_ATOMAND: + return emit_atomic(emit, inst, VGPU10_OPCODE_IMM_ATOMIC_AND); + case TGSI_OPCODE_ATOMCAS: + return emit_atomic(emit, inst, VGPU10_OPCODE_IMM_ATOMIC_CMP_EXCH); + case TGSI_OPCODE_ATOMIMAX: + return emit_atomic(emit, inst, VGPU10_OPCODE_IMM_ATOMIC_IMAX); + case TGSI_OPCODE_ATOMIMIN: + return emit_atomic(emit, inst, VGPU10_OPCODE_IMM_ATOMIC_IMIN); + case TGSI_OPCODE_ATOMOR: + return emit_atomic(emit, inst, VGPU10_OPCODE_IMM_ATOMIC_OR); + case TGSI_OPCODE_ATOMUADD: + return emit_atomic(emit, inst, VGPU10_OPCODE_IMM_ATOMIC_IADD); + case TGSI_OPCODE_ATOMUMAX: + return emit_atomic(emit, inst, VGPU10_OPCODE_IMM_ATOMIC_UMAX); + case TGSI_OPCODE_ATOMUMIN: + return emit_atomic(emit, inst, VGPU10_OPCODE_IMM_ATOMIC_UMIN); + case TGSI_OPCODE_ATOMXCHG: + return emit_atomic(emit, inst, VGPU10_OPCODE_IMM_ATOMIC_EXCH); + case TGSI_OPCODE_ATOMXOR: - return FALSE; + return emit_atomic(emit, inst, VGPU10_OPCODE_IMM_ATOMIC_XOR); + case TGSI_OPCODE_BARRIER: - if (emit->unit == PIPE_SHADER_TESS_CTRL) { - /* SM5 device doesn't support BARRIER in tcs . 
If barrier is used - * in shader, don't do anything for this opcode and continue rest - * of shader translation - */ - pipe_debug_message(&emit->svga_debug_callback, INFO, - "barrier instruction is not supported in tessellation control shader\n"); - return TRUE; - } - else { - return emit_simple(emit, inst); - } + return emit_barrier(emit, inst); + + case TGSI_OPCODE_MEMBAR: + return emit_memory_barrier(emit, inst); + + case TGSI_OPCODE_RESQ: + return emit_resq(emit, inst); case TGSI_OPCODE_END: if (!emit_post_helpers(emit)) @@ -9556,6 +11035,38 @@ emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit, } +/** + * Translate a single TGSI instruction to VGPU10. + */ +static boolean +emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit, + unsigned inst_number, + const struct tgsi_full_instruction *inst) +{ + if (emit->skip_instruction) + return TRUE; + + boolean ret = TRUE; + unsigned start_token = emit_get_num_tokens(emit); + + emit->reemit_tgsi_instruction = FALSE; + + ret = emit_instruction(emit, inst_number, inst); + + if (emit->reemit_tgsi_instruction) { + /** + * Reset emit->ptr to where the translation of this tgsi instruction + * started. + */ + VGPU10OpcodeToken0 *tokens = (VGPU10OpcodeToken0 *) emit->buf; + emit->ptr = (char *) (tokens + start_token); + + emit->reemit_tgsi_instruction = FALSE; + } + return ret; +} + + /** * Emit the extra instructions to adjust the vertex position. * There are two possible adjustments: @@ -10232,50 +11743,61 @@ emit_tcs_default_control_point_output(struct svga_shader_emitter_v10 *emit) { assert(emit->unit == PIPE_SHADER_TESS_CTRL); assert(emit->tcs.control_point_phase); - assert(emit->tcs.control_point_input_index != INVALID_INDEX); assert(emit->tcs.control_point_out_index != INVALID_INDEX); assert(emit->tcs.invocation_id_sys_index != INVALID_INDEX); - /* UARL ADDR[INDEX].x INVOCATION.xxxx */ - - struct tgsi_full_src_register invocation_src; - struct tgsi_full_dst_register addr_dst; - struct tgsi_full_dst_register addr_dst_x; - unsigned addr_tmp; - - addr_tmp = emit->address_reg_index[emit->tcs.control_point_addr_index]; - addr_dst = make_dst_temp_reg(addr_tmp); - addr_dst_x = writemask_dst(&addr_dst, TGSI_WRITEMASK_X); - - invocation_src = make_src_reg(TGSI_FILE_SYSTEM_VALUE, - emit->tcs.invocation_id_sys_index); - - begin_emit_instruction(emit); - emit_opcode_precise(emit, VGPU10_OPCODE_MOV, FALSE, FALSE); - emit_dst_register(emit, &addr_dst_x); - emit_src_register(emit, &invocation_src); - end_emit_instruction(emit); - - - /* MOV OUTPUT INPUT[ADDR[INDEX].x][POSITION] */ - - struct tgsi_full_src_register input_control_point; struct tgsi_full_dst_register output_control_point; - - input_control_point = make_src_reg(TGSI_FILE_INPUT, - emit->tcs.control_point_input_index); - input_control_point.Register.Dimension = 1; - input_control_point.Dimension.Indirect = 1; - input_control_point.DimIndirect.File = TGSI_FILE_ADDRESS; - input_control_point.DimIndirect.Index = emit->tcs.control_point_addr_index; output_control_point = make_dst_output_reg(emit->tcs.control_point_out_index); - begin_emit_instruction(emit); - emit_opcode_precise(emit, VGPU10_OPCODE_MOV, FALSE, FALSE); - emit_dst_register(emit, &output_control_point); - emit_src_register(emit, &input_control_point); - end_emit_instruction(emit); + if (emit->tcs.control_point_input_index == INVALID_INDEX) { + /* MOV OUTPUT 0.0f */ + struct tgsi_full_src_register zero = make_immediate_reg_float(emit, 0.0f); + begin_emit_instruction(emit); + emit_opcode_precise(emit, VGPU10_OPCODE_MOV, 
FALSE, FALSE); + emit_dst_register(emit, &output_control_point); + emit_src_register(emit, &zero); + end_emit_instruction(emit); + } + else { + /* UARL ADDR[INDEX].x INVOCATION.xxxx */ + + struct tgsi_full_src_register invocation_src; + struct tgsi_full_dst_register addr_dst; + struct tgsi_full_dst_register addr_dst_x; + unsigned addr_tmp; + + addr_tmp = emit->address_reg_index[emit->tcs.control_point_addr_index]; + addr_dst = make_dst_temp_reg(addr_tmp); + addr_dst_x = writemask_dst(&addr_dst, TGSI_WRITEMASK_X); + + invocation_src = make_src_reg(TGSI_FILE_SYSTEM_VALUE, + emit->tcs.invocation_id_sys_index); + + begin_emit_instruction(emit); + emit_opcode_precise(emit, VGPU10_OPCODE_MOV, FALSE, FALSE); + emit_dst_register(emit, &addr_dst_x); + emit_src_register(emit, &invocation_src); + end_emit_instruction(emit); + + + /* MOV OUTPUT INPUT[ADDR[INDEX].x][POSITION] */ + + struct tgsi_full_src_register input_control_point; + input_control_point = make_src_reg(TGSI_FILE_INPUT, + emit->tcs.control_point_input_index); + input_control_point.Register.Dimension = 1; + input_control_point.Dimension.Indirect = 1; + input_control_point.DimIndirect.File = TGSI_FILE_ADDRESS; + input_control_point.DimIndirect.Index = + emit->tcs.control_point_addr_index; + + begin_emit_instruction(emit); + emit_opcode_precise(emit, VGPU10_OPCODE_MOV, FALSE, FALSE); + emit_dst_register(emit, &output_control_point); + emit_src_register(emit, &input_control_point); + end_emit_instruction(emit); + } } /** @@ -10404,6 +11926,9 @@ emit_pre_helpers(struct svga_shader_emitter_v10 *emit) else if (emit->unit == PIPE_SHADER_TESS_EVAL) { emit_domain_shader_declarations(emit); } + else if (emit->unit == PIPE_SHADER_COMPUTE) { + emit_compute_shader_declarations(emit); + } /* Declare inputs */ if (!emit_input_declarations(emit)) @@ -10420,6 +11945,9 @@ emit_pre_helpers(struct svga_shader_emitter_v10 *emit) * will already be declared in hs_decls (emit_hull_shader_declarations) */ if (emit->unit != PIPE_SHADER_TESS_CTRL) { + + alloc_common_immediates(emit); + /* Declare constant registers */ emit_constant_declaration(emit); @@ -10427,13 +11955,18 @@ emit_pre_helpers(struct svga_shader_emitter_v10 *emit) emit_sampler_declarations(emit); emit_resource_declarations(emit); - alloc_common_immediates(emit); - /* Now, emit the constant block containing all the immediates - * declared by shader, as well as the extra ones seen above. 
- */ + /* Declare images */ + emit_image_declarations(emit); + + /* Declare shader buffers */ + emit_shader_buf_declarations(emit); + + /* Declare atomic buffers */ + emit_atomic_buf_declarations(emit); } - if (emit->unit != PIPE_SHADER_FRAGMENT) { + if (emit->unit != PIPE_SHADER_FRAGMENT && + emit->unit != PIPE_SHADER_COMPUTE) { /* * Declare clip distance output registers for ClipVertex or * user defined planes */ @@ -10441,6 +11974,18 @@ emit_pre_helpers(struct svga_shader_emitter_v10 *emit) emit_clip_distance_declarations(emit); } + if (emit->unit == PIPE_SHADER_COMPUTE) { + emit_memory_declarations(emit); + + if (emit->cs.grid_size.tgsi_index != INVALID_INDEX) { + emit->cs.grid_size.imm_index = + alloc_immediate_int4(emit, + emit->key.cs.grid_size[0], + emit->key.cs.grid_size[1], + emit->key.cs.grid_size[2], 0); + } + } + if (emit->unit == PIPE_SHADER_FRAGMENT && emit->key.fs.alpha_func != SVGA3D_CMP_ALWAYS) { float alpha = emit->key.fs.alpha_ref; @@ -10536,7 +12081,9 @@ static void emit_alpha_test_instructions(struct svga_shader_emitter_v10 *emit, unsigned fs_color_tmp_index) { - /* compare output color's alpha to alpha ref and kill */ + /* compare output color's alpha to alpha ref and discard if the + * comparison fails. + */ unsigned tmp = get_temp_index(emit); struct tgsi_full_src_register tmp_src = make_src_temp_reg(tmp); struct tgsi_full_src_register tmp_src_x = @@ -10670,6 +12217,93 @@ emit_post_helpers(struct svga_shader_emitter_v10 *emit) } +/** + * Reemit rawbuf instruction + */ +static boolean +emit_rawbuf_instruction(struct svga_shader_emitter_v10 *emit, + unsigned inst_number, + const struct tgsi_full_instruction *inst) +{ + boolean ret; + + /* For all the rawbuf references in this instruction, + * load the rawbuf reference and assign it to the designated temporary. + * Then reemit the instruction. + */ + emit->reemit_rawbuf_instruction = REEMIT_IN_PROGRESS; + + unsigned offset_tmp = get_temp_index(emit); + struct tgsi_full_dst_register offset_dst = make_dst_temp_reg(offset_tmp); + struct tgsi_full_src_register offset_src = make_src_temp_reg(offset_tmp); + struct tgsi_full_src_register four = make_immediate_reg_int(emit, 4); + + for (unsigned i = 0; i < emit->raw_buf_cur_tmp_index; i++) { + struct tgsi_full_src_register element_src; + + /* First get the element index register. 
*/ + + if (emit->raw_buf_tmp[i].indirect) { + unsigned tmp = get_temp_index(emit); + struct tgsi_full_dst_register element_dst = make_dst_temp_reg(tmp); + struct tgsi_full_src_register element_index = + make_src_temp_reg(emit->raw_buf_tmp[i].element_index); + struct tgsi_full_src_register element_rel = + make_immediate_reg_int(emit, emit->raw_buf_tmp[i].element_rel); + + element_src = make_src_temp_reg(tmp); + element_src = scalar_src(&element_src, TGSI_SWIZZLE_X); + element_dst = writemask_dst(&element_dst, TGSI_WRITEMASK_X); + + /* element index from the indirect register */ + element_index = make_src_temp_reg(emit->raw_buf_tmp[i].element_index); + element_index = scalar_src(&element_index, TGSI_SWIZZLE_X); + + /* IADD element_src element_index element_index_relative */ + emit_instruction_op2(emit, VGPU10_OPCODE_IADD, &element_dst, + &element_index, &element_rel); + } + else { + element_src = + make_immediate_reg_int(emit, emit->raw_buf_tmp[i].element_index); + } + + /* byte offset = element index << 4 */ + emit_instruction_op2(emit, VGPU10_OPCODE_ISHL, &offset_dst, + &element_src, &four); + + struct tgsi_full_dst_register dst_tmp = + make_dst_temp_reg(i + emit->raw_buf_tmp_index); + + /* LD_RAW tmp, rawbuf byte offset, rawbuf */ + + begin_emit_instruction(emit); + emit_opcode(emit, VGPU10_OPCODE_LD_RAW, FALSE); + emit_dst_register(emit, &dst_tmp); + + struct tgsi_full_src_register offset_x = + scalar_src(&offset_src, TGSI_SWIZZLE_X); + emit_src_register(emit, &offset_x); + + emit_resource_register(emit, + emit->raw_buf_tmp[i].buffer_index + emit->raw_buf_srv_start_index); + end_emit_instruction(emit); + } + + emit->raw_buf_cur_tmp_index = 0; + + ret = emit_vgpu10_instruction(emit, inst_number, inst); + + /* reset raw buf state */ + emit->raw_buf_cur_tmp_index = 0; + emit->reemit_rawbuf_instruction = REEMIT_FALSE; + + free_temp_indexes(emit); + + return ret; +} + + /** * Translate the TGSI tokens into VGPU10 tokens. 
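The byte-offset computation above relies on each constant-buffer element being a 4-component, 16-byte vector, which is why the emitted ISHL shifts by 4. A minimal model with a hypothetical helper:

#include <assert.h>

/* Mirrors the emitted IADD + ISHL(4) sequence: one constbuf element is
 * a float4 (16 bytes), so LD_RAW needs byte_offset = element_index << 4.
 */
static unsigned
rawbuf_byte_offset(unsigned element_index, unsigned element_rel)
{
   unsigned element = element_index + element_rel;  /* IADD (indirect case) */
   return element << 4;                             /* ISHL by 4 == *16 */
}

int
main(void)
{
   assert(rawbuf_byte_offset(2, 0) == 32);  /* CONST[2] starts at byte 32 */
   assert(rawbuf_byte_offset(2, 3) == 80);  /* CONST[2 + rel 3] at byte 80 */
   return 0;
}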
*/ @@ -10730,6 +12364,10 @@ emit_vgpu10_instructions(struct svga_shader_emitter_v10 *emit, ret = emit_vgpu10_instruction(emit, inst_number - 1, &parse.FullToken.FullInstruction); } + else if (emit->reemit_rawbuf_instruction) { + ret = emit_rawbuf_instruction(emit, inst_number - 1, + &parse.FullToken.FullInstruction); + } if (!ret) goto done; @@ -10765,9 +12403,13 @@ emit_vgpu10_header(struct svga_shader_emitter_v10 *emit) VGPU10ProgramToken ptoken; /* First token: VGPU10ProgramToken (version info, program type (VS,GS,PS)) */ + + /* Maximum supported shader version is 50 */ + unsigned version = MIN2(emit->version, 50); + ptoken.value = 0; /* init whole token to zero */ - ptoken.majorVersion = emit->version / 10; - ptoken.minorVersion = emit->version % 10; + ptoken.majorVersion = version / 10; + ptoken.minorVersion = version % 10; ptoken.programType = translate_shader_type(emit->unit); if (!emit_dword(emit, ptoken.value)) return FALSE; @@ -10840,6 +12482,15 @@ emit_vgpu10_tail(struct svga_shader_emitter_v10 *emit) ptoken->refactoringAllowed = 1; } + if (emit->version >= 50 && emit->fs.forceEarlyDepthStencil) { + /* Replace the reserved token with the forceEarlyDepthStencil global flag */ + VGPU10OpcodeToken0 *ptoken; + + ptoken = (VGPU10OpcodeToken0 *)&tokens[emit->reserved_token]; + ptoken->opcodeType = VGPU10_OPCODE_DCL_GLOBAL_FLAGS; + ptoken->forceEarlyDepthStencil = 1; + } + return TRUE; } @@ -10884,6 +12535,9 @@ transform_fs_pstipple(struct svga_shader_emitter_v10 *emit, emit->fs.pstipple_sampler_unit = unit; + /* The new sampler state is appended to the end of the samplers list */ + emit->fs.pstipple_sampler_state_index = emit->key.num_samplers++; + /* Setup texture state for stipple */ emit->sampler_target[unit] = TGSI_TEXTURE_2D; emit->key.tex[unit].swizzle_r = TGSI_SWIZZLE_X; @@ -10891,6 +12545,7 @@ transform_fs_pstipple(struct svga_shader_emitter_v10 *emit, emit->key.tex[unit].swizzle_b = TGSI_SWIZZLE_Z; emit->key.tex[unit].swizzle_a = TGSI_SWIZZLE_W; emit->key.tex[unit].target = PIPE_TEXTURE_2D; + emit->key.tex[unit].sampler_index = emit->fs.pstipple_sampler_state_index; if (0) { debug_printf("After pstipple ------------------\n"); @@ -11020,6 +12675,7 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga, const struct svga_compile_key *key, enum pipe_shader_type unit) { + struct svga_screen *svgascreen = svga_screen(svga->pipe.screen); struct svga_shader_variant *variant = NULL; struct svga_shader_emitter_v10 *emit; const struct tgsi_token *tokens = shader->tokens; @@ -11045,7 +12701,9 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga, goto done; emit->unit = unit; - if (svga_have_sm5(svga)) { + if (svga_have_gl43(svga)) { + emit->version = 51; + } else if (svga_have_sm5(svga)) { emit->version = 50; } else if (svga_have_sm4_1(svga)) { emit->version = 41; @@ -11053,6 +12711,8 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga, emit->version = 40; } + emit->use_sampler_state_mapping = emit->key.sampler_state_mapping; + emit->signature.header.headerVersion = SVGADX_SIGNATURE_HEADER_VERSION_0; emit->key = *key; @@ -11098,7 +12758,6 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga, emit->tcs.control_point_tmp_index = INVALID_INDEX; emit->tcs.control_point_out_count = 0; emit->tcs.inner.out_index = INVALID_INDEX; - emit->tcs.inner.out_index = INVALID_INDEX; emit->tcs.inner.temp_index = INVALID_INDEX; emit->tcs.inner.tgsi_index = INVALID_INDEX; emit->tcs.outer.out_index = INVALID_INDEX; @@ -11118,6 +12777,14 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga, 
emit->tes.outer.tgsi_index = INVALID_INDEX; emit->tes.prim_id_index = INVALID_INDEX; + emit->cs.thread_id_index = INVALID_INDEX; + emit->cs.block_id_index = INVALID_INDEX; + emit->cs.grid_size.tgsi_index = INVALID_INDEX; + emit->cs.grid_size.imm_index = INVALID_INDEX; + emit->cs.block_width = 1; + emit->cs.block_height = 1; + emit->cs.block_depth = 1; + emit->clip_dist_out_index = INVALID_INDEX; emit->clip_dist_tmp_index = INVALID_INDEX; emit->clip_dist_so_index = INVALID_INDEX; @@ -11135,6 +12802,9 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga, emit->current_loop_depth = 0; emit->initialize_temp_index = INVALID_INDEX; + emit->max_vs_inputs = svgascreen->max_vs_inputs; + emit->max_vs_outputs = svgascreen->max_vs_outputs; + emit->max_gs_inputs = svgascreen->max_gs_inputs; if (emit->key.fs.alpha_func == SVGA3D_CMP_INVALID) { emit->key.fs.alpha_func = SVGA3D_CMP_ALWAYS; @@ -11202,6 +12872,12 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga, } } + /* Determine if constbuf to rawbuf translation is needed */ + if (emit->info.const_buffers_declared) { + emit->raw_bufs = emit->key.raw_buffers; + emit->raw_buf_srv_start_index = emit->key.srv_raw_buf_index; + } + /* * Do actual shader translation. */ @@ -11262,6 +12938,8 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga, struct svga_fs_variant *fs_variant = svga_fs_variant(variant); fs_variant->pstipple_sampler_unit = emit->fs.pstipple_sampler_unit; + fs_variant->pstipple_sampler_state_index = + emit->fs.pstipple_sampler_state_index; /* If there was exactly one write to a fragment shader output register * and it came from a constant buffer, we know all fragments will have @@ -11275,7 +12953,7 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga, */ fs_variant->uses_flat_interp = emit->uses_flat_interp; - fs_variant->fs_shadow_compare_units = emit->fs.shadow_compare_units; + fs_variant->fs_shadow_compare_units = emit->shadow_compare_units; } else if (unit == PIPE_SHADER_TESS_EVAL) { struct svga_tes_variant *tes_variant = svga_tes_variant(variant);