gallivm: add fp64 support. (v2.1)

This adds support for ARB_gpu_shader_fp64 and ARB_vertex_attrib_64bit to llvmpipe. Two things that don't mix well are SoA and doubles; see emit_fetch_double and emit_store_double_chan in this patch. I've also had to split emit_data.chan to add src_chan, which can be different for doubles. It handles indirect double fetches from temps, inputs, constants and immediates. It doesn't handle double stores to indirects; however, it appears that mesa/st doesn't currently emit these: it always does UARL/MOV combos, which will work fine. Tested with piglit: no regressions, and all the fp64 tests seem to pass. v2: switch to using shuffles for fetch/store (Roland); assert on indirect double stores - mesa/st never emits these (it uses MOV); fix indirect temp/input/constant/immediates (Roland); typos/formatting fixes (Roland). v2.1: cleanup some long lines, emit_store_double_chan cleanups. Reviewed-by: Roland Scheidegger <sroland@vmware.com> Signed-off-by: Dave Airlie <airlied@redhat.com>
2026-05-08 02:38:04 +02:00 · 2015-06-27 14:21:54 +10:00 · 2015-06-27 14:21:54 +10:00 · e35c571783
commit e35c571783
parent 5ccd61217d
8 changed files with 553 additions and 31 deletions
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@ -1997,6 +1997,12 @@ lp_build_floor(struct lp_build_context *bld,
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

+      if (type.width != 32) {
+         char intrinsic[32];
+         util_snprintf(intrinsic, sizeof intrinsic, "llvm.floor.v%uf%u", type.length, type.width);
+         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
+      }
+
      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
@ -2066,6 +2072,12 @@ lp_build_ceil(struct lp_build_context *bld,
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

+      if (type.width != 32) {
+         char intrinsic[32];
+         util_snprintf(intrinsic, sizeof intrinsic, "llvm.ceil.v%uf%u", type.length, type.width);
+         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
+      }
+
      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
--- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@ -132,6 +132,7 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
   case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
      return 1;
   case PIPE_SHADER_CAP_DOUBLES:
+      return 1;
   case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
   case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
   case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@ -81,7 +81,7 @@ lp_build_compare_ext(struct gallivm_state *gallivm,
                     boolean ordered)
 {
   LLVMBuilderRef builder = gallivm->builder;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, lp_type_int_vec(32, 32 * type.length));
   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
   LLVMValueRef cond;
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@ -175,13 +175,52 @@ void lp_build_fetch_args(
   unsigned src;
   for (src = 0; src < emit_data->info->num_src; src++) {
      emit_data->args[src] = lp_build_emit_fetch(bld_base, emit_data->inst, src,
-                                               emit_data->chan);
+                                                 emit_data->src_chan);
   }
   emit_data->arg_count = emit_data->info->num_src;
   lp_build_action_set_dst_type(emit_data, bld_base,
 		emit_data->inst->Instruction.Opcode);
 }

+/**
+ * Map a destination channel to the source channel to fetch for an opcode.
+ *
+ * With doubles, src and dst channels aren't 1:1. The src/dst types
+ * inferred for the opcode decide the mapping:
+ * 1. if neither is double then src == dst;
+ * 2. if dst is double
+ *     - y and w are never stored to (-1 is returned);
+ *     - if src is double then src == dst;
+ *     - else (e.g. f2d) d.xy = s.x and d.zw = s.y;
+ * 3. if dst is single and src is double
+ *     - dst x,z map to src xy (channel 0);
+ *     - dst y,w map to src zw (channel 2).
+ *
+ * Returns the source channel index, or -1 when the destination channel
+ * has to be skipped.
+ */
+static int get_src_chan_idx(unsigned opcode,
+                            int dst_chan_index)
+{
+   const int dst_is_double =
+      tgsi_opcode_infer_dst_type(opcode) == TGSI_TYPE_DOUBLE;
+   const int src_is_double =
+      tgsi_opcode_infer_src_type(opcode) == TGSI_TYPE_DOUBLE;
+
+   /* no doubles involved: channels are 1:1 */
+   if (!dst_is_double && !src_is_double)
+      return dst_chan_index;
+
+   switch (dst_chan_index) {
+   case 0:
+      return 0;
+   case 2:
+      if (!dst_is_double)
+         return 0;
+      return src_is_double ? 2 : 1;
+   case 1:
+   case 3:
+      return dst_is_double ? -1 : 2;
+   default:
+      /* double -> double passes any other index straight through */
+      return (dst_is_double && src_is_double) ? dst_chan_index : -1;
+   }
+}
+
 /* XXX: COMMENT
 * It should be assumed that this function ignores writemasks
 */
@ -197,7 +236,6 @@ lp_build_tgsi_inst_llvm(
   struct lp_build_emit_data emit_data;
   unsigned chan_index;
   LLVMValueRef val;
-
   bld_base->pc++;

   if (bld_base->emit_debug) {
@ -240,7 +278,12 @@ lp_build_tgsi_inst_llvm(
   /* Emit the instructions */
   if (info->output_mode == TGSI_OUTPUT_COMPONENTWISE && bld_base->soa) {
      TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
+         int src_index = get_src_chan_idx(inst->Instruction.Opcode, chan_index);
+         /* ignore channels 1/3 in double dst */
+         if (src_index == -1)
+            continue;
         emit_data.chan = chan_index;
+         emit_data.src_chan = src_index;
         if (!action->fetch_args) {
            lp_build_fetch_args(bld_base, &emit_data);
         } else {
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@ -338,6 +338,7 @@ struct lp_build_tgsi_context
   struct lp_build_context uint_bld;
   struct lp_build_context int_bld;

+   struct lp_build_context dbl_bld;
   /** This array stores functions that are used to transform TGSI opcodes to
     * LLVM instructions.
     */
@ -349,6 +350,9 @@ struct lp_build_tgsi_context

   struct lp_build_tgsi_action sqrt_action;

+   struct lp_build_tgsi_action drsq_action;
+
+   struct lp_build_tgsi_action dsqrt_action;
   const struct tgsi_shader_info *info;

   lp_build_emit_fetch_fn emit_fetch_funcs[TGSI_FILE_COUNT];
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@ -894,6 +894,125 @@ const struct lp_build_tgsi_action xpd_action = {
   xpd_emit	 /* emit */
 };

+/* TGSI_OPCODE_D2F: FPTrunc args[0] to the base context's vector type. */
+static void
+d2f_emit(const struct lp_build_tgsi_action *action,
+         struct lp_build_tgsi_context *bld_base,
+         struct lp_build_emit_data *emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   LLVMValueRef src = emit_data->args[0];
+
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFPTrunc(builder, src, bld_base->base.vec_type, "");
+}
+
+/* TGSI_OPCODE_D2I: FPToSI args[0] to the base context's int vector type. */
+static void
+d2i_emit(const struct lp_build_tgsi_action *action,
+         struct lp_build_tgsi_context *bld_base,
+         struct lp_build_emit_data *emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   LLVMValueRef src = emit_data->args[0];
+
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFPToSI(builder, src, bld_base->base.int_vec_type, "");
+}
+
+/* TGSI_OPCODE_D2U: FPToUI args[0] to the base context's int vector type. */
+static void
+d2u_emit(const struct lp_build_tgsi_action *action,
+         struct lp_build_tgsi_context *bld_base,
+         struct lp_build_emit_data *emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   LLVMValueRef src = emit_data->args[0];
+
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFPToUI(builder, src, bld_base->base.int_vec_type, "");
+}
+
+/* TGSI_OPCODE_F2D: FPExt args[0] to the double context's vector type. */
+static void
+f2d_emit(const struct lp_build_tgsi_action *action,
+         struct lp_build_tgsi_context *bld_base,
+         struct lp_build_emit_data *emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   LLVMValueRef src = emit_data->args[0];
+
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFPExt(builder, src, bld_base->dbl_bld.vec_type, "");
+}
+
+/* TGSI_OPCODE_U2D: UIToFP args[0] to the double context's vector type. */
+static void
+u2d_emit(const struct lp_build_tgsi_action *action,
+         struct lp_build_tgsi_context *bld_base,
+         struct lp_build_emit_data *emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   LLVMValueRef src = emit_data->args[0];
+
+   emit_data->output[emit_data->chan] =
+      LLVMBuildUIToFP(builder, src, bld_base->dbl_bld.vec_type, "");
+}
+
+/* TGSI_OPCODE_I2D: SIToFP args[0] to the double context's vector type. */
+static void
+i2d_emit(const struct lp_build_tgsi_action *action,
+         struct lp_build_tgsi_context *bld_base,
+         struct lp_build_emit_data *emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   LLVMValueRef src = emit_data->args[0];
+
+   emit_data->output[emit_data->chan] =
+      LLVMBuildSIToFP(builder, src, bld_base->dbl_bld.vec_type, "");
+}
+
+/* TGSI_OPCODE_DMAD: args[0] * args[1] + args[2], built from DMUL and DADD. */
+static void
+dmad_emit(const struct lp_build_tgsi_action *action,
+          struct lp_build_tgsi_context *bld_base,
+          struct lp_build_emit_data *emit_data)
+{
+   LLVMValueRef product;
+   LLVMValueRef sum;
+
+   product = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_DMUL,
+                                       emit_data->args[0],
+                                       emit_data->args[1]);
+   sum = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_DADD,
+                                   product, emit_data->args[2]);
+
+   emit_data->output[emit_data->chan] = sum;
+}
+
+/* TGSI_OPCODE_DRCP: divide a vector of 1.0 (double context type) by args[0]. */
+static void drcp_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_context *dbl = &bld_base->dbl_bld;
+   LLVMValueRef one = lp_build_const_vec(dbl->gallivm, dbl->type, 1.0f);
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFDiv(builder, one, emit_data->args[0], "");
+}
+
+/* TGSI_OPCODE_DFRAC: args[0] minus its floor, using the double context. */
+static void dfrac_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMValueRef src = emit_data->args[0];
+   LLVMValueRef floored = lp_build_floor(&bld_base->dbl_bld, src);
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFSub(builder, src, floored, "");
+}
+
 void
 lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
 {
@ -948,6 +1067,25 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)

   bld_base->op_actions[TGSI_OPCODE_MAX].emit = fmax_emit;
   bld_base->op_actions[TGSI_OPCODE_MIN].emit = fmin_emit;
+
+   bld_base->op_actions[TGSI_OPCODE_DADD].emit = add_emit;
+   bld_base->op_actions[TGSI_OPCODE_DMAX].emit = fmax_emit;
+   bld_base->op_actions[TGSI_OPCODE_DMIN].emit = fmin_emit;
+   bld_base->op_actions[TGSI_OPCODE_DMUL].emit = mul_emit;
+
+   bld_base->op_actions[TGSI_OPCODE_D2F].emit = d2f_emit;
+   bld_base->op_actions[TGSI_OPCODE_D2I].emit = d2i_emit;
+   bld_base->op_actions[TGSI_OPCODE_D2U].emit = d2u_emit;
+
+   bld_base->op_actions[TGSI_OPCODE_F2D].emit = f2d_emit;
+   bld_base->op_actions[TGSI_OPCODE_I2D].emit = i2d_emit;
+   bld_base->op_actions[TGSI_OPCODE_U2D].emit = u2d_emit;
+
+   bld_base->op_actions[TGSI_OPCODE_DMAD].emit = dmad_emit;
+
+   bld_base->op_actions[TGSI_OPCODE_DRCP].emit = drcp_emit;
+   bld_base->op_actions[TGSI_OPCODE_DFRAC].emit = dfrac_emit;
+
 }

 /* CPU Only default actions */
@ -1792,6 +1930,104 @@ xor_emit_cpu(
                                                     emit_data->args[1]);
 }

+/* TGSI_OPCODE_DABS (CPU Only): lp_build_abs on args[0] in the double context. */
+static void
+dabs_emit_cpu(const struct lp_build_tgsi_action *action,
+              struct lp_build_tgsi_context *bld_base,
+              struct lp_build_emit_data *emit_data)
+{
+   struct lp_build_context *dbl = &bld_base->dbl_bld;
+   LLVMValueRef result = lp_build_abs(dbl, emit_data->args[0]);
+
+   emit_data->output[emit_data->chan] = result;
+}
+
+/*
+ * TGSI_OPCODE_DNEG (CPU Only)
+ *
+ * Negates args[0] with a floating-point negate rather than computing
+ * 0.0 - x: under IEEE-754 arithmetic 0.0 - (+0.0) is +0.0, so the
+ * subtraction form loses the sign of zero, while a negate flips the
+ * sign bit for every input.
+ */
+static void
+dneg_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFNeg(builder, emit_data->args[0], "");
+}
+
+/*
+ * TGSI_OPCODE_DSET Helper (CPU Only)
+ *
+ * Compares args[0] against args[1] in the double context using the
+ * given pipe_func and stores the lp_build_cmp result as the output.
+ */
+static void
+dset_emit_cpu(const struct lp_build_tgsi_action *action,
+              struct lp_build_tgsi_context *bld_base,
+              struct lp_build_emit_data *emit_data,
+              unsigned pipe_func)
+{
+   LLVMValueRef lhs = emit_data->args[0];
+   LLVMValueRef rhs = emit_data->args[1];
+
+   emit_data->output[emit_data->chan] =
+      lp_build_cmp(&bld_base->dbl_bld, pipe_func, lhs, rhs);
+}
+
+/* TGSI_OPCODE_DSEQ (CPU Only): double compare with PIPE_FUNC_EQUAL. */
+static void
+dseq_emit_cpu(const struct lp_build_tgsi_action *action,
+              struct lp_build_tgsi_context *bld_base,
+              struct lp_build_emit_data *emit_data)
+{
+   const unsigned func = PIPE_FUNC_EQUAL;
+
+   dset_emit_cpu(action, bld_base, emit_data, func);
+}
+
+/* TGSI_OPCODE_DSGE (CPU Only): double compare with PIPE_FUNC_GEQUAL. */
+static void
+dsge_emit_cpu(const struct lp_build_tgsi_action *action,
+              struct lp_build_tgsi_context *bld_base,
+              struct lp_build_emit_data *emit_data)
+{
+   const unsigned func = PIPE_FUNC_GEQUAL;
+
+   dset_emit_cpu(action, bld_base, emit_data, func);
+}
+
+/* TGSI_OPCODE_DSLT (CPU Only): double compare with PIPE_FUNC_LESS. */
+static void
+dslt_emit_cpu(const struct lp_build_tgsi_action *action,
+              struct lp_build_tgsi_context *bld_base,
+              struct lp_build_emit_data *emit_data)
+{
+   const unsigned func = PIPE_FUNC_LESS;
+
+   dset_emit_cpu(action, bld_base, emit_data, func);
+}
+
+/* TGSI_OPCODE_DSNE (CPU Only): double compare with PIPE_FUNC_NOTEQUAL. */
+static void
+dsne_emit_cpu(const struct lp_build_tgsi_action *action,
+              struct lp_build_tgsi_context *bld_base,
+              struct lp_build_emit_data *emit_data)
+{
+   const unsigned func = PIPE_FUNC_NOTEQUAL;
+
+   dset_emit_cpu(action, bld_base, emit_data, func);
+}
+
+/* Double reciprocal square root (CPU Only): lp_build_rsqrt on args[0]. */
+static void
+drecip_sqrt_emit_cpu(const struct lp_build_tgsi_action *action,
+                     struct lp_build_tgsi_context *bld_base,
+                     struct lp_build_emit_data *emit_data)
+{
+   struct lp_build_context *dbl = &bld_base->dbl_bld;
+   LLVMValueRef result = lp_build_rsqrt(dbl, emit_data->args[0]);
+
+   emit_data->output[emit_data->chan] = result;
+}
+
+/* Double square root (CPU Only): lp_build_sqrt on args[0]. */
+static void
+dsqrt_emit_cpu(const struct lp_build_tgsi_action *action,
+               struct lp_build_tgsi_context *bld_base,
+               struct lp_build_emit_data *emit_data)
+{
+   struct lp_build_context *dbl = &bld_base->dbl_bld;
+   LLVMValueRef result = lp_build_sqrt(dbl, emit_data->args[0]);
+
+   emit_data->output[emit_data->chan] = result;
+}
+
 void
 lp_set_default_actions_cpu(
   struct lp_build_tgsi_context * bld_base)
@ -1864,4 +2100,14 @@ lp_set_default_actions_cpu(

   bld_base->op_actions[TGSI_OPCODE_XOR].emit = xor_emit_cpu;

+   bld_base->op_actions[TGSI_OPCODE_DABS].emit = dabs_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DNEG].emit = dneg_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DSEQ].emit = dseq_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DSGE].emit = dsge_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DSLT].emit = dslt_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DSNE].emit = dsne_emit_cpu;
+
+   bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = drecip_sqrt_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DSQRT].emit = dsqrt_emit_cpu;
+
 }
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h
@ -71,6 +71,11 @@ struct lp_build_emit_data {
    */
   unsigned chan;

+   /**
+    * This is used to specify the src channel to read from for doubles.
+    */
+   unsigned src_chan;
+
   /** The lp_build_tgsi_action::emit 'executes' the opcode and writes the
    * results to this array.
    */
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@ -947,15 +947,20 @@ static LLVMValueRef
 build_gather(struct lp_build_tgsi_context *bld_base,
             LLVMValueRef base_ptr,
             LLVMValueRef indexes,
-             LLVMValueRef overflow_mask)
+             LLVMValueRef overflow_mask,
+             LLVMValueRef indexes2)
 {
   struct gallivm_state *gallivm = bld_base->base.gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context *uint_bld = &bld_base->uint_bld;
   struct lp_build_context *bld = &bld_base->base;
-   LLVMValueRef res = bld->undef;
+   LLVMValueRef res;
   unsigned i;

+   if (indexes2)
+      res = LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2));
+   else
+      res = bld->undef;
   /*
    * overflow_mask is a vector telling us which channels
    * in the vector overflowed. We use the overflow behavior for
@ -976,26 +981,47 @@ build_gather(struct lp_build_tgsi_context *bld_base,
       * control flow.
       */
      indexes = lp_build_select(uint_bld, overflow_mask, uint_bld->zero, indexes);
+      if (indexes2)
+         indexes2 = lp_build_select(uint_bld, overflow_mask, uint_bld->zero, indexes2);
   }

   /*
    * Loop over elements of index_vec, load scalar value, insert it into 'res'.
    */
-   for (i = 0; i < bld->type.length; i++) {
-      LLVMValueRef ii = lp_build_const_int32(bld->gallivm, i);
-      LLVMValueRef index = LLVMBuildExtractElement(builder,
-                                                   indexes, ii, "");
+   for (i = 0; i < bld->type.length * (indexes2 ? 2 : 1); i++) {
+      LLVMValueRef si, di;
+      LLVMValueRef index;
      LLVMValueRef scalar_ptr, scalar;

+      di = lp_build_const_int32(bld->gallivm, i);
+      if (indexes2)
+         si = lp_build_const_int32(bld->gallivm, i >> 1);
+      else
+         si = di;
+
+      if (indexes2 && (i & 1)) {
+         index = LLVMBuildExtractElement(builder,
+                                         indexes2, si, "");
+      } else {
+         index = LLVMBuildExtractElement(builder,
+                                         indexes, si, "");
+      }
      scalar_ptr = LLVMBuildGEP(builder, base_ptr,
                                &index, 1, "gather_ptr");
      scalar = LLVMBuildLoad(builder, scalar_ptr, "");

-      res = LLVMBuildInsertElement(builder, res, scalar, ii, "");
+      res = LLVMBuildInsertElement(builder, res, scalar, di, "");
   }

   if (overflow_mask) {
-      res = lp_build_select(bld, overflow_mask, bld->zero, res);
+      if (indexes2) {
+         res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
+         overflow_mask = LLVMBuildSExt(builder, overflow_mask,
+                                       bld_base->dbl_bld.int_vec_type, "");
+         res = lp_build_select(&bld_base->dbl_bld, overflow_mask,
+                               bld_base->dbl_bld.zero, res);
+      } else
+         res = lp_build_select(bld, overflow_mask, bld->zero, res);
   }

   return res;
@ -1139,8 +1165,10 @@ stype_to_fetch(struct lp_build_tgsi_context * bld_base,
   case TGSI_TYPE_SIGNED:
      bld_fetch = &bld_base->int_bld;
      break;
-   case TGSI_TYPE_VOID:
   case TGSI_TYPE_DOUBLE:
+      bld_fetch = &bld_base->dbl_bld;
+      break;
+   case TGSI_TYPE_VOID:
   default:
      assert(0);
      bld_fetch = NULL;
@ -1216,6 +1244,7 @@ emit_fetch_constant(
         lp_build_const_int_vec(gallivm, uint_bld->type, swizzle);
      LLVMValueRef index_vec;  /* index into the const buffer */
      LLVMValueRef overflow_mask;
+      LLVMValueRef index_vec2 = NULL;

      indirect_index = get_indirect_index(bld,
                                          reg->Register.File,
@ -1235,22 +1264,33 @@ emit_fetch_constant(
      index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
      index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);

+      if (stype == TGSI_TYPE_DOUBLE) {
+         LLVMValueRef swizzle_vec2;
+         swizzle_vec2 = lp_build_const_int_vec(gallivm, uint_bld->type, swizzle + 1);
+         index_vec2 = lp_build_shl_imm(uint_bld, indirect_index, 2);
+         index_vec2 = lp_build_add(uint_bld, index_vec2, swizzle_vec2);
+      }
      /* Gather values from the constant buffer */
-      res = build_gather(bld_base, consts_ptr, index_vec, overflow_mask);
+      res = build_gather(bld_base, consts_ptr, index_vec, overflow_mask, index_vec2);
   }
   else {
      LLVMValueRef index;  /* index into the const buffer */
      LLVMValueRef scalar, scalar_ptr;
-
+      struct lp_build_context *bld_broad = &bld_base->base;
      index = lp_build_const_int32(gallivm, reg->Register.Index * 4 + swizzle);

      scalar_ptr = LLVMBuildGEP(builder, consts_ptr,
                                &index, 1, "");
+      if (stype == TGSI_TYPE_DOUBLE) {
+         LLVMTypeRef dptr_type = LLVMPointerType(LLVMDoubleTypeInContext(gallivm->context), 0);
+         scalar_ptr = LLVMBuildBitCast(builder, scalar_ptr, dptr_type, "");
+         bld_broad = &bld_base->dbl_bld;
+      }
      scalar = LLVMBuildLoad(builder, scalar_ptr, "");
-      res = lp_build_broadcast_scalar(&bld_base->base, scalar);
+      res = lp_build_broadcast_scalar(bld_broad, scalar);
   }

-   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
+   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED || stype == TGSI_TYPE_DOUBLE) {
      struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
      res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
   }
@ -1258,6 +1298,39 @@ emit_fetch_constant(
   return res;
 }

+/**
+ * Fetch double values from two separate channels.
+ *
+ * Doubles are stored split across two channels, like xy and zw.
+ * The two inputs are interleaved element by element with a single
+ * shuffle (input[0], input2[0], input[1], input2[1], ...), giving a
+ * vector of twice the base length, which is then bitcast to the
+ * vector type of the fetch context selected by stype.
+ * With a base length of 8 that is 16 floats viewed as 8 doubles.
+ */
+static LLVMValueRef
+emit_fetch_double(
+   struct lp_build_tgsi_context * bld_base,
+   enum tgsi_opcode_type stype,
+   LLVMValueRef input,
+   LLVMValueRef input2)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
+   const int num_lanes = bld_base->base.type.length;
+   const int len = num_lanes * 2;
+   LLVMValueRef shuffles[16];
+   LLVMValueRef interleaved;
+   int lane;
+
+   assert(len <= 16);
+
+   /* even slots pick from 'input', odd slots from 'input2' */
+   for (lane = 0; lane < num_lanes; lane++) {
+      shuffles[2 * lane] = lp_build_const_int32(gallivm, lane);
+      shuffles[2 * lane + 1] = lp_build_const_int32(gallivm,
+                                                    lane + num_lanes);
+   }
+
+   interleaved = LLVMBuildShuffleVector(builder, input, input2,
+                                        LLVMConstVector(shuffles, len), "");
+
+   return LLVMBuildBitCast(builder, interleaved, bld_fetch->vec_type, "");
+}
+
 static LLVMValueRef
 emit_fetch_immediate(
   struct lp_build_tgsi_context * bld_base,
@ -1281,7 +1354,7 @@ emit_fetch_immediate(
      if (reg->Register.Indirect) {
         LLVMValueRef indirect_index;
         LLVMValueRef index_vec;  /* index into the immediate register array */
-
+         LLVMValueRef index_vec2 = NULL;
         indirect_index = get_indirect_index(bld,
                                             reg->Register.File,
                                             reg->Register.Index,
@ -1296,25 +1369,46 @@ emit_fetch_immediate(
                                           indirect_index,
                                           swizzle,
                                           FALSE);
-
+         if (stype == TGSI_TYPE_DOUBLE)
+            index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
+                                              indirect_index,
+                                              swizzle + 1,
+                                              FALSE);
         /* Gather values from the immediate register array */
-         res = build_gather(bld_base, imms_array, index_vec, NULL);
+         res = build_gather(bld_base, imms_array, index_vec, NULL, index_vec2);
      } else {
         LLVMValueRef lindex = lp_build_const_int32(gallivm,
                                        reg->Register.Index * 4 + swizzle);
         LLVMValueRef imms_ptr =  LLVMBuildGEP(builder,
                                                bld->imms_array, &lindex, 1, "");
         res = LLVMBuildLoad(builder, imms_ptr, "");
+
+         if (stype == TGSI_TYPE_DOUBLE) {
+            LLVMValueRef lindex1;
+            LLVMValueRef imms_ptr2;
+            LLVMValueRef res2;
+
+            lindex1 = lp_build_const_int32(gallivm,
+                                           reg->Register.Index * 4 + swizzle + 1);
+            imms_ptr2 = LLVMBuildGEP(builder,
+                                      bld->imms_array, &lindex1, 1, "");
+            res2 = LLVMBuildLoad(builder, imms_ptr2, "");
+            res = emit_fetch_double(bld_base, stype, res, res2);
+         }
      }
   }
   else {
      res = bld->immediates[reg->Register.Index][swizzle];
+      if (stype == TGSI_TYPE_DOUBLE)
+         res = emit_fetch_double(bld_base, stype, res, bld->immediates[reg->Register.Index][swizzle + 1]);
   }

   if (stype == TGSI_TYPE_UNSIGNED) {
      res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
   } else if (stype == TGSI_TYPE_SIGNED) {
      res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
+   } else if (stype == TGSI_TYPE_DOUBLE) {
+      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
   }
   return res;
 }
@ -1334,6 +1428,7 @@ emit_fetch_input(
   if (reg->Register.Indirect) {
      LLVMValueRef indirect_index;
      LLVMValueRef index_vec;  /* index into the input reg array */
+      LLVMValueRef index_vec2 = NULL;
      LLVMValueRef inputs_array;
      LLVMTypeRef fptr_type;

@ -1346,23 +1441,43 @@ emit_fetch_input(
                                        indirect_index,
                                        swizzle,
                                        TRUE);
-
+      if (stype == TGSI_TYPE_DOUBLE) {
+         index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
+                                           indirect_index,
+                                           swizzle + 1,
+                                           TRUE);
+      }
      /* cast inputs_array pointer to float* */
      fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
      inputs_array = LLVMBuildBitCast(builder, bld->inputs_array, fptr_type, "");

      /* Gather values from the input register array */
-      res = build_gather(bld_base, inputs_array, index_vec, NULL);
+      res = build_gather(bld_base, inputs_array, index_vec, NULL, index_vec2);
   } else {
      if (bld->indirect_files & (1 << TGSI_FILE_INPUT)) {
         LLVMValueRef lindex = lp_build_const_int32(gallivm,
                                        reg->Register.Index * 4 + swizzle);
-         LLVMValueRef input_ptr =  LLVMBuildGEP(builder,
-                                                bld->inputs_array, &lindex, 1, "");
+         LLVMValueRef input_ptr = LLVMBuildGEP(builder,
+                                               bld->inputs_array, &lindex, 1, "");
+
         res = LLVMBuildLoad(builder, input_ptr, "");
+         if (stype == TGSI_TYPE_DOUBLE) {
+            LLVMValueRef lindex1;
+            LLVMValueRef input_ptr2;
+            LLVMValueRef res2;
+
+            lindex1 = lp_build_const_int32(gallivm,
+                                           reg->Register.Index * 4 + swizzle + 1);
+            input_ptr2 = LLVMBuildGEP(builder,
+                                      bld->inputs_array, &lindex1, 1, "");
+            res2 = LLVMBuildLoad(builder, input_ptr2, "");
+            res = emit_fetch_double(bld_base, stype, res, res2);
+         }
      }
      else {
         res = bld->inputs[reg->Register.Index][swizzle];
+         if (stype == TGSI_TYPE_DOUBLE)
+            res = emit_fetch_double(bld_base, stype, res, bld->inputs[reg->Register.Index][swizzle + 1]);
      }
   }

@ -1372,6 +1487,8 @@ emit_fetch_input(
      res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
   } else if (stype == TGSI_TYPE_SIGNED) {
      res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
+   } else if (stype == TGSI_TYPE_DOUBLE) {
+      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
   }

   return res;
@ -1413,7 +1530,7 @@ emit_fetch_gs_input(
   } else {
      attrib_index = lp_build_const_int32(gallivm, reg->Register.Index);
   }
-   
+
   if (reg->Dimension.Indirect) {
      vertex_index = get_indirect_index(bld,
                                        reg->Register.File,
@ -1436,6 +1553,8 @@ emit_fetch_gs_input(
      res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
   } else if (stype == TGSI_TYPE_SIGNED) {
      res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
+   } else if (stype == TGSI_TYPE_DOUBLE) {
+      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
   }

   return res;
@ -1455,7 +1574,7 @@ emit_fetch_temporary(

   if (reg->Register.Indirect) {
      LLVMValueRef indirect_index;
-      LLVMValueRef index_vec;  /* index into the temp reg array */
+      LLVMValueRef index_vec, index_vec2 = NULL;  /* index into the temp reg array */
      LLVMValueRef temps_array;
      LLVMTypeRef fptr_type;

@ -1468,21 +1587,35 @@ emit_fetch_temporary(
                                        indirect_index,
                                        swizzle,
                                        TRUE);
+      if (stype == TGSI_TYPE_DOUBLE) {
+               index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
+                                                  indirect_index,
+                                                  swizzle + 1,
+                                                  TRUE);
+      }

      /* cast temps_array pointer to float* */
      fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
      temps_array = LLVMBuildBitCast(builder, bld->temps_array, fptr_type, "");

      /* Gather values from the temporary register array */
-      res = build_gather(bld_base, temps_array, index_vec, NULL);
+      res = build_gather(bld_base, temps_array, index_vec, NULL, index_vec2);
   }
   else {
      LLVMValueRef temp_ptr;
      temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle);
      res = LLVMBuildLoad(builder, temp_ptr, "");
+
+      if (stype == TGSI_TYPE_DOUBLE) {
+         LLVMValueRef temp_ptr2, res2;
+
+         temp_ptr2 = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle + 1);
+         res2 = LLVMBuildLoad(builder, temp_ptr2, "");
+         res = emit_fetch_double(bld_base, stype, res, res2);
+      }
   }

-   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
+   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED || stype == TGSI_TYPE_DOUBLE) {
      struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
      res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
   }
@ -1648,6 +1781,50 @@ emit_fetch_predicate(
   }
 }

+/**
+ * Split a vector of doubles and store it to two SoA float channels.
+ * The shuffles treat value as 2 * base.type.length 32-bit elements
+ * (d0.x, d0.y, d1.x, d1.y, ...; presumably already bitcast by the caller):
+ * chan_ptr  receives the even elements = d0.x, d1.x, d2.x, d3.x, ...
+ * chan_ptr2 receives the odd elements  = d0.y, d1.y, d2.y, d3.y, ...
+ * Both stores go through lp_exec_mask_store() with pred; dtype is unused.
+ * shuffles[]/shuffles2[] hold 8 entries, so base.type.length must be <= 8.
+ */
+static void
+emit_store_double_chan(struct lp_build_tgsi_context *bld_base,
+                       int dtype,
+                       LLVMValueRef chan_ptr, LLVMValueRef chan_ptr2,
+                       LLVMValueRef pred,
+                       LLVMValueRef value)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *float_bld = &bld_base->base;
+   int i;
+   LLVMValueRef temp, temp2;
+   LLVMValueRef shuffles[8];
+   LLVMValueRef shuffles2[8];
+
+   for (i = 0; i < bld_base->base.type.length; i++) {
+      shuffles[i] = lp_build_const_int32(gallivm, i * 2);
+      shuffles2[i] = lp_build_const_int32(gallivm, (i * 2) + 1);
+   }
+
+   temp = LLVMBuildShuffleVector(builder, value,
+                                 LLVMGetUndef(LLVMTypeOf(value)),
+                                 LLVMConstVector(shuffles,
+                                                 bld_base->base.type.length),
+                                 "");
+   temp2 = LLVMBuildShuffleVector(builder, value,
+                                  LLVMGetUndef(LLVMTypeOf(value)),
+                                  LLVMConstVector(shuffles2,
+                                                  bld_base->base.type.length),
+                                  "");
+
+   lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp, chan_ptr);
+   lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp2, chan_ptr2);
+}

 /**
 * Register store.
@ -1683,6 +1860,11 @@ emit_store_chan(
   }

   if (reg->Register.Indirect) {
+      /*
+       * Indirect stores to doubles are not handled here: mesa/st does not
+       * currently emit them, it normally uses MOV for indirect stores.
+       */
+      assert(dtype != TGSI_TYPE_DOUBLE);
      indirect_index = get_indirect_index(bld,
                                          reg->Register.File,
                                          reg->Register.Index,
@ -1721,13 +1903,23 @@ emit_store_chan(
      else {
         LLVMValueRef out_ptr = lp_get_output_ptr(bld, reg->Register.Index,
                                                  chan_index);
-         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, out_ptr);
+
+         if (dtype == TGSI_TYPE_DOUBLE) {
+            LLVMValueRef out_ptr2 = lp_get_output_ptr(bld, reg->Register.Index,
+                                                      chan_index + 1);
+            emit_store_double_chan(bld_base, dtype, out_ptr, out_ptr2,
+                                   pred, value);
+         } else
+            lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, out_ptr);
      }
      break;

   case TGSI_FILE_TEMPORARY:
      /* Temporaries are always stored as floats */
-      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
+      if (dtype != TGSI_TYPE_DOUBLE)
+         value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
+      else
+         value = LLVMBuildBitCast(builder, value,  LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2), "");

      if (reg->Register.Indirect) {
         LLVMValueRef index_vec;  /* indexes into the temp registers */
@ -1749,7 +1941,16 @@ emit_store_chan(
      else {
         LLVMValueRef temp_ptr;
         temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index);
-         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, temp_ptr);
+
+         if (dtype == TGSI_TYPE_DOUBLE) {
+            LLVMValueRef temp_ptr2 = lp_get_temp_ptr_soa(bld,
+                                                         reg->Register.Index,
+                                                         chan_index + 1);
+            emit_store_double_chan(bld_base, dtype, temp_ptr, temp_ptr2,
+                                   pred, value);
+         }
+         else
+            lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, temp_ptr);
      }
      break;

@ -1818,13 +2019,16 @@ emit_store(
 {
   unsigned chan_index;
   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
-
+   enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode);
   if(info->num_dst) {
      LLVMValueRef pred[TGSI_NUM_CHANNELS];

      emit_fetch_predicate( bld, inst, pred );

      TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+
+         if (dtype == TGSI_TYPE_DOUBLE && (chan_index == 1 || chan_index == 3))
+             continue;
         emit_store_chan(bld_base, inst, 0, chan_index, pred[chan_index], dst[chan_index]);
      }
   }
@ -2823,6 +3027,7 @@ void lp_emit_immediate_soa(
               lp_build_const_vec(gallivm, bld_base->base.type, imm->u[i].Float);

      break;
+   case TGSI_IMM_FLOAT64:
   case TGSI_IMM_UINT32:
      for( i = 0; i < size; ++i ) {
         LLVMValueRef tmp = lp_build_const_vec(gallivm, bld_base->uint_bld.type, imm->u[i].Uint);
@ -3674,6 +3879,12 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
   lp_build_context_init(&bld.bld_base.uint_bld, gallivm, lp_uint_type(type));
   lp_build_context_init(&bld.bld_base.int_bld, gallivm, lp_int_type(type));
   lp_build_context_init(&bld.elem_bld, gallivm, lp_elem_type(type));
+   {
+      struct lp_type dbl_type;
+      dbl_type = type;
+      dbl_type.width *= 2;
+      lp_build_context_init(&bld.bld_base.dbl_bld, gallivm, dbl_type);
+   }
   bld.mask = mask;
   bld.inputs = inputs;
   bld.outputs = outputs;